Script:
#!/bin/bash
#
# Builds an HTML table of links from the URL lists found in ../URLS:
# every URL is downloaded, dumped to plain text, and filtered for a
# search pattern. All paths are relative to the current working directory.

# HTML report that the rest of the script writes to.
tablo=../TABLEAUX/tablo.html

# Empty the three output trees left over from a previous run.
# Each command is kept on one line: if the operand ends up on a separate
# line, the shell tries to execute it instead of deleting it.
# -r already covers directories, so -d is not needed.
rm -rf -- ../PAGES-ASPIREES/*
rm -rf -- ../DUMP-TEXT/*
rm -rf -- ../CONTEXTES/*
#######################################
# Creates one sub-directory per URL list in each of the three output trees
# (../PAGES-ASPIREES, ../DUMP-TEXT, ../CONTEXTES). The sub-directory is named
# after the list file with its .txt suffix removed.
# Arguments: $1 - directory holding the URL lists (default: ../URLS)
# Outputs:   one progress line per list on stdout
#######################################
create_output_dirs() {
  local urls_dir=${1:-../URLS}
  local list base

  # Glob instead of parsing `ls`, so names with unusual characters survive.
  for list in "$urls_dir"/*; do
    # An unmatched glob stays literal; skip it when the directory is empty.
    [[ -e $list ]] || continue

    base=$(basename -- "$list" .txt)
    # -p keeps a re-run from failing and creates a missing parent tree.
    mkdir -p -- \
      "../PAGES-ASPIREES/$base" \
      "../DUMP-TEXT/$base" \
      "../CONTEXTES/$base"
    echo "¤¤¤¤¤¤¤ CREATION DES SOUS-DOSSIERS POUR $base - OK... ¤¤¤¤¤¤¤¤"
  done
}

create_output_dirs ../URLS
#######################################
# Writes the opening part of the HTML report (DOCTYPE through <body>),
# replacing any previous content of the target file.
# Arguments: $1 - path of the HTML file to create
#######################################
write_html_header() {
  # %b expands the \t escapes, so no literal tab characters are needed here.
  # The DOCTYPE carries the public and system identifiers as two separate
  # quoted strings, and the font name is spelled "Helvetica".
  printf '%b\n' \
    '<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">' \
    '<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="fr" lang="fr">' \
    '<head>' \
    '\t<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />' \
    '\t<title>Tableau de liens</title>' \
    '\t<style type="text/css">' \
    '\t\tbody {' \
    '\t\t\tfont-family: Verdana, Arial, Helvetica, sans-serif;' \
    '\t\t}' \
    '\t</style>' \
    '</head>' \
    '<body>' \
    > "$1"
}

write_html_header "$tablo"
#######################################
# Loads the three reference lists used while building the table.
# Globals written:
#   motiff      - whitespace-separated words of motif.txt
#   lang        - whitespace-separated words of langues.txt
#   definitions - lines of definitions.txt (surrounding blanks trimmed)
# All three arrays are filled starting at index 0.
# Arguments: $1 - directory containing the three files (default: .)
#######################################
load_reference_data() {
  local dir=${1:-.}
  local line

  motiff=()
  lang=()
  definitions=()

  # read -d '' consumes the whole file and -a splits it on whitespace.
  # Unlike an unquoted `cat`, this never glob-expands a word, so a search
  # pattern such as "m*" or "amou[r]" is stored literally.
  # read reports EOF with a non-zero status here; that is expected.
  read -r -d '' -a motiff < "$dir/motif.txt"
  read -r -d '' -a lang < "$dir/langues.txt"

  # One definition per line. -r keeps backslashes; the || clause keeps a
  # final line that has no trailing newline.
  while read -r line || [[ -n $line ]]; do
    definitions+=("$line")
  done < "$dir/definitions.txt"
}

load_reference_data .
#######################################
# Prints the charset declared in an HTML file, or nothing if none is found.
# Handles both content="...; charset=X" and <meta charset="X">.
# Arguments: $1 - path of the HTML file
# Outputs:   the charset name on stdout, without surrounding quotes
#######################################
detect_charset() {
  # -a and the C locale make grep treat the page as text even when it is
  # not valid UTF-8. head keeps only the first match, because -o can emit
  # several matches from one line.
  LC_ALL=C grep -a -E -o -i -m 1 "charset=[\"']?[^ \"';>/]+" -- "$1" \
    | head -n 1 \
    | cut -d'=' -f2 \
    | tr -d "\"'"
}

#######################################
# Escapes &, ", < and > so a string can be placed in an HTML attribute.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout
#######################################
html_escape() {
  # & is replaced first so the entities added afterwards are not re-escaped.
  printf '%s' "$1" \
    | sed -e 's/&/\&amp;/g' -e 's/"/\&quot;/g' -e 's/</\&lt;/g' -e 's/>/\&gt;/g'
}

# Main pass: one table per group of URL lists, one header row pair per
# list, one row per URL.
#
# NOTE(review): lang, motiff and definitions are filled from index 0 by the
# loading code, but j and k start at 1 here, so element 0 of each list is
# never used. Kept as in the original; confirm whether the input files
# carry a leading placeholder entry.
j=1
k=1
namelast=
table_open=0

for fic in ../URLS/*; do
  # An unmatched glob stays literal; skip it when ../URLS is empty.
  [[ -e $fic ]] || continue

  fname=${fic##*/}
  base=$(basename -- "$fic" .txt)
  # Group key: the file name cut at its first digit.
  name=${fname%%[0-9]*}

  # Start a new table when the group changes (or for the very first list).
  if (( ! table_open )) || [[ $name != "$namelast" ]]; then
    if (( table_open )); then
      echo "</table>" >> "$tablo"
      echo "</center>" >> "$tablo"
    fi
    langue=${lang[$j]}
    motif=${motiff[$j]}
    echo "<center><h2>$langue</h2>" >> "$tablo"
    echo "<table border=1>" >> "$tablo"
    table_open=1
    namelast=$name
    ((j+=1))
  fi

  echo "<tr><th colspan=\"4\">${definitions[$k]}</th></tr>" >> "$tablo"
  echo "<tr><td> Urls </td><td> Pages aspirées </td><td> Dump </td><td> Contexte </td></tr>" >> "$tablo"

  # Split the list on whitespace without glob-expanding the URLs
  # ("?" and "*" are ordinary characters in a URL).
  urls=()
  read -r -d '' -a urls < "$fic"

  i=1
  for nom in "${urls[@]}"; do
    page=../PAGES-ASPIREES/$base/$i.html
    dump=../DUMP-TEXT/$base/$i.txt
    ctx=../CONTEXTES/$base/$i.txt

    wget -O "$page" -- "$nom"

    # Retrieve the charset declared by the page.
    charset=$(detect_charset "$page")
    echo "charset ${charset:+ $charset}"

    # Pass -display_charset only when a charset was actually detected.
    lynx_opts=(-dump -nolist)
    if [[ -n $charset ]]; then
      lynx_opts+=("-display_charset=$charset")
    fi

    # A declared charset other than UTF-8 (any letter case) is re-encoded
    # to UTF-8 after the dump; otherwise the dump is stored as is.
    if [[ -n $charset && $charset != [Uu][Tt][Ff]-8 ]]; then
      echo "REMPLACEMENT DU CHARSET : $charset => UTF-8 !"
      lynx "${lynx_opts[@]}" "$page" | iconv -f "$charset" -t utf-8 > "$dump"
    else
      lynx "${lynx_opts[@]}" "$page" > "$dump"
    fi

    # Remove empty lines from the text dump.
    sed -i '/^$/d' "$dump"

    # Keep only the lines of the dump that match the search pattern.
    grep -E -i -- "$motif" "$dump" > "$ctx"

    # The URL is escaped for the href; every cell is closed with </td>.
    echo "<tr><td> <a href=\"$(html_escape "$nom")\">Lien $i</a> </td>" >> "$tablo"
    echo "<td> <a href=\"$page\">$i</a></td><td> <a href=\"$dump\">$i</a></td><td> <a href=\"$ctx\">$i</a></td></tr>" >> "$tablo"

    ((i+=1))
  done

  ((k+=1))
done

# Close the last table only if one was opened.
if (( table_open )); then
  echo "</table>" >> "$tablo"
  echo "</center>" >> "$tablo"
fi
echo "</body></html>" >> "$tablo"
echo "CREATION TABLEAU EST FINI"
|