#!/bin/bash
#
# Builds an HTML report ("tableaux de liens") from lists of URLs.
#
# Input, read from stdin (one value per line):
#   REP    directory holding the URL list files (URLs separated by whitespace)
#   tablo  path of the HTML report to (over)write
#   motif  extended regular expression searched for in every page dump
#
# One table is written per list file. For every URL the page is fetched with
# curl, dumped to text with lynx, converted to UTF-8 with iconv, and the lines
# matching the motif are extracted. Files are written relative to the current
# working directory:
#   ../PAGES-ASPIREES/<table>/<n>.html        fetched page
#   ../DUMP-TEXT/<table>/<n>.txt              lynx dump
#   ../DUMP-TEXT/<table>/<n>-utf8.txt         dump converted to UTF-8
#   ../CONTEXTES/<table>/<n>-utf8.{txt,html}  lines matching the motif
#   ../FICHIERGLOBAUX/*_<table>.txt           all dumps / contexts of a table
#
# `set -e` is deliberately not used: a failing curl, iconv or grep for one URL
# is an expected outcome that is reported in the table, not a fatal error.

readonly PAGES_DIR="../PAGES-ASPIREES"
readonly DUMP_DIR="../DUMP-TEXT"
readonly CONTEXT_DIR="../CONTEXTES"
readonly GLOBAL_DIR="../FICHIERGLOBAUX"

# Text found in error pages that servers return even though curl exits with 0.
readonly CURL_ERROR_RE='(400 )?Bad request|Moved Permanently|s interdit|Not Acceptable|404 Not Found|Service Unavailable|The Document has moved'

# Prints a message on stderr and aborts the script.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Appends one line ($1) to the HTML report.
emit() {
  printf '%s\n' "$1" >> "$tablo"
}

# Prints a centered table cell. $1 = width, $2 = cell content.
cell() {
  printf '<td align="center" width="%s">%s</td>' "$1" "$2"
}

# Prints an HTML link. $1 = target, $2 = label.
link() {
  printf '<a href="%s">%s</a>' "$1" "$2"
}

# Prints the opening of a row: URL number ($1) and clickable URL ($2).
row_lead() {
  printf '<tr>%s%s' "$(cell 50 "$1")" "$(cell 100 "$(link "$2" "$2")")"
}

# Prints the charset names listed by `iconv -l` that contain $1
# (case-insensitive). Prints nothing when $1 is empty or unknown to iconv.
known_to_iconv() {
  [[ -n "$1" ]] || return 0
  # -F: the encoding name is data, not a regular expression.
  iconv -l | grep -Fio -- "$1" | sort -u
}

# Prints the first charset declared in the HTML file $1, or nothing.
# Accepts both charset=iso-8859-1 and the quoted form charset="utf-8".
declared_charset() {
  local decl
  # -a: pages whose bytes are invalid in the current locale would otherwise be
  # reported as "Binary file ... matches" instead of the matched text.
  decl=$(grep -a -Eio -m 1 "charset *= *[\"']?[A-Za-z0-9._:-]+" "$1" | head -n 1)
  decl=${decl#*=}
  decl=${decl#"${decl%%[! ]*}"} # drop the spaces that follow '='
  decl=${decl#\"}
  decl=${decl#\'}
  printf '%s' "$decl"
}

# Writes the row of a URL that could not be fetched.
# Arguments: $1 URL number, $2 URL, $3 curl status or error text.
emit_fetch_failure_row() {
  local idx=$1 url=$2 status=$3
  local dash row
  dash=$(cell 100 " - ")
  row="$(row_lead "$idx" "$url")${dash}"
  row+="<td align=\"center\" width=\"20\" bgcolor=\"lightcoral\">${status}</td>"
  row+="${dash}${dash}${dash}${dash}${dash}</tr>"
  emit "$row"
}

# Writes the row of a fetched page whose encoding could not be determined.
# Arguments: $1 table number, $2 URL number, $3 URL, $4 curl status.
emit_unknown_encoding_row() {
  local table=$1 idx=$2 url=$3 status=$4
  local gold='<td align="center" width="100" bgcolor="gold"><small>Encodage<br/>non détecté</small></td>'
  local dash='<td align="center"> - </td>'
  local row
  row="$(row_lead "$idx" "$url")"
  row+="$(cell 100 "$(link "$PAGES_DIR/$table/$idx.html" "$idx.html")")"
  row+="$(cell 20 "$status")"
  # 4 leading cells + 2 gold + 3 dashes = the 9 columns of the table.
  row+="${gold}${gold}${dash}${dash}${dash}</tr>"
  emit "$row"
}

# Dumps a fetched page, converts the dump to UTF-8, extracts the motif
# contexts, writes the table row and feeds the per-table global files.
# Arguments: $1 table number, $2 URL number, $3 URL, $4 curl status,
#            $5 encoding of the page,
#            $6 "link" to link the raw dump in the row, anything else for " - ".
# Globals:   motif (read), tablo (appended to via emit).
process_page() {
  local table=$1 idx=$2 url=$3 status=$4 enc=$5 raw_mode=$6
  local page="$PAGES_DIR/$table/$idx.html"
  local dump="$DUMP_DIR/$table/$idx.txt"
  local dump_utf8="$DUMP_DIR/$table/$idx-utf8.txt"
  local ctx="$CONTEXT_DIR/$table/$idx-utf8"
  # Group the motif so that the word boundaries apply to every alternative
  # of a motif such as "a|b", not only to the outer ones.
  local word_re="\\b(${motif})\\b"
  local count raw_cell row

  lynx -dump -nolist -display_charset="$enc" "$page" > "$dump"
  iconv -f "$enc" -t utf-8 "$dump" > "$dump_utf8"
  grep -Ei -e "$word_re" "$dump_utf8" > "$ctx.txt"
  # The .html context file has the same content as the .txt one.
  cp -- "$ctx.txt" "$ctx.html"
  # Counts occurrences, not matching lines, hence -o | wc -l and not -c.
  count=$(grep -Eoi -e "$word_re" "$dump_utf8" | wc -l)

  if [[ "$raw_mode" == "link" ]]; then
    raw_cell="$(link "$dump" "$idx.txt")<br/><small>($enc)</small>"
  else
    raw_cell=" - "
  fi

  row="$(row_lead "$idx" "$url")"
  row+="$(cell 100 "$(link "$page" "$idx.html")")"
  row+="$(cell 20 "$status")"
  row+="$(cell 100 "$raw_cell")"
  row+="$(cell 100 "$(link "$dump_utf8" "$idx-utf8.txt")")"
  row+="$(cell 100 "$(link "$ctx.txt" "$idx-utf8.txt")")"
  row+="$(cell 100 "$(link "$ctx.html" "$idx-utf8.html")")"
  row+="<td align=\"center\"><font color=\"darkmagenta\"><b>${count}</b></font></td></tr>"
  emit "$row"

  # NOTE: the global files are appended to and never truncated by this script.
  cat "$ctx.txt" >> "$GLOBAL_DIR/CONTEXTES-GLOBAUX_$table.txt"
  cat "$dump_utf8" >> "$GLOBAL_DIR/DUMP-GLOBAUX_$table.txt"
}

# Fetches one URL, determines its encoding and writes the matching table row.
# Arguments: $1 table number, $2 URL number, $3 URL.
process_url() {
  local table=$1 idx=$2 url=$3
  local page="$PAGES_DIR/$table/$idx.html"
  local status markers="" encoding verif

  echo "================================================================"
  echo "TRAITEMENT : $url "

  #------- fetch the page and check what came back ---------------------------
  curl -o "$page" "$url"
  status=$?
  if [[ -f "$page" ]]; then
    markers=$(grep -Eio -e "$CURL_ERROR_RE" "$page" | sort -u)
  fi
  # An error page overrides curl's exit code in the report.
  if [[ -n "$markers" ]]; then
    status=$markers
  fi
  echo "Retour curl = $status"

  # String comparison: status may hold text, not only a number.
  if [[ "$status" != "0" ]]; then
    emit_fetch_failure_row "$idx" "$url" "$status"
    return
  fi

  #------- encoding reported by file(1) ---------------------------------------
  encoding=$(file -i "$page" | cut -f2 -d=)
  echo "ENCODAGE initial : $encoding"

  if [[ "$encoding" == "utf-8" ]]; then
    process_page "$table" "$idx" "$url" "$status" "$encoding" "nolink"
    return
  fi

  verif=$(known_to_iconv "$encoding")
  if [[ -z "$verif" ]]; then
    # Not UTF-8 and the detected encoding is unknown to iconv:
    # fall back to a charset declared inside the page.
    if ! grep -Eqi 'charset *=' "$page"; then
      echo "Pas de charset detecte : on ne fait rien pour le DUMP... "
      emit_unknown_encoding_row "$table" "$idx" "$url" "$status"
      return
    fi
    echo "Presence d'un charset..."
    encoding=$(declared_charset "$page")
    echo "charset extrait : $encoding "
    # Before going on, check that iconv knows the declared charset.
    verif=$(known_to_iconv "$encoding")
    if [[ -z "$verif" ]]; then
      echo "VERIF : <$verif> ==> inconnu par inconv, on ne fait rien"
      emit_unknown_encoding_row "$table" "$idx" "$url" "$status"
      return
    fi
  fi

  echo "VERIF : <$verif> ==> connu par inconv, c'est parti ==> lynx, iconv..."
  echo "ENCODAGE final : $encoding (avant conversion vers utf-8)"
  process_page "$table" "$idx" "$url" "$status" "$encoding" "link"
}

# Prints one header cell of a table. $1 = width, $2 = title.
header_cell() {
  printf '<td align="center" width="%s" bgcolor="mediumorchid"><b>%s</b></td>' "$1" "$2"
}

# Reads the three inputs and writes the whole report.
main() {
  local j=1 # table counter
  local i   # URL counter inside a table
  local list_file nom header footer
  local -a urls

  # -r keeps the backslashes of the motif regex intact.
  read -r REP
  read -r tablo
  read -r motif

  [[ -d "$REP" ]] || die "REP is not a directory: '$REP'"
  [[ -n "$tablo" ]] || die "no output file given for the report"
  [[ -n "$motif" ]] || die "no motif given"

  printf '%s\n' '<html><head><title>tableaux de liens</title>' > "$tablo" \
    || die "cannot write to '$tablo'"
  emit '<meta http-equiv="Content-Type" content="text/html;charset=utf-8" /> '
  emit '</head><body>'

  for list_file in "$REP"/*; do
    [[ -f "$list_file" ]] || continue

    emit "<table align=\"center\" border=\"2\"><tr><td colspan=\"9\" align=\"center\" bgcolor=\"darkmagenta\"><font color=\"white\"><b>Tableau $j</b></font></td></tr>"
    header="<tr>$(header_cell 50 'URL')$(header_cell 100 'Lien URL')"
    header+="$(header_cell 100 'Page Aspiree')$(header_cell 20 'Retour Curl')"
    header+="$(header_cell 100 'Dump Initial')$(header_cell 100 'Dump utf-8')"
    header+="$(header_cell 100 'Contexte')$(header_cell 100 'Contexte<br/>html')"
    header+="$(header_cell 100 'Nb Occur')</tr>"
    emit "$header"

    # -p: the directories may already exist from a previous table or run.
    mkdir -p "$PAGES_DIR/$j" "$DUMP_DIR/$j" "$CONTEXT_DIR/$j" "$GLOBAL_DIR" \
      || die "cannot create the output directories"

    # Split the list on whitespace without glob-expanding the URLs.
    # read returns non-zero at end of file; the array is filled regardless.
    urls=()
    read -r -d '' -a urls < "$list_file"

    i=1
    for nom in "${urls[@]}"; do
      process_url "$j" "$i" "$nom"
      i=$((i + 1))
    done

    footer="<tr><td align=\"center\" colspan=\"5\"> </td>"
    footer+="<td align=\"center\" width=\"100\"><a href=\"$GLOBAL_DIR/DUMP-GLOBAUX_$j.txt\">Fichier DUMP<br/>global</a><br/></td>"
    footer+="<td align=\"center\" width=\"100\"><a href=\"$GLOBAL_DIR/CONTEXTES-GLOBAUX_$j.txt\">Fichier CONTEXTES<br/>global</a><br/></td>"
    footer+="<td align=\"center\" colspan=\"2\"> </td></tr>"
    emit "$footer"
    emit "</table>"
    emit "<p align=\"center\"><hr color=\"mediumorchid\" width=\"50%\"/></p>"

    j=$((j + 1))
  done

  emit "</body></html>"
}

main "$@"