#!/usr/bin/perl -w use File::Find; use Encode; find(\&filehandler, "de"); sub filehandler { if(/\.html$/) { #damit keine link.list dateien zerstört werden open(BLUB, "$_"); @lines = ; close(BLUB); $i=0; $title=""; $beginning=0; $ending=0; while(!$title && $i<=$#lines) { ($title) = $lines[$i] =~ /(.+)<\/h1>$/; $i++ } if(!$title) { #TODO detect if $title == "0" print "title not found in $File::Find::name\n"; } #Beginn suchen while(!$beginning && $i<=$#lines) { if($lines[$i] =~ //) { $beginning = $i; } $i++ } #Ende suchen while(!$ending && $i<=$#lines) { if($lines[$i] =~ s///) { $ending = $i; } $i++ } #ersetzten splice(@lines,$ending+1, $#lines-$ending, ""); splice(@lines,0,$beginning+1, "$title\n"); #vorletzte zeile löschen da diese seperat angefügt werden wird splice(@lines,$#lines-2, 1, ""); $i=0; while($i<=$#lines) { #a *very* dirty way to get rid of unicode chars in URLs $lines[$i] =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; #needed for proper uppercase detection $lines[$i] = decode_utf8($lines[$i]); #removing double spaces $lines[$i] =~ s/[ ]{2,}//g; #removing tabs $lines[$i] =~ s/\t//g; #removing empty lines $lines[$i] =~ s/^\n$//g; #removing the comment block on the end of some aricles if($lines[$i] =~ /^$/) { splice(@lines, $i, 6, ""); } } #removing editsection links $lines[$i] =~ s/.+?<\/span> //g; #converting tex images to tex inside of code tags $lines[$i] =~ s/\"(.*?)\".*?\//$1<\/code>/g; #delete all title attributes $lines[$i] =~ s/ title=\".*?\"//g; #remove rel attributes $lines[$i] =~ s/ rel=\"nofollow\"//g; #if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end wich have to be removed $lines[$i] =~ s///g; #all remaining links only need to be cleaned up $lines[$i] =~ s///g; #delete all thumbnail boxes if($lines[$i] =~ /