You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
116 lines
3.0 KiB
Perl
116 lines
3.0 KiB
Perl
#!/usr/bin/perl -w
|
|
|
|
use File::Find;
|
|
use Encode;
|
|
|
|
# Walk the "de" directory tree and call filehandler() for every entry found.
find({ wanted => \&filehandler }, 'de');
|
|
|
|
# File::Find callback: reduces one dumped HTML page to its article content
# and rewrites the file in place.
#
# Input (set by File::Find, not passed as arguments):
#   $_                 - name of the current file, used to open it
#   $File::Find::name  - path of the current file, used only in messages
#
# Processing:
#   1. Files whose name does not end in ".html" are ignored.
#   2. The first <h1>...</h1> line supplies the title.
#   3. Everything up to and including the "<!-- start content -->" line is
#      replaced by the title; everything after the "<!-- end content -->"
#      line is dropped.
#   4. Each remaining line is cleaned up (whitespace, edit links, images,
#      title/class/style attributes, link targets).
#
# A file that cannot be read, or in which the title or content markers
# cannot be located, is reported on STDOUT and left untouched.
# Presumably the pages come from MediaWiki's dumpHTML -- confirm with caller.
sub filehandler {
    # Only touch .html files so that e.g. link.list files are not destroyed.
    return unless /\.html$/;

    my $file = $_;

    # Three-argument open: the file name can never be mistaken for a mode.
    my $in;
    unless (open($in, '<', $file)) {
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    my $i = 0;

    # Find the title. Tested with defined() so that a title of "0" counts.
    my $title;
    while (!defined $title && $i <= $#lines) {
        ($title) = $lines[$i] =~ /<h1.*?>(.+)<\/h1>$/;
        $i++;
    }
    if (!defined $title) {
        print "title not found in $File::Find::name\n";
    }

    # Find the start of the content (search continues after the title line).
    my $beginning;
    while (!defined $beginning && $i <= $#lines) {
        $beginning = $i if $lines[$i] =~ /<!-- start content -->/;
        $i++;
    }

    # Find the end of the content; the marker itself is stripped from its line.
    my $ending;
    while (!defined $ending && $i <= $#lines) {
        $ending = $i if $lines[$i] =~ s/<!-- end content -->//;
        $i++;
    }

    # Without both markers the splice offsets below are meaningless and
    # would wipe the file, so leave it alone.
    if (!defined $beginning || !defined $ending) {
        print "content markers not found in $File::Find::name\n";
        return;
    }

    # Replace: cut everything after the end line (leaving an empty
    # placeholder element), then swap the header for the title line.
    splice(@lines, $ending + 1, $#lines - $ending, "");
    splice(@lines, 0, $beginning + 1, "$title\n");

    # Blank out the line just before the end line (the last element is the
    # empty placeholder); per the original note it gets appended separately.
    splice(@lines, $#lines - 2, 1, "");

    # Index loop on purpose: the splices below change @lines while iterating.
    $i = 0;
    while ($i <= $#lines) {
        # a *very* dirty way to get rid of unicode chars in URLs
        $lines[$i] =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;

        # needed for proper uppercase detection
        $lines[$i] = decode_utf8($lines[$i]);

        # removing runs of two or more spaces
        $lines[$i] =~ s/[ ]{2,}//g;

        # removing tabs
        $lines[$i] =~ s/\t//g;

        # removing empty lines
        $lines[$i] =~ s/^\n$//g;

        # removing the six-line comment block at the end of some articles
        # (bounds check avoids reading past the end of @lines)
        if ($lines[$i] =~ /^<!-- $/) {
            if ($i + 5 <= $#lines && $lines[$i+5] =~ /^-->$/) {
                splice(@lines, $i, 6, "");
            }
        }

        # removing editsection links
        $lines[$i] =~ s/<span class=\"editsection\">.+?<\/span> //g;

        # converting tex images to tex inside of code tags
        $lines[$i] =~ s/<img class=\"tex\" alt=\"(.*?)\".*?\/>/<code>$1<\/code>/g;

        # delete all title attributes
        $lines[$i] =~ s/ title=\".*?\"//g;

        # remove rel attributes
        $lines[$i] =~ s/ rel=\"nofollow\"//g;

        # if a link has an uppercase letter beyond the first letter it has
        # 4 hex digits at the end which have to be removed
        $lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?\p{Lu}+[^\"]*?)_[a-f0-9]{4}\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;

        # all remaining links only need to be cleaned up
        $lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?)\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;

        # delete all thumbnail boxes (the opening line plus the next six)
        if ($lines[$i] =~ /<div class=\"thumb t(right|left)\">/) {
            splice(@lines, $i, 7, "");
        }

        # delete all spans, keeping their content
        $lines[$i] =~ s/<span .*?>(.*?)<\/span>/$1/g;

        # delete all class and style attributes
        $lines[$i] =~ s/ class=\".*?\"//g;
        $lines[$i] =~ s/ style=\".*?\"//g;

        # delete all remaining images
        $lines[$i] =~ s/<a .*?><img .*?\/><\/a>//g;
        $lines[$i] =~ s/<img .*?\/>//g;

        $lines[$i] = encode_utf8($lines[$i]);
        $i++;
    }

    # Save the changes; never print to a handle that failed to open.
    my $out;
    unless (open($out, '>', $file)) {
        print "can't write to $File::Find::name\n";
        return;
    }
    print {$out} @lines;
    close($out) or print "can't write to $File::Find::name\n";
    return;
}
|
|
|
|
#DONE: richtiges umbenennen der files - siehe mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|