You cannot select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
48 lines
1.5 KiB
Perl
48 lines
1.5 KiB
Perl
10 years ago
|
#!/usr/bin/perl -w
|
||
|
|
||
|
use File::Find;
|
||
|
use Encode;
|
||
|
|
||
|
find(\&filehandler, "de");
|
||
|
|
||
|
sub filehandler {
|
||
|
if(-s $_ < 2048 and /\.html$/) {
|
||
|
open(BLUB, $_);
|
||
|
@lines = <BLUB>;
|
||
|
close(BLUB);
|
||
|
if($#lines < 4) {
|
||
|
print "file too small: $File::Find::name\n";
|
||
|
} else {
|
||
|
if(($href) = $lines[4] =~ /<meta http-equiv=\"Refresh\" content=\"0;url=..\/..\/..\/.{1,9}\/.{1,9}\/.{1,9}\/(.*?\.html)\" \/>/) {
|
||
|
|
||
|
$href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; #clean uri
|
||
|
|
||
|
$href = decode_utf8($href);
|
||
|
#if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end wich have to be removed
|
||
|
unless($href =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) {
|
||
|
#if this did not match it's all lowercase and has no hex to be removed
|
||
|
$href =~ s/^(.+?)\.html/$1/;
|
||
|
}
|
||
|
$href = encode_utf8($href);
|
||
|
|
||
|
$_ = decode_utf8($_);
|
||
|
#if a filename has an uppercase letter beyond the first letter it has 4 hex digits on the end wich have to be removed
|
||
|
unless($_ =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) {
|
||
|
#if this did not match it's all lowercase and has no hex to be removed
|
||
|
$_ =~ s/^(.+?)\.html$/$1/;
|
||
|
}
|
||
|
$_ = encode_utf8($_);
|
||
|
|
||
|
$links = $ENV{PWD} . "/" . $File::Find::dir . "/links.list";
|
||
|
open(LIST, ">>$links");
|
||
|
print LIST "$_ $href\n";
|
||
|
close(LIST);
|
||
|
} else {
|
||
|
print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
#DONE: richtiges umbenennen der files - siehe mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|