initial commit

main
josch 10 years ago
commit 2329e4a4df

@ -0,0 +1,44 @@
Be careful! What you do is only for testing and submitting bugs!
Some important features may not be correctly implemented!
mail me: j [dot] schauer [at] email [dot] de
All code is released under GPLv3 or later
1. install mksquashfs-tools
on ubuntu you do this via
sudo apt-get install squashfs-tools
2. Obtain a Wikipedia HTML dump from static.wikipedia.org
be careful! the august 2007 version of the english Wikipedia is 120GB extracted!!
3. extract it to the folder where your scripts are located
7zr x wikipedia-de-html.7z
your directory structure for eg. german wikipedia dump (26GB extracted) should look like this:
de/ <= all your extracted files
README <= this file
remove_files.sh <= bash script for removing unnecessary files like talk pages
create_linkindex.pl <= perl script for creating a links.list in each directory
remove_everything.pl <= clean up all remaining junk in the dump files
rename.pl <= rename everything
4. edit all scripts for your language!
heavy editing is to be done on remove_files.sh
in the perl scripts you have to change this line according to your language folder:
find(\&filehandler, "de");
5. delete unnecessary files and dirs in your dump
rm -r de/COPYING.html de/index.html de/skins/ de/raw/ de/images/ de/upload/
6. run remove_files.sh
7. run create_linkindex.pl
8. now delete all links with
find de/ -size -2k -type f -name '*.html' -delete
9. run remove_everything.pl
10. run rename.pl
11. create the image file with
mksquashfs de/ your_image_name
12. test it and drop me a line if sth. went wrong!

@ -0,0 +1,47 @@
#!/usr/bin/perl
# create_linkindex.pl -- walk the extracted Wikipedia HTML dump and, for every
# small redirect stub page, append a "<source> <target>" line to a links.list
# file in the stub's directory.  Redirect stubs are recognised by their size
# (< 2048 bytes) and by the <meta http-equiv="Refresh"> tag on line 5.
use strict;
use warnings;
use File::Find;
use Encode;

# NOTE(review): hard-coded to the German dump directory; change "de" for
# other languages (see the README).
find(\&filehandler, "de");

# File::Find callback.  Inside the callback, $_ holds the basename and the
# current working directory is the file's own directory.
sub filehandler {
    # Only regular, small .html files can be redirect stubs.  Using the
    # cached "_" handle avoids a second stat() and an undef warning that
    # "-s" would raise on empty files.
    return unless -f $_ and /\.html$/ and (-s _) < 2048;

    open my $fh, '<', $_ or do {
        print "can't read $File::Find::name\n";
        return;
    };
    my @lines = <$fh>;
    close $fh;

    if ($#lines < 4) {
        print "file too small: $File::Find::name\n";
        return;
    }

    # Line 5 of a stub looks like:
    #   <meta http-equiv="Refresh" content="0;url=../../../x/y/z/Target.html" />
    my ($href) = $lines[4] =~ /<meta http-equiv=\"Refresh\" content=\"0;url=..\/..\/..\/.{1,9}\/.{1,9}\/.{1,9}\/(.*?\.html)\" \/>/;
    unless (defined $href) {
        print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
        return;
    }

    # Both the link target and this file's own name use the same dump
    # naming scheme, so they share one cleanup routine.
    my $target = clean_name($href);
    my $source = clean_name($_);

    # Append the pair to this directory's link index.
    my $links = $ENV{PWD} . "/" . $File::Find::dir . "/links.list";
    open my $list, '>>', $links or do {
        print "can't append to $links\n";
        return;
    };
    print {$list} "$source $target\n";
    close $list;
}

# Strip the ".html" suffix from a dump file name.  Names that contain an
# uppercase letter beyond the first character additionally carry a 4-hex-digit
# disambiguation suffix ("_xxxx") which is removed as well.
sub clean_name {
    my ($name) = @_;
    $name =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;    # decode percent-escapes in the URI
    $name = decode_utf8($name);                   # needed so \p{Lu} sees real characters
    unless ($name =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) {
        # No match means the name is all-lowercase and carries no hex suffix.
        $name =~ s/^(.+?)\.html$/$1/;
    }
    return encode_utf8($name);
}
# DONE: proper renaming of the files - see
# mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName

Binary file not shown.

@ -0,0 +1,78 @@
#!/usr/bin/python
import gtk
import bz2
import gtkhtml2
import time
class Mokopedia:
document = gtkhtml2.Document()
def delete_event(self, widget, event, data=None):
gtk.main_quit()
return False
def __init__(self):
self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
self.window.set_title("Mokopedia")
self.window.connect("delete_event", self.delete_event)
self.window.set_border_width(0)
self.window.set_default_size(480,640)
main_box = gtk.VBox()
search_box = gtk.HBox()
search_entry = gtk.Entry()
search_box.add(search_entry)
search_btn = gtk.Button("Search")
search_btn.connect("clicked", self.search)
search_box.pack_start(search_btn, False, False, 0)
#document.connect('request_url', request_url)
#document.connect('link_clicked', link_clicked)
self.displayarticle("Alexander_the_Great")
view = gtkhtml2.View()
view.set_document(self.document)
sw = gtk.ScrolledWindow()
sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_ALWAYS)
sw.add(view)
main_box.pack_start(search_box, False, False, 0)
main_box.add(sw)
self.window.add(main_box)
self.window.show_all()
def search(self, widget):
print "blubber"
def displayarticle(self, title):
self.document.clear()
self.document.open_stream('text/html')
before = time.time()
#f = open(title + ".html.bz2")
f = open("Alexander_the_Great (another copy).html")
#self.document.write_stream(bz2.decompress(f.read()))
self.document.write_stream(f.read())
f.close()
print time.time() - before
self.document.close_stream()
def main():
gtk.gdk.threads_init()
gtk.gdk.threads_enter()
gtk.main()
gtk.gdk.threads_leave()
if (__name__ == '__main__'):
Mokopedia = Mokopedia()
main()

@ -0,0 +1,24 @@
#!/usr/bin/perl
# remove_catlinks -- strip the trailing category-links block from every file
# in the extracted dump by truncating the last line at <div id="catlinks">.
use strict;
use warnings;
use File::Find;

# NOTE(review): hard-coded to the German dump directory; change "de" for
# other languages (see the README).
find(\&filehandler, "de");

# File::Find callback: rewrite each regular file in place.  links.list
# index files are left untouched so they are not destroyed.
sub filehandler {
    return unless -f $_ and $_ !~ /links\.list/;

    open my $in, '<', $_ or do {
        print "can't read $File::Find::name\n";
        return;
    };
    my @lines = <$in>;
    close $in;

    # Nothing to rewrite in an empty file (also avoids touching $lines[-1]
    # of an empty array).
    return unless @lines;

    # Lazy: the category block always sits on the final line, so cutting
    # everything from the marker onwards is enough.
    $lines[-1] =~ s/<div id=\"catlinks\">.*//;

    # Save the change back to the same file.
    open my $out, '>', $_ or do {
        print "can't write to $File::Find::name\n";
        return;
    };
    print {$out} @lines;
    close $out;
}
# DONE: proper renaming of the files - see
# mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName

@ -0,0 +1,115 @@
#!/usr/bin/perl -w
# remove_everything -- reduce every dump .html file to its bare article
# content: keep only the lines between the "start content" / "end content"
# markers, put the <h1> title on top, and strip navigation and markup
# clutter (editsection links, thumbnail boxes, spans, class/style/title
# attributes, images) while rewriting relative article links to plain names.
use File::Find;
use Encode;
# NOTE(review): hard-coded to the German dump directory; change "de" for
# other languages (see the README).
find(\&filehandler, "de");
# File::Find callback: rewrite one .html file in place.
sub filehandler {
if(/\.html$/) { # only process .html files (keeps links.list files intact)
open(BLUB, "$_");
@lines = <BLUB>;
close(BLUB);
$i=0;
$title="";
$beginning=0;
$ending=0;
# Scan forward for the article title in the first <h1>...</h1> line.
while(!$title && $i<=$#lines) {
($title) = $lines[$i] =~ /<h1.*?>(.+)<\/h1>$/;
$i++
}
if(!$title) { #TODO detect if $title == "0"
print "title not found in $File::Find::name\n";
}
# Find the line index of the start-of-content marker.
while(!$beginning && $i<=$#lines) {
if($lines[$i] =~ /<!-- start content -->/) {
$beginning = $i;
}
$i++
}
# Find the line index of the end-of-content marker (the marker itself is
# removed from that line by the s///).
while(!$ending && $i<=$#lines) {
if($lines[$i] =~ s/<!-- end content -->//) {
$ending = $i;
}
$i++
}
# Replace: drop everything after the end marker and everything up to and
# including the start marker, putting the title line first.
# NOTE(review): if a marker was not found, $beginning/$ending stay 0 and
# these splices cut at the wrong place -- confirm all dump files carry
# both markers.
splice(@lines,$ending+1, $#lines-$ending, "");
splice(@lines,0,$beginning+1, "$title\n");
# Delete the second-to-last line since it will be re-appended separately.
splice(@lines,$#lines-2, 1, "");
$i=0;
# Second pass: per-line cleanup of the remaining content.
while($i<=$#lines) {
#a *very* dirty way to get rid of unicode chars in URLs
$lines[$i] =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;
#needed for proper uppercase detection
$lines[$i] = decode_utf8($lines[$i]);
#removing double spaces
$lines[$i] =~ s/[ ]{2,}//g;
#removing tabs
$lines[$i] =~ s/\t//g;
#removing empty lines
$lines[$i] =~ s/^\n$//g;
# Removing the comment block at the end of some articles.
# NOTE(review): assumes the "<!--" ... "-->" block is exactly 6 lines.
if($lines[$i] =~ /^<!-- $/) {
if($lines[$i+5] =~ /^-->$/) {
splice(@lines, $i, 6, "");
}
}
#removing editsection links
$lines[$i] =~ s/<span class=\"editsection\">.+?<\/span> //g;
#converting tex images to tex inside of code tags
$lines[$i] =~ s/<img class=\"tex\" alt=\"(.*?)\".*?\/>/<code>$1<\/code>/g;
#delete all title attributes
$lines[$i] =~ s/ title=\".*?\"//g;
#remove rel attributes
$lines[$i] =~ s/ rel=\"nofollow\"//g;
#if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end which have to be removed
$lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?\p{Lu}+[^\"]*?)_[a-f0-9]{4}\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;
#all remaining links only need to be cleaned up
$lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?)\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;
# Delete all thumbnail boxes.
# NOTE(review): assumes every thumb <div> spans exactly 7 lines.
if($lines[$i] =~ /<div class=\"thumb t(right|left)\">/) {
splice(@lines,$i,7, "");
}
#delete all spans
$lines[$i] =~ s/<span .*?>(.*?)<\/span>/$1/g;
#delete all class and style attr.
$lines[$i] =~ s/ class=\".*?\"//g;
$lines[$i] =~ s/ style=\".*?\"//g;
# Delete all remaining images (linked or bare).
$lines[$i] =~ s/<a .*?><img .*?\/><\/a>//g;
$lines[$i] =~ s/<img .*?\/>//g;
$lines[$i] = encode_utf8($lines[$i]);
$i++;
}
# Save the changes back to the same file.
open(FILE, ">$_") or print "can't write to $File::Find::name\n";
print FILE @lines;
close(FILE);
}
}
# DONE: proper renaming of the files - see
# mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName

@ -0,0 +1,40 @@
#!/bin/bash
# remove_files.sh -- delete all pages of unwanted MediaWiki namespaces from
# the extracted German dump under de/ (talk pages, images, templates, ...).
# Dump files are named "<Namespace>~<Title>"; edit the list below when
# processing a dump in another language.
#
# Deliberately NOT deleted (these were commented out in the original):
#   Kategorie~*  Spezial~*  Wikipedia~*
namespaces=(
    Kategorie_Diskussion
    Bild
    Bild_Diskussion
    Portal
    Portal_Diskussion
    Diskussion
    Vorlage
    Vorlage_Diskussion
    Benutzer
    Benutzer_Diskussion
    Wikipedia_Diskussion
    MediaWiki
    MediaWiki_Diskussion
    Hilfe
    Hilfe_Diskussion
    WP
)

for ns in "${namespaces[@]}"; do
    # Same message and delete command as the original per-namespace lines.
    echo lösche "\"${ns}~*\""
    find de/ -type f -name "${ns}~*" -delete
done

@ -0,0 +1,24 @@
#!/usr/bin/perl
# rename.pl -- strip the ".html" suffix (and, where present, the 4-hex-digit
# disambiguation suffix) from every file name in the extracted dump.
use strict;
use warnings;
use File::Find;
use Encode;

my $pwd = $ENV{PWD};

# NOTE(review): hard-coded to the German dump directory; change "de" for
# other languages (see the README).
find(\&filehandler, "de");

# File::Find callback: compute the cleaned-up basename and rename the file
# in place.  Works on a copy instead of mutating File::Find's $_.
sub filehandler {
    return unless /\.html$/;

    # Decode so that \p{Lu} matches real uppercase characters.
    my $name = decode_utf8($_);

    # Names with an uppercase letter beyond the first character carry a
    # "_xxxx" hex suffix that goes along with ".html"; all-lowercase names
    # only lose the ".html" extension.
    if (   $name =~ s/(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html/$1/
        or $name =~ s/(.+?)\.html/$1/) {
        $name = encode_utf8($name);
        rename "$pwd/$File::Find::name", "$pwd/$File::Find::dir/$name"
            or print "can't rename $File::Find::name: $!\n";
    } else {
        print "couldn't find filename pattern in $File::Find::name\n";
    }
}
# DONE: proper renaming of the files - see
# mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName

71
stats

@ -0,0 +1,71 @@
==original p7z:
size: 2,124,093
dirs: 34125
files: 1724658
==entpackt:
size: 26,124,760
==löschung unnützer dateien:
size: 15,884,704
files: 1135661
**davon:
kaputt: 116
links: 454132
==hinzufügen der links.list files TODO: dateigröße nach unten
#create_linkindex.pl
size: 16,004,220
files: 1156229
==löschen der links:
#find de/ -size -2k -type f -name '*.html' -delete
size: 14,183,948
files: 701044
==extrahieren des inhalts
#extract_content.pl
size: 8575264
==whitespaces und kommentare entfernen
#clean_whitespaces_comments.pl
size: 8509992
==editsection einträge entfernen
#clean_edits.pl
size: 8197228
==title attribute entfernen
#clean_titles.pl
size: 7400248
==tex images umwandeln
#clean_tex.pl
size: 7395216
==links umwandeln TODO: mit clean titles zusammenlegen
#clean_links.pl
size: 6774260
==thumbnail boxen löschen
#clean_thumbnails.pl
size: 6515720
==spans, class attr, style attr. löschen
#clean_css_markup.pl
size: 5995296
==bilder löschen
#clean_images.pl
size: 5730456
==alle umbenennen
find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?[[:upper:]]+.*?)(_[a-f0-9]{4})\.html$/$1$2/' --
find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?)\.html$/$1$2/' --
in april: 664444 items, totalling 3.5 GB => 1GB
701044 => 1.1GB

@ -0,0 +1,7 @@
#!/usr/bin/perl
# Tiny scratch script: verify that percent-encoded URI bytes decode correctly.
use strict;
use warnings;

# Decode %XX percent-escapes (lowercase hex only, matching the dump's
# encoding) in a copy of the string and return the result.
sub decode_percent {
    my ($str) = @_;
    $str =~ s/%([0-9a-f]{2})/chr(hex($1))/eg;
    return $str;
}

my $test = "%6d%69%74%73%75%68%69%6b%6f%40%75%62%75%6e%74%75%2e%63%6f%6d";
print decode_percent($test);

@ -0,0 +1,9 @@
remove 11m
extract 110m
remove_files 22m
create_links -
delete_links 12m
remove_everyth 118m
remove_catlink 31m
rename 11m
mksquashfs 62m
Loading…
Cancel
Save