commit 2329e4a4df59ba4c42210705693a06744b5e4181 Author: josch Date: Sat Jun 21 16:03:20 2014 +0200 initial commit diff --git a/README b/README new file mode 100644 index 0000000..5608bd2 --- /dev/null +++ b/README @@ -0,0 +1,44 @@ +Be careful! What you do is only for testing and submitting bugs! +Some important features may not be correctly implemented! + +mail me: j [dot] schauer [at] email [dot] de + +All code is released under GPLv3 or later + + +1. install squashfs-tools (provides mksquashfs) +on Ubuntu you do this via +sudo apt-get install squashfs-tools + +2. Obtain a Wikipedia HTML dump from static.wikipedia.org +be careful! the August 2007 version of the English Wikipedia is 120GB extracted!! + +3. extract it to the folder where your scripts are located +7zr x wikipedia-de-html.7z + +your directory structure for e.g. the German Wikipedia dump (26GB extracted) should look like this: + +de/ <= all your extracted files +README <= this file +remove_files.sh <= bash script for removing unnecessary files like talk pages +create_linkindex.pl <= perl script for creating a links.list in each directory +remove_everything.pl <= clean up all remaining junk in the dump files +rename.pl <= rename everything + +4. edit all scripts for your language! + +heavy editing is to be done on remove_files.sh +in the perl scripts you have to change this line according to your language folder: +find(\&filehandler, "de"); + +5. delete unnecessary files and dirs in your dump + rm -r de/COPYING.html de/index.html de/skins/ de/raw/ de/images/ de/upload/ +6. run remove_files.sh +7. run create_linkindex.pl +8. now delete all links with + find de/ -size -2k -type f -name '*.html' -delete +9. run remove_everything.pl +10. run rename.pl +11. create the image file with + mksquashfs /de your_image_name +12. test it and drop me a line if something went wrong! 
diff --git a/create_linkindex.pl b/create_linkindex.pl new file mode 100755 index 0000000..e7a9044 --- /dev/null +++ b/create_linkindex.pl @@ -0,0 +1,47 @@ +#!/usr/bin/perl -w + +use File::Find; +use Encode; + +find(\&filehandler, "de"); + +sub filehandler { + if(-s $_ < 2048 and /\.html$/) { + open(BLUB, $_); + @lines = ; + close(BLUB); + if($#lines < 4) { + print "file too small: $File::Find::name\n"; + } else { + if(($href) = $lines[4] =~ //) { + + $href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; #clean uri + + $href = decode_utf8($href); + #if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end wich have to be removed + unless($href =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) { + #if this did not match it's all lowercase and has no hex to be removed + $href =~ s/^(.+?)\.html/$1/; + } + $href = encode_utf8($href); + + $_ = decode_utf8($_); + #if a filename has an uppercase letter beyond the first letter it has 4 hex digits on the end wich have to be removed + unless($_ =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) { + #if this did not match it's all lowercase and has no hex to be removed + $_ =~ s/^(.+?)\.html$/$1/; + } + $_ = encode_utf8($_); + + $links = $ENV{PWD} . "/" . $File::Find::dir . 
"/links.list"; + open(LIST, ">>$links"); + print LIST "$_ $href\n"; + close(LIST); + } else { + print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n"; + } + } + } +} + +#DONE: richtiges umbenennen der files - siehe mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName diff --git a/mksquashfs b/mksquashfs new file mode 100755 index 0000000..0a8f2e4 Binary files /dev/null and b/mksquashfs differ diff --git a/mokopedia.py b/mokopedia.py new file mode 100644 index 0000000..426c637 --- /dev/null +++ b/mokopedia.py @@ -0,0 +1,78 @@ +#!/usr/bin/python + +import gtk +import bz2 +import gtkhtml2 +import time + +class Mokopedia: + document = gtkhtml2.Document() + + def delete_event(self, widget, event, data=None): + gtk.main_quit() + return False + + def __init__(self): + self.window = gtk.Window(gtk.WINDOW_TOPLEVEL) + self.window.set_title("Mokopedia") + self.window.connect("delete_event", self.delete_event) + self.window.set_border_width(0) + self.window.set_default_size(480,640) + + main_box = gtk.VBox() + + search_box = gtk.HBox() + + search_entry = gtk.Entry() + search_box.add(search_entry) + + search_btn = gtk.Button("Search") + search_btn.connect("clicked", self.search) + search_box.pack_start(search_btn, False, False, 0) + + #document.connect('request_url', request_url) + #document.connect('link_clicked', link_clicked) + + self.displayarticle("Alexander_the_Great") + + view = gtkhtml2.View() + view.set_document(self.document) + + sw = gtk.ScrolledWindow() + sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_ALWAYS) + sw.add(view) + + main_box.pack_start(search_box, False, False, 0) + + main_box.add(sw) + + self.window.add(main_box) + self.window.show_all() + + def search(self, widget): + print "blubber" + + def displayarticle(self, title): + self.document.clear() + self.document.open_stream('text/html') + + before = time.time() + #f = open(title + ".html.bz2") + f = open("Alexander_the_Great (another copy).html") + 
#self.document.write_stream(bz2.decompress(f.read())) + self.document.write_stream(f.read()) + f.close() + print time.time() - before + self.document.close_stream() + + +def main(): + gtk.gdk.threads_init() + gtk.gdk.threads_enter() + gtk.main() + gtk.gdk.threads_leave() + + +if (__name__ == '__main__'): + Mokopedia = Mokopedia() + main() diff --git a/remove_catlinks.pl b/remove_catlinks.pl new file mode 100755 index 0000000..a2796ba --- /dev/null +++ b/remove_catlinks.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl -w + +use File::Find; +use Encode; + +find(\&filehandler, "de"); + +sub filehandler { + if(-f $_ and $_ !~ /links\.list/) { #damit keine link.list dateien zerstört werden + open(BLUB, "$_"); + @lines = ; + close(BLUB); + + #Lazy... + $lines[$#lines] =~ s/
.*//; + + #änderungen speichern + open(FILE, ">$_") or print "can't write to $File::Find::name\n"; + print FILE @lines; + close(FILE); + } +} + +#DONE: richtiges umbenennen der files - siehe mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName diff --git a/remove_everything.pl b/remove_everything.pl new file mode 100755 index 0000000..d09bf5c --- /dev/null +++ b/remove_everything.pl @@ -0,0 +1,115 @@ +#!/usr/bin/perl -w + +use File::Find; +use Encode; + +find(\&filehandler, "de"); + +sub filehandler { + if(/\.html$/) { #damit keine link.list dateien zerstört werden + open(BLUB, "$_"); + @lines = ; + close(BLUB); + $i=0; + $title=""; + $beginning=0; + $ending=0; + while(!$title && $i<=$#lines) { + ($title) = $lines[$i] =~ /(.+)<\/h1>$/; + $i++ + } + + if(!$title) { #TODO detect if $title == "0" + print "title not found in $File::Find::name\n"; + } + + #Beginn suchen + while(!$beginning && $i<=$#lines) { + if($lines[$i] =~ //) { + $beginning = $i; + } + $i++ + } + #Ende suchen + while(!$ending && $i<=$#lines) { + if($lines[$i] =~ s///) { + $ending = $i; + } + $i++ + } + #ersetzten + splice(@lines,$ending+1, $#lines-$ending, ""); + splice(@lines,0,$beginning+1, "$title\n"); + #vorletzte zeile löschen da diese seperat angefügt werden wird + splice(@lines,$#lines-2, 1, ""); + + $i=0; + while($i<=$#lines) { + #a *very* dirty way to get rid of unicode chars in URLs + $lines[$i] =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; + + #needed for proper uppercase detection + $lines[$i] = decode_utf8($lines[$i]); + + #removing double spaces + $lines[$i] =~ s/[ ]{2,}//g; + + #removing tabs + $lines[$i] =~ s/\t//g; + + #removing empty lines + $lines[$i] =~ s/^\n$//g; + + #removing the comment block on the end of some aricles + if($lines[$i] =~ /^$/) { + splice(@lines, $i, 6, ""); + } + } + + #removing editsection links + $lines[$i] =~ s/.+?<\/span> //g; + + #converting tex images to tex inside of code tags + $lines[$i] =~ s/\"(.*?)\".*?\//$1<\/code>/g; + + #delete all 
title attributes + $lines[$i] =~ s/ title=\".*?\"//g; + + #remove rel attributes + $lines[$i] =~ s/ rel=\"nofollow\"//g; + + #if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end wich have to be removed + $lines[$i] =~ s///g; + + #all remaining links only need to be cleaned up + $lines[$i] =~ s///g; + + #delete all thumbnail boxes + if($lines[$i] =~ /
/) { + splice(@lines,$i,7, ""); + } + + #delete all spans + $lines[$i] =~ s/(.*?)<\/span>/$1/g; + + #delete all class and style attr. + $lines[$i] =~ s/ class=\".*?\"//g; + $lines[$i] =~ s/ style=\".*?\"//g; + + #delete alle remaining images + $lines[$i] =~ s/<\/a>//g; + $lines[$i] =~ s///g; + + $lines[$i] = encode_utf8($lines[$i]); + $i++; + } + + #änderungen speichern + open(FILE, ">$_") or print "can't write to $File::Find::name\n"; + print FILE @lines; + close(FILE); + } +} + +#DONE: richtiges umbenennen der files - siehe mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName diff --git a/remove_files.sh b/remove_files.sh new file mode 100755 index 0000000..db50988 --- /dev/null +++ b/remove_files.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +#echo lösche \"Kategorie~*\" +#find de/ -type f -name "Kategorie~*" -delete +echo lösche \"Kategorie_Diskussion~*\" +find de/ -type f -name "Kategorie_Diskussion~*" -delete +echo lösche \"Bild~*\" +find de/ -type f -name "Bild~*" -delete +echo lösche \"Bild_Diskussion~*\" +find de/ -type f -name "Bild_Diskussion~*" -delete +echo lösche \"Portal~*\" +find de/ -type f -name "Portal~*" -delete +echo lösche \"Portal_Diskussion~*\" +find de/ -type f -name "Portal_Diskussion~*" -delete +echo lösche \"Diskussion~*\" +find de/ -type f -name "Diskussion~*" -delete +echo lösche \"Vorlage~*\" +find de/ -type f -name "Vorlage~*" -delete +echo lösche \"Vorlage_Diskussion~*\" +find de/ -type f -name "Vorlage_Diskussion~*" -delete +echo lösche \"Benutzer~*\" +find de/ -type f -name "Benutzer~*" -delete +echo lösche \"Benutzer_Diskussion~*\" +find de/ -type f -name "Benutzer_Diskussion~*" -delete +#echo lösche \"Spezial~*\" +#find de/ -type f -name "Spezial~*" -delete +#echo lösche \"Wikipedia~*\" +#find de/ -type f -name "Wikipedia~*" -delete +echo lösche \"Wikipedia_Diskussion~*\" +find de/ -type f -name "Wikipedia_Diskussion~*" -delete +echo lösche \"MediaWiki~*\" +find de/ -type f -name "MediaWiki~*" -delete +echo lösche 
\"MediaWiki_Diskussion~*\" +find de/ -type f -name "MediaWiki_Diskussion~*" -delete +echo lösche \"Hilfe~*\" +find de/ -type f -name "Hilfe~*" -delete +echo lösche \"Hilfe_Diskussion~*\" +find de/ -type f -name "Hilfe_Diskussion~*" -delete +echo lösche \"WP~*\" +find de/ -type f -name "WP~*" -delete diff --git a/rename.pl b/rename.pl new file mode 100755 index 0000000..c09e816 --- /dev/null +++ b/rename.pl @@ -0,0 +1,24 @@ +#!/usr/bin/perl -w + +use File::Find; +use Encode; + +$pwd = $ENV{PWD}; +find(\&filehandler, "de"); + +sub filehandler { + if(/\.html$/) { + $_ = decode_utf8($_); + if($_ =~ s/(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html/$1/) { + $_ = encode_utf8($_); + rename( "$pwd/$File::Find::name", "$pwd/$File::Find::dir/$_"); + } elsif($_ =~ s/(.+?)\.html/$1/) { + $_ = encode_utf8($_); + rename( "$pwd/$File::Find::name", "$pwd/$File::Find::dir/$_"); + } else { + print "couldn't find filename pattern in $File::Find::name\n"; + } + } +} + +#DONE: richtiges umbenennen der files - siehe mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName diff --git a/stats b/stats new file mode 100644 index 0000000..0d9a730 --- /dev/null +++ b/stats @@ -0,0 +1,71 @@ +==originial p7z: + size: 2,124,093 + dirs: 34125 + files: 1724658 + +==entpackt: + size: 26,124,760 + +==löschung unnützer dateien: + size: 15,884,704 + files: 1135661 + + **davon: + kaputt: 116 + links: 454132 + +==hinzufügen der links.list files TODO: dateigröße nach unten +#create_linkindex.pl + size: 16,004,220 + files: 1156229 + +==löschen der links: +#find de/ -size -2k -type f -name '*.html' -delete + size: 14,183,948 + files: 701044 + +==extrahieren des inhalts +#extract_content.pl + size: 8575264 + +==whitespaces und kommentare entfernen +#clean_whitespaces_comments.pl + size: 8509992 + +==editsection einträge entfernen +#clean_edits.pl + size: 8197228 + +==title attribute entfernen +#clean_titles.pl + size: 7400248 + +==tex images umwandeln +#clean_tex.pl + size: 7395216 + +==links umwandeln 
TODO: mit clean titles zusammenlegen +#clean_links.pl + size: 6774260 + +==thumbnail boxen löschen +#clean_thumbnails.pl + size: 6515720 + +==spans, class attr, style attr. löschen +#clean_css_markup.pl + size: 5995296 + +==bilder löschen +#clean_images.pl + size: 5730456 + +==alle umbenennen +find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?[[:upper:]]+.*?)(_[a-f0-9]{4})\.html$/$1$2/' -- + +find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?)\.html$/$1$2/' -- + + +in april: 664444 items, totalling 3.5 GB => 1GB + 701044 => 1.1GB + diff --git a/test.pl b/test.pl new file mode 100644 index 0000000..f54297f --- /dev/null +++ b/test.pl @@ -0,0 +1,7 @@ +#!/usr/bin/perl -w + +$test = "%6d%69%74%73%75%68%69%6b%6f%40%75%62%75%6e%74%75%2e%63%6f%6d"; + +$test =~ s/%([0-9a-f]{2})/chr(hex($1))/eg; + +print $test; diff --git a/time b/time new file mode 100644 index 0000000..aff4732 --- /dev/null +++ b/time @@ -0,0 +1,9 @@ +remove 11m +extract 110m +remove_files 22m +create_links - +delete_links 12m +remove_everyth 118m +remove_catlink 31m +rename 11m +mksquashfs 62m