initial commit
This commit is contained in:
commit
2329e4a4df
11 changed files with 459 additions and 0 deletions
44
README
Normal file
44
README
Normal file
|
@ -0,0 +1,44 @@
|
||||||
|
Be careful! What you do is only for testing and submitting bugs!
|
||||||
|
Some important features may not be correctly implemented!
|
||||||
|
|
||||||
|
mail me: j [dot] schauer [at] email [dot] de
|
||||||
|
|
||||||
|
All code is released under GPLv3 or later
|
||||||
|
|
||||||
|
|
||||||
|
1. install squashfs-tools (provides mksquashfs)
|
||||||
|
on ubuntu you do this via
|
||||||
|
sudo apt-get install squashfs-tools
|
||||||
|
|
||||||
|
2. Obtain a Wikipedia HTML dump from static.wikipedia.org
|
||||||
|
be careful! the August 2007 version of the English Wikipedia is 120GB extracted!!
|
||||||
|
|
||||||
|
3. extract it to the folder where your scripts are located
|
||||||
|
7zr x wikipedia-de-html.7z
|
||||||
|
|
||||||
|
your directory structure for e.g. the German Wikipedia dump (26GB extracted) should look like this:
|
||||||
|
|
||||||
|
de/ <= all your extracted files
|
||||||
|
README <= this file
|
||||||
|
remove_files.sh <= bash script for removing unnecessary files like talk pages
|
||||||
|
create_linkindex.pl <= perl script for creating a links.list in each directory
|
||||||
|
remove_everything.pl <= clean up all remaining junk in the dump files
|
||||||
|
rename.pl <= rename everything
|
||||||
|
|
||||||
|
4. edit all scripts for your language!
|
||||||
|
|
||||||
|
heavy editing is to be done on remove_files.sh
|
||||||
|
in the perl scripts you have to change this line according to your language folder:
|
||||||
|
find(\&filehandler, "de");
|
||||||
|
|
||||||
|
5. delete unnecessary files and dirs in your dump
|
||||||
|
rm -r de/COPYING.html de/index.html de/skins/ de/raw/ de/images/ de/upload/
|
||||||
|
6. run remove_files.sh
|
||||||
|
7. run create_linkindex.pl
|
||||||
|
8. now delete all link (redirect) files, i.e. all html files smaller than 2k, with
|
||||||
|
find de/ -size -2k -type f -name '*.html' -delete
|
||||||
|
9. run remove_everything.pl
|
||||||
|
10. run rename.pl
|
||||||
|
11. create the image file with
|
||||||
|
mksquashfs de/ your_image_name
|
||||||
|
12. test it and drop me a line if sth. went wrong!
|
47
create_linkindex.pl
Executable file
47
create_linkindex.pl
Executable file
|
@ -0,0 +1,47 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use File::Find;
|
||||||
|
use Encode;
|
||||||
|
|
||||||
|
# Build a links.list file in every directory of the dump.
#
# Usage: create_linkindex.pl [DUMP_DIR]   (DUMP_DIR defaults to "de")
find(\&filehandler, @ARGV ? $ARGV[0] : "de");

# Convert a dump file name ("Name_1a2b.html" or "name.html") into the plain
# article name. Takes and returns a UTF-8 encoded byte string; the name is
# decoded internally so that \p{Lu} can detect non-ASCII uppercase letters.
sub strip_dump_suffix {
    my ($name) = @_;

    $name = decode_utf8($name);
    # if a name has an uppercase letter beyond the first letter it has 4 hex
    # digits on the end which have to be removed
    unless ($name =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) {
        # if this did not match it's all lowercase and has no hex to be removed
        $name =~ s/^(.+?)\.html$/$1/;
    }
    return encode_utf8($name);
}

# File::Find callback. Every .html file smaller than 2048 bytes is treated as
# a link file: the target is read from the meta refresh tag on its fifth line
# and "<source> <target>" is appended to links.list in the same directory.
# Files that cannot be read or do not match are reported on stdout.
sub filehandler {
    my $size = (-s $_) || 0;    # -s yields no number for empty files
    return unless /\.html$/ and -f $_ and $size < 2048;

    my $in;
    unless (open($in, "<", $_)) {
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    if (@lines < 5) {
        print "file too small: $File::Find::name\n";
        return;
    }

    my ($href) = $lines[4] =~ /<meta http-equiv=\"Refresh\" content=\"0;url=..\/..\/..\/.{1,9}\/.{1,9}\/.{1,9}\/(.*?\.html)\" \/>/;
    unless (defined $href) {
        print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
        return;
    }

    $href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;    #clean uri

    # Work on copies: File::Find's $_ must not be modified by the callback.
    my $target = strip_dump_suffix($href);
    my $source = strip_dump_suffix($_);

    # File::Find has already chdir()'d into the file's directory, so a
    # relative path is enough and does not depend on $ENV{PWD} being set.
    my $list;
    unless (open($list, ">>", "links.list")) {
        print "can't append to $File::Find::dir/links.list: $!\n";
        return;
    }
    print {$list} "$source $target\n";
    close($list);
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
BIN
mksquashfs
Executable file
BIN
mksquashfs
Executable file
Binary file not shown.
78
mokopedia.py
Normal file
78
mokopedia.py
Normal file
|
@ -0,0 +1,78 @@
|
||||||
|
#!/usr/bin/python
|
||||||
|
|
||||||
|
import gtk
|
||||||
|
import bz2
|
||||||
|
import gtkhtml2
|
||||||
|
import time
|
||||||
|
|
||||||
|
class Mokopedia:
    """Main window of the Mokopedia article viewer.

    Builds a 480x640 top-level GTK window containing a search row (entry
    plus "Search" button) above a scrollable gtkhtml2 view, and loads one
    article into the view on start-up.
    """

    # HTML document rendered by the view. It is defined on the class, so it
    # is created once at class-definition time and shared by all instances.
    document = gtkhtml2.Document()

    def delete_event(self, widget, event, data=None):
        """Quit the GTK main loop when the window receives delete_event.

        Returns False, as the original handler did.
        """
        gtk.main_quit()
        return False

    def __init__(self):
        """Create the widgets, load the start article and show the window."""
        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
        self.window.set_title("Mokopedia")
        self.window.connect("delete_event", self.delete_event)
        self.window.set_border_width(0)
        self.window.set_default_size(480, 640)

        main_box = gtk.VBox()

        # Search row: the entry takes the free space, the button its natural size.
        search_box = gtk.HBox()

        search_entry = gtk.Entry()
        search_box.add(search_entry)

        search_btn = gtk.Button("Search")
        search_btn.connect("clicked", self.search)
        search_box.pack_start(search_btn, False, False, 0)

        # Not wired up yet:
        #document.connect('request_url', request_url)
        #document.connect('link_clicked', link_clicked)

        self.displayarticle("Alexander_the_Great")

        view = gtkhtml2.View()
        view.set_document(self.document)

        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_ALWAYS)
        sw.add(view)

        main_box.pack_start(search_box, False, False, 0)
        main_box.add(sw)

        self.window.add(main_box)
        self.window.show_all()

    def search(self, widget):
        """Handler for the Search button; currently only prints a placeholder."""
        print("blubber")

    def displayarticle(self, title):
        """Replace the document content with an article and print the load time.

        NOTE(review): ``title`` is currently ignored; a fixed development
        file is loaded instead. The commented-out lines show the apparently
        intended bz2-compressed, per-title lookup. Confirm before changing.
        """
        self.document.clear()
        self.document.open_stream('text/html')
        try:
            before = time.time()
            #f = open(title + ".html.bz2")
            f = open("Alexander_the_Great (another copy).html")
            try:
                #self.document.write_stream(bz2.decompress(f.read()))
                self.document.write_stream(f.read())
            finally:
                # close the file even if reading or writing the stream fails
                f.close()
            print(time.time() - before)
        finally:
            # never leave the document stream open, even on error
            self.document.close_stream()
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Run the GTK main loop inside the GDK threads lock."""
    gtk.gdk.threads_init()
    gtk.gdk.threads_enter()
    try:
        gtk.main()
    finally:
        # release the GDK lock even if the main loop raises
        gtk.gdk.threads_leave()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # Keep the instance under its own name; the original rebound the class
    # name "Mokopedia" to the instance, hiding the class afterwards.
    app = Mokopedia()
    main()
|
24
remove_catlinks.pl
Executable file
24
remove_catlinks.pl
Executable file
|
@ -0,0 +1,24 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use File::Find;
|
||||||
|
use Encode;
|
||||||
|
|
||||||
|
# Strip the category links block from the last line of every dump file.
#
# Usage: remove_catlinks.pl [DUMP_DIR]   (DUMP_DIR defaults to "de")
find(\&filehandler, @ARGV ? $ARGV[0] : "de");

# File::Find callback. For every regular file except links.list, removes
# everything from <div id="catlinks"> to the end of the last line and writes
# the file back. Unreadable, empty and unchanged files are left untouched.
sub filehandler {
    # skip links.list files so that they are not destroyed
    return unless -f $_ and $_ !~ /links\.list/;

    my $in;
    unless (open($in, "<", $_)) {
        # without this check an unreadable file would be truncated below
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    # $lines[-1] on an empty array is a fatal error when modified
    return unless @lines;

    #Lazy... only the last line is checked for the catlinks block
    return unless $lines[-1] =~ s/<div id=\"catlinks\">.*//;

    # save the changes
    my $out;
    unless (open($out, ">", $_)) {
        print "can't write to $File::Find::name\n";
        return;
    }
    print {$out} @lines;
    close($out);
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
115
remove_everything.pl
Executable file
115
remove_everything.pl
Executable file
|
@ -0,0 +1,115 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use File::Find;
|
||||||
|
use Encode;
|
||||||
|
|
||||||
|
# Reduce every article page of the dump to its title and cleaned-up content.
#
# Usage: remove_everything.pl [DUMP_DIR]   (DUMP_DIR defaults to "de")
find(\&filehandler, @ARGV ? $ARGV[0] : "de");

# File::Find callback. For every .html file:
#   1. find the <h1> title, the "start content" and the "end content" markers
#      (in that order),
#   2. keep only the title and the lines between the two markers,
#   3. clean each remaining line (escapes, whitespace, attributes, links,
#      thumbnails, spans, images),
#   4. write the result back to the same file.
# A file in which the title or one of the markers is missing is reported and
# left untouched instead of being overwritten with an empty page; this also
# makes a second run over already processed files harmless.
sub filehandler {
    # only .html files, so that links.list files are not destroyed
    return unless /\.html$/;

    my $in;
    unless (open($in, "<", $_)) {
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    my $i = 0;
    my $title;
    my $beginning = 0;
    my $ending = 0;

    # search for the title; "defined" so that a title of "0" is accepted
    while (!defined $title && $i <= $#lines) {
        ($title) = $lines[$i] =~ /<h1.*?>(.+)<\/h1>$/;
        $i++;
    }

    unless (defined $title) {
        print "title not found in $File::Find::name\n";
        return;
    }

    # search for the beginning of the content
    while (!$beginning && $i <= $#lines) {
        if ($lines[$i] =~ /<!-- start content -->/) {
            $beginning = $i;
        }
        $i++;
    }

    # search for the end of the content (the marker itself is removed)
    while (!$ending && $i <= $#lines) {
        if ($lines[$i] =~ s/<!-- end content -->//) {
            $ending = $i;
        }
        $i++;
    }

    unless ($beginning && $ending) {
        print "content markers not found in $File::Find::name\n";
        return;
    }

    # replace: cut everything after the end marker and everything up to the
    # start marker, putting the title on the first line
    splice(@lines, $ending + 1, $#lines - $ending, "");
    splice(@lines, 0, $beginning + 1, "$title\n");
    # blank the line before the end-marker line; per the original note it
    # "will be appended separately" (presumably by another script - verify).
    # Guarded so that an empty article does not lose its title line.
    splice(@lines, $#lines - 2, 1, "") if $#lines - 2 > 0;

    $i = 0;
    while ($i <= $#lines) {
        #a *very* dirty way to get rid of unicode chars in URLs
        $lines[$i] =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;

        #needed for proper uppercase detection
        $lines[$i] = decode_utf8($lines[$i]);

        #removing double spaces
        $lines[$i] =~ s/[ ]{2,}//g;

        #removing tabs
        $lines[$i] =~ s/\t//g;

        #removing empty lines
        $lines[$i] =~ s/^\n$//g;

        #removing the comment block on the end of some articles
        if ($lines[$i] =~ /^<!-- $/) {
            # the closing line is expected five lines further down; do not
            # look past the end of the array
            if ($i + 5 <= $#lines and $lines[$i+5] =~ /^-->$/) {
                splice(@lines, $i, 6, "");
            }
        }

        #removing editsection links
        $lines[$i] =~ s/<span class=\"editsection\">.+?<\/span> //g;

        #converting tex images to tex inside of code tags
        $lines[$i] =~ s/<img class=\"tex\" alt=\"(.*?)\".*?\/>/<code>$1<\/code>/g;

        #delete all title attributes
        $lines[$i] =~ s/ title=\".*?\"//g;

        #remove rel attributes
        $lines[$i] =~ s/ rel=\"nofollow\"//g;

        #if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end which have to be removed
        $lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?\p{Lu}+[^\"]*?)_[a-f0-9]{4}\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;

        #all remaining links only need to be cleaned up
        $lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?)\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;

        #delete all thumbnail boxes (the box is assumed to span 7 lines)
        if ($lines[$i] =~ /<div class=\"thumb t(right|left)\">/) {
            splice(@lines, $i, 7, "");
        }

        #delete all spans
        $lines[$i] =~ s/<span .*?>(.*?)<\/span>/$1/g;

        #delete all class and style attr.
        $lines[$i] =~ s/ class=\".*?\"//g;
        $lines[$i] =~ s/ style=\".*?\"//g;

        #delete all remaining images
        $lines[$i] =~ s/<a .*?><img .*?\/><\/a>//g;
        $lines[$i] =~ s/<img .*?\/>//g;

        $lines[$i] = encode_utf8($lines[$i]);
        $i++;
    }

    # save the changes
    my $out;
    unless (open($out, ">", $_)) {
        print "can't write to $File::Find::name\n";
        return;
    }
    print {$out} @lines;
    close($out);
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
40
remove_files.sh
Executable file
40
remove_files.sh
Executable file
|
@ -0,0 +1,40 @@
|
||||||
|
#!/bin/bash
|
||||||
|
|
||||||
|
# Delete all dump files that belong to the namespaces listed below.
#
# Usage: remove_files.sh [DUMP_DIR]   (DUMP_DIR defaults to "de")
#
# Each namespace is removed with one `find -name "<Namespace>~*" -delete` run
# over the dump directory. The list holds the German namespace names; edit
# it for other languages.
#
# Disabled in the original script (these files are kept):
#   Kategorie, Spezial, Wikipedia
dump_dir="${1:-de}"

namespaces=(
    "Kategorie_Diskussion"
    "Bild"
    "Bild_Diskussion"
    "Portal"
    "Portal_Diskussion"
    "Diskussion"
    "Vorlage"
    "Vorlage_Diskussion"
    "Benutzer"
    "Benutzer_Diskussion"
    "Wikipedia_Diskussion"
    "MediaWiki"
    "MediaWiki_Diskussion"
    "Hilfe"
    "Hilfe_Diskussion"
    "WP"
)

for ns in "${namespaces[@]}"; do
    # quoted so that the shell never expands the "*" in the message
    echo "lösche \"${ns}~*\""
    find "${dump_dir}/" -type f -name "${ns}~*" -delete
done
|
24
rename.pl
Executable file
24
rename.pl
Executable file
|
@ -0,0 +1,24 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
use File::Find;
|
||||||
|
use Encode;
|
||||||
|
|
||||||
|
# Rename every article file of the dump to its plain article name.
#
# Usage: rename.pl [DUMP_DIR]   (DUMP_DIR defaults to "de")
find(\&filehandler, @ARGV ? $ARGV[0] : "de");

# File::Find callback. Strips ".html" and, for names with an uppercase letter
# beyond the first character, the "_xxxx" hex suffix in front of it, then
# renames the file. The patterns are anchored exactly like the ones in
# create_linkindex.pl so that links.list entries and file names agree.
sub filehandler {
    return unless /\.html$/ and -f $_;

    # Work on a copy: File::Find's $_ must not be modified by the callback.
    # Decoded so that \p{Lu} also detects non-ASCII uppercase letters.
    my $new = decode_utf8($_);

    unless ($new =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/
            or $new =~ s/^(.+?)\.html$/$1/) {
        print "couldn't find filename pattern in $File::Find::name\n";
        return;
    }
    $new = encode_utf8($new);

    # rename() would silently replace an existing file of that name
    if (-e $new) {
        print "not renaming $File::Find::name: $File::Find::dir/$new already exists\n";
        return;
    }

    # File::Find has already chdir()'d into the file's directory, so relative
    # names are enough and $ENV{PWD} is not needed.
    rename($_, $new)
        or print "couldn't rename $File::Find::name: $!\n";
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
71
stats
Normal file
71
stats
Normal file
|
@ -0,0 +1,71 @@
|
||||||
|
==original p7z:
|
||||||
|
size: 2,124,093
|
||||||
|
dirs: 34125
|
||||||
|
files: 1724658
|
||||||
|
|
||||||
|
==entpackt:
|
||||||
|
size: 26,124,760
|
||||||
|
|
||||||
|
==löschung unnützer dateien:
|
||||||
|
size: 15,884,704
|
||||||
|
files: 1135661
|
||||||
|
|
||||||
|
**davon:
|
||||||
|
kaputt: 116
|
||||||
|
links: 454132
|
||||||
|
|
||||||
|
==hinzufügen der links.list files TODO: dateigröße nach unten
|
||||||
|
#create_linkindex.pl
|
||||||
|
size: 16,004,220
|
||||||
|
files: 1156229
|
||||||
|
|
||||||
|
==löschen der links:
|
||||||
|
#find de/ -size -2k -type f -name '*.html' -delete
|
||||||
|
size: 14,183,948
|
||||||
|
files: 701044
|
||||||
|
|
||||||
|
==extrahieren des inhalts
|
||||||
|
#extract_content.pl
|
||||||
|
size: 8575264
|
||||||
|
|
||||||
|
==whitespaces und kommentare entfernen
|
||||||
|
#clean_whitespaces_comments.pl
|
||||||
|
size: 8509992
|
||||||
|
|
||||||
|
==editsection einträge entfernen
|
||||||
|
#clean_edits.pl
|
||||||
|
size: 8197228
|
||||||
|
|
||||||
|
==title attribute entfernen
|
||||||
|
#clean_titles.pl
|
||||||
|
size: 7400248
|
||||||
|
|
||||||
|
==tex images umwandeln
|
||||||
|
#clean_tex.pl
|
||||||
|
size: 7395216
|
||||||
|
|
||||||
|
==links umwandeln TODO: mit clean titles zusammenlegen
|
||||||
|
#clean_links.pl
|
||||||
|
size: 6774260
|
||||||
|
|
||||||
|
==thumbnail boxen löschen
|
||||||
|
#clean_thumbnails.pl
|
||||||
|
size: 6515720
|
||||||
|
|
||||||
|
==spans, class attr, style attr. löschen
|
||||||
|
#clean_css_markup.pl
|
||||||
|
size: 5995296
|
||||||
|
|
||||||
|
==bilder löschen
|
||||||
|
#clean_images.pl
|
||||||
|
size: 5730456
|
||||||
|
|
||||||
|
==alle umbenennen
|
||||||
|
find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?[[:upper:]]+.*?)(_[a-f0-9]{4})\.html$/$1$2/' --
|
||||||
|
|
||||||
|
find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?)\.html$/$1$2/' --
|
||||||
|
|
||||||
|
|
||||||
|
in april: 664444 items, totalling 3.5 GB => 1GB
|
||||||
|
701044 => 1.1GB
|
||||||
|
|
7
test.pl
Normal file
7
test.pl
Normal file
|
@ -0,0 +1,7 @@
|
||||||
|
#!/usr/bin/perl -w
|
||||||
|
|
||||||
|
# Scratch check for the percent-decoding substitution: every %xx escape
# (lowercase hex digits) is replaced by the character it encodes and the
# decoded string is printed without a trailing newline.
my $encoded = "%6d%69%74%73%75%68%69%6b%6f%40%75%62%75%6e%74%75%2e%63%6f%6d";

(my $decoded = $encoded) =~ s/%([0-9a-f]{2})/chr(hex($1))/eg;

print $decoded;
|
9
time
Normal file
9
time
Normal file
|
@ -0,0 +1,9 @@
|
||||||
|
remove 11m
|
||||||
|
extract 110m
|
||||||
|
remove_files 22m
|
||||||
|
create_links -
|
||||||
|
delete_links 12m
|
||||||
|
remove_everyth 118m
|
||||||
|
remove_catlink 31m
|
||||||
|
rename 11m
|
||||||
|
mksquashfs 62m
|
Loading…
Reference in a new issue