initial commit
This commit is contained in:
commit
2329e4a4df
11 changed files with 459 additions and 0 deletions
44
README
Normal file
44
README
Normal file
|
@ -0,0 +1,44 @@
|
|||
Be careful! What you do is only for testing and submitting bugs!
|
||||
Some important features may not be correctly implemented!
|
||||
|
||||
mail me: j [dot] schauer [at] email [dot] de
|
||||
|
||||
All code is released under GPLv3 or later
|
||||
|
||||
|
||||
1. install squashfs-tools
|
||||
on ubuntu you do this via
|
||||
sudo apt-get install squashfs-tools
|
||||
|
||||
2. Obtain a Wikipedia HTML dump from static.wikipedia.org
|
||||
be careful! the August 2007 version of the English Wikipedia is 120GB extracted!!
|
||||
|
||||
3. extract it to the folder where your scripts are located
|
||||
7zr x wikipedia-de-html.7z
|
||||
|
||||
your directory structure for e.g. the German Wikipedia dump (26GB extracted) should look like this:
|
||||
|
||||
de/ <= all your extracted files
|
||||
README <= this file
|
||||
remove_files.sh <= bash script for removing unnecessary files like talk pages
|
||||
create_linkindex.pl <= perl script for creating a links.list in each directory
|
||||
remove_everything.pl <= clean up all remaining junk in the dump files
|
||||
rename.pl <= rename everything
|
||||
|
||||
4. edit all scripts for your language!
|
||||
|
||||
heavy editing is to be done on remove_files.sh
|
||||
in the perl scripts you have to change this line according to your language folder:
|
||||
find(\&filehandler, "de");
|
||||
|
||||
5. delete unnecessary files and dirs in your dump
|
||||
rm -r de/COPYING.html de/index.html de/skins/ de/raw/ de/images/ de/upload/
|
||||
6. run remove_files.sh
|
||||
7. run create_linkindex.pl
|
||||
8. now delete all links with
|
||||
find de/ -size -2k -type f -name '*.html' -delete
|
||||
9. run remove_everything.pl
|
||||
10. run rename.pl
|
||||
11. create the image file with
|
||||
mksquashfs de/ your_image_name
|
||||
12. test it and drop me a line if sth. went wrong!
|
47
create_linkindex.pl
Executable file
47
create_linkindex.pl
Executable file
|
@ -0,0 +1,47 @@
|
|||
#!/usr/bin/perl -w

# Walks the dump below "de" and records every redirect page in a "links.list"
# file inside the directory that contains it, one "<page> <target>" pair per
# line. A redirect page is an .html file smaller than 2048 bytes whose fifth
# line is a <meta http-equiv="Refresh"> tag.

use strict;
use File::Find;
use Encode;

find(\&filehandler, "de");

# Turns the UTF-8 encoded file name of a dumped page into the bare page name:
# drops the ".html" extension and, where present, the "_xxxx" hex suffix.
sub friendly_name {
    my ($name) = @_;

    #needed for proper uppercase detection
    $name = decode_utf8($name);

    #if a name has an uppercase letter beyond the first letter it has 4 hex digits on the end which have to be removed
    unless($name =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/) {
        #if this did not match it's all lowercase and has no hex to be removed
        $name =~ s/^(.+?)\.html$/$1/;
    }

    return encode_utf8($name);
}

# File::Find callback: $_ is the file name, the current directory is
# $File::Find::dir. Appends one line to links.list for every redirect page.
sub filehandler {
    return unless(/\.html$/ and (-s $_ || 0) < 2048);

    my $in;
    unless(open($in, '<', $_)) {
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    #the refresh tag is expected on the fifth line
    if($#lines < 4) {
        print "file too small: $File::Find::name\n";
        return;
    }

    my ($href) = $lines[4] =~ /<meta http-equiv=\"Refresh\" content=\"0;url=..\/..\/..\/.{1,9}\/.{1,9}\/.{1,9}\/(.*?\.html)\" \/>/;
    unless(defined $href) {
        print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
        return;
    }

    $href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; #clean uri
    $href = friendly_name($href);
    my $name = friendly_name($_);

    #File::Find has already changed into the directory of the current file,
    #so the index can be opened by its bare name
    my $list;
    unless(open($list, '>>', "links.list")) {
        print "can't write to $File::Find::dir/links.list: $!\n";
        return;
    }
    print $list "$name $href\n";
    close($list);
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
BIN
mksquashfs
Executable file
BIN
mksquashfs
Executable file
Binary file not shown.
78
mokopedia.py
Normal file
78
mokopedia.py
Normal file
|
@ -0,0 +1,78 @@
|
|||
#!/usr/bin/python
|
||||
|
||||
import gtk
|
||||
import bz2
|
||||
import gtkhtml2
|
||||
import time
|
||||
|
||||
class Mokopedia:
    """Main window: a search bar on top of an HTML view showing one article."""

    def delete_event(self, widget, event, data=None):
        """Stop the GTK main loop when the window is closed."""
        gtk.main_quit()
        return False

    def __init__(self):
        """Build the window, load the start article and show everything."""
        # Created per instance, not at class-definition time, so that merely
        # importing the module does not create GTK objects.
        self.document = gtkhtml2.Document()

        self.window = gtk.Window(gtk.WINDOW_TOPLEVEL)
        self.window.set_title("Mokopedia")
        self.window.connect("delete_event", self.delete_event)
        self.window.set_border_width(0)
        self.window.set_default_size(480, 640)

        main_box = gtk.VBox()

        # Search bar: the entry takes the free space, the button its natural size.
        search_box = gtk.HBox()
        search_entry = gtk.Entry()
        search_box.add(search_entry)

        search_btn = gtk.Button("Search")
        search_btn.connect("clicked", self.search)
        search_box.pack_start(search_btn, False, False, 0)

        # TODO: connect the document's 'request_url' and 'link_clicked'
        # signals once handlers for them exist.

        self.displayarticle("Alexander_the_Great")

        view = gtkhtml2.View()
        view.set_document(self.document)

        sw = gtk.ScrolledWindow()
        sw.set_policy(gtk.POLICY_AUTOMATIC, gtk.POLICY_ALWAYS)
        sw.add(view)

        main_box.pack_start(search_box, False, False, 0)
        main_box.add(sw)

        self.window.add(main_box)
        self.window.show_all()

    def search(self, widget):
        """Handle a click on the search button (placeholder, not implemented)."""
        print("blubber")

    def displayarticle(self, title):
        """Replace the document content with the article named *title*.

        The article is read from the file "<title>.html" in the current
        working directory. The time needed to load it is printed to stdout.
        """
        before = time.time()

        # Read the file before touching the document so that a missing
        # article does not leave a half-open stream behind.
        # TODO: read bz2-compressed articles (title + ".html.bz2") instead.
        f = open(title + ".html")
        try:
            html = f.read()
        finally:
            f.close()

        self.document.clear()
        self.document.open_stream('text/html')
        self.document.write_stream(html)
        print(time.time() - before)
        self.document.close_stream()
|
||||
|
||||
|
||||
def main():
    """Run the GTK main loop while holding the GDK lock."""
    gtk.gdk.threads_init()
    gtk.gdk.threads_enter()
    try:
        gtk.main()
    finally:
        # release the lock even when the main loop ends with an exception
        gtk.gdk.threads_leave()
|
||||
|
||||
|
||||
if __name__ == '__main__':
    # Bind the instance to its own name instead of overwriting the class;
    # the reference keeps the window alive while the main loop runs.
    app = Mokopedia()
    main()
|
24
remove_catlinks.pl
Executable file
24
remove_catlinks.pl
Executable file
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/perl -w

# Walks the dump below "de" and cuts everything from <div id="catlinks">
# onwards out of the last line of every file.

use strict;
use File::Find;
use Encode;

find(\&filehandler, "de");

# File::Find callback: $_ is the file name inside the current directory.
sub filehandler {
    #skip the links.list files so that they do not get destroyed
    return unless(-f $_ and $_ !~ /links\.list/);

    my $in;
    unless(open($in, '<', $_)) {
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    #an empty file has no last line that could be edited
    return unless(@lines);

    #Lazy... only the last line is looked at; if it holds no catlinks block
    #the file would be written back unchanged, so leave it alone
    return unless($lines[$#lines] =~ s/<div id=\"catlinks\">.*//);

    #save the changes
    my $out;
    unless(open($out, '>', $_)) {
        print "can't write to $File::Find::name\n";
        return;
    }
    print $out @lines;
    close($out);
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
115
remove_everything.pl
Executable file
115
remove_everything.pl
Executable file
|
@ -0,0 +1,115 @@
|
|||
#!/usr/bin/perl -w

# Walks the dump below "de" and reduces every .html file to its title plus the
# part between the "<!-- start content -->" and "<!-- end content -->" markers,
# then strips markup that is not needed (edit links, title/class/style
# attributes, thumbnails, images, spans) and shortens the article links.

use strict;
use File::Find;
use Encode;

find(\&filehandler, "de");

# File::Find callback: $_ is the file name inside the current directory.
# Files in which the title or one of the content markers cannot be found are
# reported and left untouched.
sub filehandler {
    #only .html files, so that no links.list files get destroyed
    return unless(/\.html$/);

    my $in;
    unless(open($in, '<', $_)) {
        print "can't read $File::Find::name: $!\n";
        return;
    }
    my @lines = <$in>;
    close($in);

    my $i = 0;
    my $title;
    my $beginning = 0;
    my $ending = 0;

    #find the title; tested with defined() so that a title of "0" is accepted
    while(!defined($title) && $i <= $#lines) {
        ($title) = $lines[$i] =~ /<h1.*?>(.+)<\/h1>$/;
        $i++;
    }

    unless(defined $title) {
        print "title not found in $File::Find::name\n";
        return;
    }

    #find the beginning (always behind the title line, so never index 0)
    while(!$beginning && $i <= $#lines) {
        if($lines[$i] =~ /<!-- start content -->/) {
            $beginning = $i;
        }
        $i++;
    }

    #find the end; the marker itself is removed from its line
    while(!$ending && $i <= $#lines) {
        if($lines[$i] =~ s/<!-- end content -->//) {
            $ending = $i;
        }
        $i++;
    }

    #without both markers the splices below would wipe out the whole file
    unless($beginning && $ending) {
        print "content markers not found in $File::Find::name\n";
        return;
    }

    #replace: drop everything behind the end line and everything up to the start line
    splice(@lines, $ending+1, $#lines-$ending, "");
    splice(@lines, 0, $beginning+1, "$title\n");
    #blank the line in front of the end line since it is going to be appended separately
    #NOTE(review): the step that appends it is not part of this script - confirm
    splice(@lines, $#lines-2, 1, "");

    $i = 0;
    while($i <= $#lines) {
        #a *very* dirty way to get rid of unicode chars in URLs
        $lines[$i] =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;

        #needed for proper uppercase detection
        $lines[$i] = decode_utf8($lines[$i]);

        #removing double spaces
        #NOTE(review): runs of spaces are removed completely, not collapsed to one - confirm this is intended
        $lines[$i] =~ s/[ ]{2,}//g;

        #removing tabs
        $lines[$i] =~ s/\t//g;

        #removing empty lines
        $lines[$i] =~ s/^\n$//g;

        #removing the comment block on the end of some articles
        if($lines[$i] =~ /^<!-- $/) {
            #the closing line is expected five lines further down, if the file is that long
            if($i+5 <= $#lines && $lines[$i+5] =~ /^-->$/) {
                splice(@lines, $i, 6, "");
            }
        }

        #removing editsection links
        $lines[$i] =~ s/<span class=\"editsection\">.+?<\/span> //g;

        #converting tex images to tex inside of code tags
        $lines[$i] =~ s/<img class=\"tex\" alt=\"(.*?)\".*?\/>/<code>$1<\/code>/g;

        #delete all title attributes
        $lines[$i] =~ s/ title=\".*?\"//g;

        #remove rel attributes
        $lines[$i] =~ s/ rel=\"nofollow\"//g;

        #if a link has an uppercase letter beyond the first letter it has 4 hex digits on the end which have to be removed
        $lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?\p{Lu}+[^\"]*?)_[a-f0-9]{4}\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;

        #all remaining links only need to be cleaned up
        $lines[$i] =~ s/<a href=\"\.\.\/\.\.\/\.\.\/.{1,9}\/.{1,9}\/.{1,9}\/([^\"]+?)\.html(#?[^\"]*?)\">/<a href=\"$1$2\">/g;

        #delete all thumbnail boxes (the opening line plus the six lines after it)
        if($lines[$i] =~ /<div class=\"thumb t(right|left)\">/) {
            splice(@lines, $i, 7, "");
        }

        #delete all spans
        $lines[$i] =~ s/<span .*?>(.*?)<\/span>/$1/g;

        #delete all class and style attr.
        $lines[$i] =~ s/ class=\".*?\"//g;
        $lines[$i] =~ s/ style=\".*?\"//g;

        #delete all remaining images
        $lines[$i] =~ s/<a .*?><img .*?\/><\/a>//g;
        $lines[$i] =~ s/<img .*?\/>//g;

        $lines[$i] = encode_utf8($lines[$i]);
        $i++;
    }

    #save the changes
    my $out;
    unless(open($out, '>', $_)) {
        print "can't write to $File::Find::name\n";
        return;
    }
    print $out @lines;
    close($out);
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
40
remove_files.sh
Executable file
40
remove_files.sh
Executable file
|
@ -0,0 +1,40 @@
|
|||
#!/bin/bash

# Deletes every file below de/ whose name starts with one of the namespace
# prefixes listed here, followed by "~" (talk pages, images, templates, ...).
# Adapt the list to your language; entries that are commented out are kept.
prefixes=(
    #Kategorie
    Kategorie_Diskussion
    Bild
    Bild_Diskussion
    Portal
    Portal_Diskussion
    Diskussion
    Vorlage
    Vorlage_Diskussion
    Benutzer
    Benutzer_Diskussion
    #Spezial
    #Wikipedia
    Wikipedia_Diskussion
    MediaWiki
    MediaWiki_Diskussion
    Hilfe
    Hilfe_Diskussion
    WP
)

for prefix in "${prefixes[@]}"; do
    # quoted so that the shell never expands the pattern itself
    echo "lösche \"${prefix}~*\""
    find de/ -type f -name "${prefix}~*" -delete
done
|
24
rename.pl
Executable file
24
rename.pl
Executable file
|
@ -0,0 +1,24 @@
|
|||
#!/usr/bin/perl -w

# Walks the dump below "de" and renames every .html file to its bare page
# name: the ".html" extension and, where present, the "_xxxx" hex suffix are
# dropped.

use strict;
use File::Find;
use Encode;

find(\&filehandler, "de");

# File::Find callback: $_ is the file name, the current directory is the one
# that contains the file, so the rename works on bare names.
sub filehandler {
    return unless(/\.html$/);

    #needed for proper uppercase detection
    my $new = decode_utf8($_);

    #a name with an uppercase letter beyond the first letter carries 4 hex
    #digits on the end; every other name only loses its extension
    unless($new =~ s/^(.+?\p{Lu}+.*?)_[a-f0-9]{4}\.html$/$1/
           or $new =~ s/^(.+?)\.html$/$1/) {
        print "couldn't find filename pattern in $File::Find::name\n";
        return;
    }
    $new = encode_utf8($new);

    #rename() would silently replace an existing file
    if(-e $new) {
        print "won't overwrite $File::Find::dir/$new with $File::Find::name\n";
        return;
    }

    rename($_, $new) or print "can't rename $File::Find::name: $!\n";
}

#DONE: correct renaming of the files - see mediawiki/trunk/phase3/maintenance/dumpHTML.inc -> function getFriendlyName
|
71
stats
Normal file
71
stats
Normal file
|
@ -0,0 +1,71 @@
|
|||
==original p7z:
|
||||
size: 2,124,093
|
||||
dirs: 34125
|
||||
files: 1724658
|
||||
|
||||
==entpackt:
|
||||
size: 26,124,760
|
||||
|
||||
==löschung unnützer dateien:
|
||||
size: 15,884,704
|
||||
files: 1135661
|
||||
|
||||
**davon:
|
||||
kaputt: 116
|
||||
links: 454132
|
||||
|
||||
==hinzufügen der links.list files TODO: dateigröße nach unten
|
||||
#create_linkindex.pl
|
||||
size: 16,004,220
|
||||
files: 1156229
|
||||
|
||||
==löschen der links:
|
||||
#find de/ -size -2k -type f -name '*.html' -delete
|
||||
size: 14,183,948
|
||||
files: 701044
|
||||
|
||||
==extrahieren des inhalts
|
||||
#extract_content.pl
|
||||
size: 8575264
|
||||
|
||||
==whitespaces und kommentare entfernen
|
||||
#clean_whitespaces_comments.pl
|
||||
size: 8509992
|
||||
|
||||
==editsection einträge entfernen
|
||||
#clean_edits.pl
|
||||
size: 8197228
|
||||
|
||||
==title attribute entfernen
|
||||
#clean_titles.pl
|
||||
size: 7400248
|
||||
|
||||
==tex images umwandeln
|
||||
#clean_tex.pl
|
||||
size: 7395216
|
||||
|
||||
==links umwandeln TODO: mit clean titles zusammenlegen
|
||||
#clean_links.pl
|
||||
size: 6774260
|
||||
|
||||
==thumbnail boxen löschen
|
||||
#clean_thumbnails.pl
|
||||
size: 6515720
|
||||
|
||||
==spans, class attr, style attr. löschen
|
||||
#clean_css_markup.pl
|
||||
size: 5995296
|
||||
|
||||
==bilder löschen
|
||||
#clean_images.pl
|
||||
size: 5730456
|
||||
|
||||
==alle umbenennen
|
||||
find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?[[:upper:]]+.*?)(_[a-f0-9]{4})\.html$/$1$2/' --
|
||||
|
||||
find de/ -type f | rename 's/^(.+?\/.{1,2}\/.{1,2}\/.{1,2}\/)(.+?)\.html$/$1$2/' --
|
||||
|
||||
|
||||
in april: 664444 items, totalling 3.5 GB => 1GB
|
||||
701044 => 1.1GB
|
||||
|
7
test.pl
Normal file
7
test.pl
Normal file
|
@ -0,0 +1,7 @@
|
|||
#!/usr/bin/perl -w

# Quick check of the percent-decoding substitution: prints the decoded form
# of a percent-encoded string.

use strict;

my $test = "%6d%69%74%73%75%68%69%6b%6f%40%75%62%75%6e%74%75%2e%63%6f%6d";

#accept hex digits in either case
$test =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;

print "$test\n";
|
9
time
Normal file
9
time
Normal file
|
@ -0,0 +1,9 @@
|
|||
remove 11m
|
||||
extract 110m
|
||||
remove_files 22m
|
||||
create_links -
|
||||
delete_links 12m
|
||||
remove_everyth 118m
|
||||
remove_catlink 31m
|
||||
rename 11m
|
||||
mksquashfs 62m
|
Loading…
Reference in a new issue