From dd2a79722f314193880495ca83f2fc65c46a66db Mon Sep 17 00:00:00 2001 From: josch Date: Sat, 21 Jun 2014 15:45:04 +0200 Subject: [PATCH] initial commit --- create_linkindex.pl | 31 ++++++++++ filecoun.pl | 11 ++++ mediawikipatch.diff | 55 +++++++++++++++++ mokopedia.pl | 19 ++++++ mokopedia_logo.svg | 141 +++++++++++++++++++++++++++++++++++++++++++ remove_everything.pl | 23 +++++++ remove_files.sh | 40 ++++++++++++ test.pl | 11 ++++ timings | 63 +++++++++++++++++++ transform.xslt | 50 +++++++++++++++ wokopedia.svg | 70 +++++++++++++++++++++ 11 files changed, 514 insertions(+) create mode 100755 create_linkindex.pl create mode 100644 filecoun.pl create mode 100644 mediawikipatch.diff create mode 100644 mokopedia.pl create mode 100644 mokopedia_logo.svg create mode 100755 remove_everything.pl create mode 100755 remove_files.sh create mode 100644 test.pl create mode 100644 timings create mode 100644 transform.xslt create mode 100644 wokopedia.svg diff --git a/create_linkindex.pl b/create_linkindex.pl new file mode 100755 index 0000000..a1cd3ac --- /dev/null +++ b/create_linkindex.pl @@ -0,0 +1,31 @@ +#!/usr/bin/perl -w + +use File::Find; + +$folder=$ENV{PWD}."/static.bak"; + +$links = ''; + +find(\&filehandler, $folder); + +sub filehandler { + if(-s $_ < 2048) { + open(BLUB, $_); + @lines = ; + close(BLUB); + if($#lines < 4) { + #print "file too small: $File::Find::name\n"; + } else { + if(($href) = $lines[4] =~ //) { + $href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; #clean uri + $links .= "$_ $href\n"; + } else { + print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n"; + } + } + } +} + +open(LIST, ">$folder/links.list"); +print LIST $links; +close(LIST); diff --git a/filecoun.pl b/filecoun.pl new file mode 100644 index 0000000..1ae6640 --- /dev/null +++ b/filecoun.pl @@ -0,0 +1,11 @@ +#!/usr/bin/perl -w +use strict; +use warnings; +my $dir = shift; +die "Usage: $0 directory" unless defined $dir; +opendir DIR, "$dir" or die "Could not open $dir: $!\n"; +while(my $file = readdir DIR) +{ + unlink "$dir/$file" or print "Could not remove $dir/$file: $! \n"; +} +closedir DIR; diff --git a/mediawikipatch.diff b/mediawikipatch.diff new file mode 100644 index 0000000..6d2834b --- /dev/null +++ b/mediawikipatch.diff @@ -0,0 +1,55 @@ +Index: maintenance/dumpHTML.inc +=================================================================== +--- maintenance/dumpHTML.inc (revision 29586) ++++ maintenance/dumpHTML.inc (working copy) +@@ -24,7 +24,7 @@ + var $interwiki = true; + + # Depth of HTML directory tree +- var $depth = 3; ++ var $depth = 0; + + # Directory that commons images are copied into + var $sharedStaticDirectory; +@@ -835,30 +835,29 @@ + return 'index.html'; + } + +- return $this->getHashedDirectory( $title ) . '/' . +- $this->getFriendlyName( $dbkey ) . '.html'; ++ return $this->getFriendlyName( $dbkey ); + } + + function getFriendlyName( $name ) { + global $wgLang; + # Replace illegal characters for Windows paths with underscores +- $friendlyName = strtr( $name, '/\\*?"<>|~', '_________' ); ++ $friendlyName = str_replace( '/', '_', $name ); + + # Work out lower case form. We assume we're on a system with case-insensitive + # filenames, so unless the case is of a special form, we have to disambiguate +- if ( function_exists( 'mb_strtolower' ) ) { +- $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) ); +- } else { +- $lowerCase = ucfirst( strtolower( $name ) ); +- } ++ #if ( function_exists( 'mb_strtolower' ) ) { ++ # $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) ); ++ #} else { ++ # $lowerCase = ucfirst( strtolower( $name ) ); ++ #} + + # Make it mostly unique +- if ( $lowerCase != $friendlyName ) { +- $friendlyName .= '_' . substr(md5( $name ), 0, 4); +- } ++ #if ( $lowerCase != $friendlyName ) { ++ # $friendlyName .= '_' . substr(md5( $name ), 0, 4); ++ #} + # Handle colon specially by replacing it with tilde + # Thus we reduce the number of paths with hashes appended +- $friendlyName = str_replace( ':', '~', $friendlyName ); ++ #$friendlyName = str_replace( ':', '~', $friendlyName ); + + return $friendlyName; + } diff --git a/mokopedia.pl b/mokopedia.pl new file mode 100644 index 0000000..e90987e --- /dev/null +++ b/mokopedia.pl @@ -0,0 +1,19 @@ +use IO::Socket; +my $sock = new IO::Socket::INET ( + LocalHost => 'localhost', + LocalPort => '7070', + Proto => 'tcp', + Listen => 1, + Reuse => 1, +); +die "Could not create socket: $!\n" unless $sock; + +while($new_sock = $sock->accept()) +{ + $get = <$new_sock>; + print $get; + print $new_sock "HTTP/1.1 200 OK\n\n"; + print $new_sock "j0!"; + close $new_sock; +} +close $sock; diff --git a/mokopedia_logo.svg b/mokopedia_logo.svg new file mode 100644 index 0000000..ff4e249 --- /dev/null +++ b/mokopedia_logo.svg @@ -0,0 +1,141 @@ + + + + + + + + + + + + image/svg+xml + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/remove_everything.pl b/remove_everything.pl new file mode 100755 index 0000000..5adfdf6 --- /dev/null +++ b/remove_everything.pl @@ -0,0 +1,23 @@ +#!/usr/bin/perl -w + +use File::Find; + +$transform = $ENV{PWD}."/transform.xslt"; +$folder=$ENV{PWD}."/"; + +find(\&filehandler, $folder); + +sub filehandler { + if(-s $_ > 2048) { + $doc = qx/xsltproc --html \Q$transform\E \Q$_\E/; + $doc =~ s/^[^\n]*\n//; + ($title) = $doc =~ /^([^\n]*)\n/; + $doc =~ s/^[^\n]*\n//; + $doc =~ s/\n//g; + $doc =~ s/\s{2,}//g; + $doc =~ s///g; + #open FILE, ">$_"; + print "$title\n$doc"; + #close FILE; + } +} diff --git a/remove_files.sh b/remove_files.sh new file mode 100755 index 0000000..d1c57ff --- /dev/null +++ b/remove_files.sh @@ -0,0 +1,40 @@ +#!/bin/bash + +#echo lösche \"Kategorie~*\" +#rm "Kategorie~*" +echo lösche \"Kategorie_Diskussion~*\" +rm "Kategorie_Diskussion~*" +echo lösche \"Bild~*\" +rm "Bild~*" +echo lösche \"Bild_Diskussion~*\" +rm "Bild_Diskussion~*" +echo lösche \"Portal~*\" +rm "Portal~*" +echo lösche \"Portal_Diskussion~*\" +rm "Portal_Diskussion~*" +echo lösche \"Diskussion~*\" +rm "Diskussion~*" +echo lösche \"Vorlage~*\" +rm "Vorlage~*" +echo lösche \"Vorlage_Diskussion~*\" +rm "Vorlage_Diskussion~*" +echo lösche \"Benutzer~*\" +rm "Benutzer~*" +echo lösche \"Benutzer_Diskussion~*\" +rm "Benutzer_Diskussion~*" +#echo lösche \"Spezial~*\" +#rm "Spezial~*" +#echo lösche \"Wikipedia~*\" +#rm "Wikipedia~*" +echo lösche \"Wikipedia_Diskussion~*\" +rm "Wikipedia_Diskussion~*" +echo lösche \"MediaWiki~*\" +rm "MediaWiki~*" +echo lösche \"MediaWiki_Diskussion~*\" +rm "MediaWiki_Diskussion~*" +echo lösche \"Hilfe~*\" +rm "Hilfe~*" +echo lösche \"Hilfe_Diskussion~*\" +rm "Hilfe_Diskussion~*" +echo lösche \"WP~*\" +rm "WP~*" diff --git a/test.pl b/test.pl new file mode 100644 index 0000000..639fd1d --- /dev/null +++ b/test.pl @@ -0,0 +1,11 @@ +open FILE, "html.lst.3"; +while($line=) +{ + if ($line =~ /aaaaaaaa/) + { + print $line; + } +} + +cat html.lst | perl -ne "print unless /\/.{1,9}\/.{1,9}\/.{1,9}\/(Category_Discussion|Image|Image_Discussion|Portal|Portal_Discussion|Diskussion|Template|Template_Discussion|User|User_Discussion|Wikipedia_Discussion|MediaWiki|MediaWiki_Diskussion|Help|Help_Discussion|WP)~/;" - > html.lst.1 + diff --git a/timings b/timings new file mode 100644 index 0000000..413de66 --- /dev/null +++ b/timings @@ -0,0 +1,63 @@ + - download database dump + wget http://download.wikimedia.org/enwiki/20080103/enwiki-20080103-pages-articles.xml.bz2 + + - install a whole LAMP environment with apache2, php5 and mysql5 + + - checkout the mediawiki to /var/www/ + svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/phase3 /var/www + + - remove the extension dir from the checkout + + - checkout the extensions to /var/www/extensions + svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions /var/www/extensions + + - patch mediawiki with mediawiki.diff + patch -p0 < mediawikipatch.diff + + - configure /etc/apache2/sites-enabled/000-default so that mediawiki loads when you access localhost + + - goto http://localhost and finish the mediawiki install + + - add ParserFunctions Extension by adding + require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" ); + to LocalSettings.php + + - download xml2sql from ./configure and make it + + - convert the xml to sql + time cat enwiki.xml | ./xml2sql-0.5/xml2sql -o sqldump/ -v -m + + - import into sql database + time mysql -f -u root -p mediawiki < sqldump/page.sql + time mysql -f -u root -p mediawiki < sqldump/revision.sql + time mysql -f -u root -p mediawiki < sqldump/text.sql + + - dump everything + time php maintenance/dumpHTML.php -s -e + +apt-get install ocaml imagemagick gs cjk-latex tetex-extra php4-imagick binutils gcc +cd math/ && make +$wgUseTeX = true; + + putcolumn(&rev_tbl, "NULL", 0); + putcolumn(&rev_tbl, "NULL", 0); + +require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" ); + + +downloading: 40m32.418s + +extracting: 7m7.119s + +xml2sql: 15m12.989s +time cat ndswiki-20080109-pages-articles.xml | ./xml2sql-0.5/xml2sql -o sqldump.nds/ -v -m + +insert: 119m35.751s 135m12.662s 283m11.183s +time mysql -f -u root -p mediawiki < sqldump/page.sql + +dumpHTML: +time php maintenance/dumpHTML.php + +removefiles: +transform: + diff --git a/transform.xslt b/transform.xslt new file mode 100644 index 0000000..1961a17 --- /dev/null +++ b/transform.xslt @@ -0,0 +1,50 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/wokopedia.svg b/wokopedia.svg new file mode 100644 index 0000000..a9fbe24 --- /dev/null +++ b/wokopedia.svg @@ -0,0 +1,70 @@ + + + + + + + + + + + image/svg+xml + + + + + + + +