initial commit

josch 2014-06-21 15:45:04 +02:00
commit dd2a79722f
11 changed files with 514 additions and 0 deletions

31
create_linkindex.pl Executable file

@@ -0,0 +1,31 @@
#!/usr/bin/perl -w
# Build an index of redirect targets: walk static.bak and, for every small
# HTML file whose fifth line is a meta Refresh, record "filename target".
use strict;
use File::Find;

my $folder = $ENV{PWD}."/static.bak";
my $links = '';
find(\&filehandler, $folder);

sub filehandler {
    # redirect stubs are tiny; skip everything else
    if (-f $_ && -s $_ < 2048) {
        open(my $fh, '<', $_) or return;
        my @lines = <$fh>;
        close($fh);
        if ($#lines < 4) {
            #print "file too small: $File::Find::name\n";
        } elsif (my ($href) = $lines[4] =~ /<meta http-equiv=\"Refresh\" content=\"0;url=([^"]+)\" \/>/) {
            $href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg; # clean uri: decode percent-escapes
            $links .= "$_ $href\n";
        } else {
            print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
        }
    }
}

open(my $list, '>', "$folder/links.list") or die "Could not write $folder/links.list: $!\n";
print $list $links;
close($list);

11
filecoun.pl Normal file

@@ -0,0 +1,11 @@
#!/usr/bin/perl -w
# Delete every file inside the given directory.
use strict;
use warnings;
my $dir = shift;
die "Usage: $0 directory\n" unless defined $dir;
opendir my $dh, $dir or die "Could not open $dir: $!\n";
while (my $file = readdir $dh) {
    next if $file eq '.' or $file eq '..'; # skip the directory entries themselves
    unlink "$dir/$file" or print "Could not remove $dir/$file: $!\n";
}
closedir $dh;

55
mediawikipatch.diff Normal file

@@ -0,0 +1,55 @@
Index: maintenance/dumpHTML.inc
===================================================================
--- maintenance/dumpHTML.inc (revision 29586)
+++ maintenance/dumpHTML.inc (working copy)
@@ -24,7 +24,7 @@
var $interwiki = true;
# Depth of HTML directory tree
- var $depth = 3;
+ var $depth = 0;
# Directory that commons images are copied into
var $sharedStaticDirectory;
@@ -835,30 +835,29 @@
return 'index.html';
}
- return $this->getHashedDirectory( $title ) . '/' .
- $this->getFriendlyName( $dbkey ) . '.html';
+ return $this->getFriendlyName( $dbkey );
}
function getFriendlyName( $name ) {
global $wgLang;
# Replace illegal characters for Windows paths with underscores
- $friendlyName = strtr( $name, '/\\*?"<>|~', '_________' );
+ $friendlyName = str_replace( '/', '_', $name );
# Work out lower case form. We assume we're on a system with case-insensitive
# filenames, so unless the case is of a special form, we have to disambiguate
- if ( function_exists( 'mb_strtolower' ) ) {
- $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) );
- } else {
- $lowerCase = ucfirst( strtolower( $name ) );
- }
+ #if ( function_exists( 'mb_strtolower' ) ) {
+ # $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) );
+ #} else {
+ # $lowerCase = ucfirst( strtolower( $name ) );
+ #}
# Make it mostly unique
- if ( $lowerCase != $friendlyName ) {
- $friendlyName .= '_' . substr(md5( $name ), 0, 4);
- }
+ #if ( $lowerCase != $friendlyName ) {
+ # $friendlyName .= '_' . substr(md5( $name ), 0, 4);
+ #}
# Handle colon specially by replacing it with tilde
# Thus we reduce the number of paths with hashes appended
- $friendlyName = str_replace( ':', '~', $friendlyName );
+ #$friendlyName = str_replace( ':', '~', $friendlyName );
return $friendlyName;
}

19
mokopedia.pl Normal file

@@ -0,0 +1,19 @@
#!/usr/bin/perl -w
# Minimal test HTTP server: answer every request on port 7070 with "j0!".
use strict;
use warnings;
use IO::Socket;

my $sock = IO::Socket::INET->new(
    LocalHost => 'localhost',
    LocalPort => '7070',
    Proto     => 'tcp',
    Listen    => 1,
    Reuse     => 1,
) or die "Could not create socket: $!\n";

while (my $new_sock = $sock->accept()) {
    my $get = <$new_sock>; # first request line, e.g. "GET / HTTP/1.1"
    print $get;
    print $new_sock "HTTP/1.1 200 OK\r\n\r\n";
    print $new_sock "<b>j0!</b>";
    close $new_sock;
}
close $sock;

141
mokopedia_logo.svg Normal file

File diff suppressed because one or more lines are too long (SVG image, 42 KiB)

23
remove_everything.pl Executable file

@@ -0,0 +1,23 @@
#!/usr/bin/perl -w
# Run every dumped HTML page (anything larger than the 2 KiB redirect stubs)
# through transform.xslt and print the extracted title plus the stripped body.
use File::Find;
$transform = $ENV{PWD}."/transform.xslt";
$folder = $ENV{PWD}."/";
find(\&filehandler, $folder);
sub filehandler {
    if (-f $_ && -s $_ > 2048) {
        $doc = qx/xsltproc --html \Q$transform\E \Q$_\E/;
        $doc =~ s/^[^\n]*\n//;            # drop the XML declaration emitted by xsltproc
        ($title) = $doc =~ /^([^\n]*)\n/; # next line is the page title
        $doc =~ s/^[^\n]*\n//;            # drop the title line from the body
        $doc =~ s/\n//g;                  # collapse the body onto one line
        $doc =~ s/\s{2,}//g;              # strip runs of whitespace
        $doc =~ s/<!--.*?-->//g;          # strip HTML comments
        #open FILE, ">$_";
        print "$title\n$doc";
        #close FILE;
    }
}

40
remove_files.sh Executable file

@@ -0,0 +1,40 @@
#!/bin/bash
# Delete the dumped pages of all namespaces we do not want to ship.
# The patterns are left unquoted so that the shell expands the glob.
#echo deleting \"Kategorie~*\"
#rm Kategorie~*
echo deleting \"Kategorie_Diskussion~*\"
rm Kategorie_Diskussion~*
echo deleting \"Bild~*\"
rm Bild~*
echo deleting \"Bild_Diskussion~*\"
rm Bild_Diskussion~*
echo deleting \"Portal~*\"
rm Portal~*
echo deleting \"Portal_Diskussion~*\"
rm Portal_Diskussion~*
echo deleting \"Diskussion~*\"
rm Diskussion~*
echo deleting \"Vorlage~*\"
rm Vorlage~*
echo deleting \"Vorlage_Diskussion~*\"
rm Vorlage_Diskussion~*
echo deleting \"Benutzer~*\"
rm Benutzer~*
echo deleting \"Benutzer_Diskussion~*\"
rm Benutzer_Diskussion~*
#echo deleting \"Spezial~*\"
#rm Spezial~*
#echo deleting \"Wikipedia~*\"
#rm Wikipedia~*
echo deleting \"Wikipedia_Diskussion~*\"
rm Wikipedia_Diskussion~*
echo deleting \"MediaWiki~*\"
rm MediaWiki~*
echo deleting \"MediaWiki_Diskussion~*\"
rm MediaWiki_Diskussion~*
echo deleting \"Hilfe~*\"
rm Hilfe~*
echo deleting \"Hilfe_Diskussion~*\"
rm Hilfe_Diskussion~*
echo deleting \"WP~*\"
rm WP~*

11
test.pl Normal file

@@ -0,0 +1,11 @@
#!/usr/bin/perl -w
# Scratch test: grep html.lst.3 for the marker string "aaaaaaaa".
open FILE, "html.lst.3" or die "Could not open html.lst.3: $!\n";
while ($line = <FILE>) {
    print $line if $line =~ /aaaaaaaa/;
}
close FILE;
# Shell one-liner (kept for reference) that filters unwanted namespaces out of html.lst:
# cat html.lst | perl -ne "print unless /\/.{1,9}\/.{1,9}\/.{1,9}\/(Category_Discussion|Image|Image_Discussion|Portal|Portal_Discussion|Diskussion|Template|Template_Discussion|User|User_Discussion|Wikipedia_Discussion|MediaWiki|MediaWiki_Diskussion|Help|Help_Discussion|WP)~/;" - > html.lst.1

63
timings Normal file

@@ -0,0 +1,63 @@
- download the database dump
wget http://download.wikimedia.org/enwiki/20080103/enwiki-20080103-pages-articles.xml.bz2
- install a full LAMP environment with apache2, php5 and mysql5
- check out mediawiki to /var/www/
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/phase3 /var/www
- remove the extensions dir from the checkout
- check out the extensions to /var/www/extensions
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions /var/www/extensions
- patch mediawiki with mediawikipatch.diff
patch -p0 < mediawikipatch.diff
- configure /etc/apache2/sites-enabled/000-default so that mediawiki loads when you access http://localhost
- go to http://localhost and finish the mediawiki install
- add the ParserFunctions extension by adding
require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" );
to LocalSettings.php
- download xml2sql from <url to be added>, then ./configure and make it
- convert the xml to sql
time cat enwiki.xml | ./xml2sql-0.5/xml2sql -o sqldump/ -v -m
- import it into the sql database
time mysql -f -u root -p mediawiki < sqldump/page.sql
time mysql -f -u root -p mediawiki < sqldump/revision.sql
time mysql -f -u root -p mediawiki < sqldump/text.sql
- dump everything to static HTML (a consolidated sketch of these steps follows after these notes)
time php maintenance/dumpHTML.php -s <startid> -e <endid>
- for math/TeX rendering: install the build dependencies, build the math extension and enable TeX in LocalSettings.php
apt-get install ocaml imagemagick gs cjk-latex tetex-extra php4-imagick binutils gcc
cd math/ && make
$wgUseTeX = true;
- further scratch notes:
putcolumn(&rev_tbl, "NULL", 0);
putcolumn(&rev_tbl, "NULL", 0);
require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" );
downloading: 40m32.418s
extracting: 7m7.119s
xml2sql: 15m12.989s
time cat ndswiki-20080109-pages-articles.xml | ./xml2sql-0.5/xml2sql -o sqldump.nds/ -v -m
insert: 119m35.751s 135m12.662s 283m11.183s
time mysql -f -u root -p mediawiki < sqldump/page.sql
dumpHTML:
time php maintenance/dumpHTML.php
removefiles:
transform:
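
The notes above list each command on its own; as a rough consolidation, the non-interactive part of the pipeline could look like the sketch below. This is only a sketch under assumptions: the paths, the database name "mediawiki" and the page id range are placeholders, and the Apache vhost, the web installer and the LocalSettings.php edits still have to be done by hand.

#!/bin/bash
# Sketch only: consolidates the steps noted above. Paths, the database name
# "mediawiki" and the id range below are assumptions, not taken from this repo.
set -e

# fetch and unpack the dump
wget http://download.wikimedia.org/enwiki/20080103/enwiki-20080103-pages-articles.xml.bz2
bunzip2 -k enwiki-20080103-pages-articles.xml.bz2

# check out mediawiki plus extensions and apply the local patch
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/phase3 /var/www
rm -rf /var/www/extensions
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions /var/www/extensions
patch -d /var/www -p0 < mediawikipatch.diff

# convert the XML dump to SQL and import page, revision and text
mkdir -p sqldump
cat enwiki-20080103-pages-articles.xml | ./xml2sql-0.5/xml2sql -o sqldump/ -v -m
for table in page revision text; do
    mysql -f -u root -p mediawiki < sqldump/$table.sql
done

# render the static HTML tree (the id range is a placeholder)
(cd /var/www && php maintenance/dumpHTML.php -s 1 -e 1000000)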

50
transform.xslt Normal file

@@ -0,0 +1,50 @@
<?xml version="1.0" encoding="UTF-8" ?>
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
>
  <xsl:template match="/">
    <xsl:apply-templates select="/html/body/div/div/div/div[@id='bodyContent']" />
  </xsl:template>
  <xsl:template match="/html/body/div/div/div/div[@id='bodyContent']"> <!-- extract div -->
    <xsl:value-of select="/html/body/div/div/div/h1[@class='firstHeading']" /> <!-- insert heading -->
    <xsl:apply-templates/>
  </xsl:template>
  <xsl:template select="/html/body/div/div/div/div[@id='bodyContent']"> <!-- delete parent div but preserve content -->
    <xsl:apply-templates/>
  </xsl:template>
  <xsl:template match="h3[@id='siteSub']" /> <!-- delete siteSub heading -->
  <xsl:template match="span[@id='editsection']" /> <!-- delete editsection -->
  <xsl:template match="span"> <!-- delete span tag but preserve content -->
    <xsl:apply-templates />
  </xsl:template>
  <!-- replace tex formula images with their alt attribute -->
  <xsl:template match="img[@class='tex']">
    <code><xsl:value-of select="./@alt" /></code>
  </xsl:template>
  <xsl:template match="script" /> <!-- delete script nodes -->
  <xsl:template match="img" /> <!-- delete img nodes -->
  <!-- delete thumbnail boxes -->
  <xsl:template match="div[@class='thumb tright']" />
  <xsl:template match="div[@class='thumb tleft']" />
  <xsl:template match="div[@class='printfooter']" /> <!-- delete footer note -->
  <xsl:template match="@class" /> <!-- delete class attributes -->
  <xsl:template match="@style" /> <!-- delete style attributes -->
  <xsl:template match="@title" /> <!-- delete title attributes -->
  <xsl:template match="@rel" /> <!-- delete rel attributes -->
  <xsl:template match="@*|node()"> <!-- identity template: copy everything else -->
    <xsl:copy>
      <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
  </xsl:template>
</xsl:stylesheet>

70
wokopedia.svg Normal file

File diff suppressed because one or more lines are too long (SVG image, 29 KiB)