initial commit
This commit is contained in:
commit
dd2a79722f
11 changed files with 514 additions and 0 deletions
31
create_linkindex.pl
Executable file
31
create_linkindex.pl
Executable file
|
@ -0,0 +1,31 @@
|
||||||
|
#!/usr/bin/perl -w
#
# Build an index of redirect stub pages.
#
# Walks "$ENV{PWD}/static.bak", looks at every regular file smaller than
# 2048 bytes and, when the fifth line of the file is a zero-delay
# <meta http-equiv="Refresh"> tag, records "<file name> <target url>" in
# "$ENV{PWD}/static.bak/links.list".

use strict;
use warnings;
use File::Find;

# Root of the tree to scan; the index file is written into this directory too.
my $folder = $ENV{PWD} . "/static.bak";

# Index text collected by filehandler(), one line per redirect stub.
my $links = '';

find(\&filehandler, $folder);

# File::Find callback.
#
# File::Find sets $_ to the name of the current entry (relative to the
# directory it has changed into) and $File::Find::name to its full path.
# Appends one line to $links for each redirect stub; prints a diagnostic to
# STDOUT for small files with at least five lines whose fifth line is not
# the expected meta refresh tag.
sub filehandler {
    # Only regular files below 2048 bytes are considered.
    return unless -f $_ and (-s _) < 2048;

    # Three-argument open: the name is data from the dump and must never be
    # interpreted as an open mode or a command.
    open my $fh, '<', $_ or do {
        warn "cannot read $File::Find::name: $!\n";
        return;
    };
    my @lines = <$fh>;
    close $fh;

    # The refresh tag is expected on line 5; shorter files are skipped
    # silently.
    return if $#lines < 4;

    if (my ($href) = $lines[4] =~ /<meta http-equiv=\"Refresh\" content=\"0;url=([^"]+)\" \/>/) {
        $href =~ s/%([0-9A-F]{2})/chr(hex($1))/eg;    # undo percent-encoding
        $links .= "$_ $href\n";
    }
    else {
        print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
    }
    return;
}

# Write the collected index. Failures are fatal: a silently missing or
# truncated index would be worse than no index.
my $list_path = "$folder/links.list";
open my $list, '>', $list_path or die "cannot write $list_path: $!\n";
print {$list} $links;
close $list or die "cannot finish writing $list_path: $!\n";
|
11
filecoun.pl
Normal file
11
filecoun.pl
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
#!/usr/bin/perl -w
#
# Remove every non-directory entry directly inside the directory given as
# the first command line argument. The directory itself and any
# subdirectories are left untouched.
#
# Usage: filecoun.pl directory

use strict;
use warnings;

my $dir = shift;
die "Usage: $0 directory" unless defined $dir;

opendir my $dh, $dir or die "Could not open $dir: $!\n";
while (defined(my $file = readdir $dh)) {
    my $path = "$dir/$file";

    # unlink cannot remove directories, and readdir always returns '.' and
    # '..'; skip them instead of reporting a failure for each one.
    # Symlinks that point at directories are still removed.
    next if -d $path and not -l $path;

    unlink $path or print "Could not remove $path: $! \n";
}
closedir $dh;
|
55
mediawikipatch.diff
Normal file
55
mediawikipatch.diff
Normal file
|
@ -0,0 +1,55 @@
|
||||||
|
Index: maintenance/dumpHTML.inc
|
||||||
|
===================================================================
|
||||||
|
--- maintenance/dumpHTML.inc (revision 29586)
|
||||||
|
+++ maintenance/dumpHTML.inc (working copy)
|
||||||
|
@@ -24,7 +24,7 @@
|
||||||
|
var $interwiki = true;
|
||||||
|
|
||||||
|
# Depth of HTML directory tree
|
||||||
|
- var $depth = 3;
|
||||||
|
+ var $depth = 0;
|
||||||
|
|
||||||
|
# Directory that commons images are copied into
|
||||||
|
var $sharedStaticDirectory;
|
||||||
|
@@ -835,30 +835,29 @@
|
||||||
|
return 'index.html';
|
||||||
|
}
|
||||||
|
|
||||||
|
- return $this->getHashedDirectory( $title ) . '/' .
|
||||||
|
- $this->getFriendlyName( $dbkey ) . '.html';
|
||||||
|
+ return $this->getFriendlyName( $dbkey );
|
||||||
|
}
|
||||||
|
|
||||||
|
function getFriendlyName( $name ) {
|
||||||
|
global $wgLang;
|
||||||
|
# Replace illegal characters for Windows paths with underscores
|
||||||
|
- $friendlyName = strtr( $name, '/\\*?"<>|~', '_________' );
|
||||||
|
+ $friendlyName = str_replace( '/', '_', $name );
|
||||||
|
|
||||||
|
# Work out lower case form. We assume we're on a system with case-insensitive
|
||||||
|
# filenames, so unless the case is of a special form, we have to disambiguate
|
||||||
|
- if ( function_exists( 'mb_strtolower' ) ) {
|
||||||
|
- $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) );
|
||||||
|
- } else {
|
||||||
|
- $lowerCase = ucfirst( strtolower( $name ) );
|
||||||
|
- }
|
||||||
|
+ #if ( function_exists( 'mb_strtolower' ) ) {
|
||||||
|
+ # $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) );
|
||||||
|
+ #} else {
|
||||||
|
+ # $lowerCase = ucfirst( strtolower( $name ) );
|
||||||
|
+ #}
|
||||||
|
|
||||||
|
# Make it mostly unique
|
||||||
|
- if ( $lowerCase != $friendlyName ) {
|
||||||
|
- $friendlyName .= '_' . substr(md5( $name ), 0, 4);
|
||||||
|
- }
|
||||||
|
+ #if ( $lowerCase != $friendlyName ) {
|
||||||
|
+ # $friendlyName .= '_' . substr(md5( $name ), 0, 4);
|
||||||
|
+ #}
|
||||||
|
# Handle colon specially by replacing it with tilde
|
||||||
|
# Thus we reduce the number of paths with hashes appended
|
||||||
|
- $friendlyName = str_replace( ':', '~', $friendlyName );
|
||||||
|
+ #$friendlyName = str_replace( ':', '~', $friendlyName );
|
||||||
|
|
||||||
|
return $friendlyName;
|
||||||
|
}
|
19
mokopedia.pl
Normal file
19
mokopedia.pl
Normal file
|
@ -0,0 +1,19 @@
|
||||||
|
# Minimal test HTTP responder.
#
# Listens on localhost:7070, echoes the first line of every incoming request
# to STDOUT and answers each connection with a fixed HTML snippet.

use strict;
use warnings;
use IO::Socket;

my $sock = IO::Socket::INET->new(
    LocalHost => 'localhost',
    LocalPort => '7070',
    Proto     => 'tcp',
    Listen    => 1,
    Reuse     => 1,
);
die "Could not create socket: $!\n" unless $sock;

# Connections are served one after another; each is closed after the reply.
while (my $new_sock = $sock->accept()) {
    # Only the first line of the request is read and logged.
    my $get = <$new_sock>;
    print $get if defined $get;    # a client may connect and send nothing

    # HTTP separates the status line and the header section with CRLF.
    print $new_sock "HTTP/1.1 200 OK\r\n\r\n";
    print $new_sock "<b>j0!</b>";
    close $new_sock;
}
close $sock;
|
141
mokopedia_logo.svg
Normal file
141
mokopedia_logo.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 42 KiB |
23
remove_everything.pl
Executable file
23
remove_everything.pl
Executable file
|
@ -0,0 +1,23 @@
|
||||||
|
#!/usr/bin/perl -w
#
# Run every regular file larger than 2048 bytes below the current directory
# through "xsltproc --html" with ./transform.xslt and print a compacted
# version of the result to STDOUT.

use strict;
use warnings;
use File::Find;

# Stylesheet handed to xsltproc, and the root of the tree to process.
my $transform = $ENV{PWD} . "/transform.xslt";
my $folder    = $ENV{PWD} . "/";

find(\&filehandler, $folder);

# File::Find callback.
#
# File::Find sets $_ to the name of the current entry (relative to the
# directory it has changed into). For each qualifying file this prints the
# second line of the xsltproc output (used as the title), a newline, and
# then the remaining output with newlines, whitespace runs and HTML comments
# removed.
sub filehandler {
    # Skip directories and anything that is not larger than 2048 bytes.
    return unless -f $_ and (-s _) > 2048;

    # List-form pipe open runs xsltproc without a shell, so no character in
    # a file name needs quoting. The "./" prefix keeps a name that starts
    # with "-" from being taken as an option.
    open my $xslt, '-|', 'xsltproc', '--html', $transform, "./$_" or do {
        warn "cannot run xsltproc on $File::Find::name: $!\n";
        return;
    };
    my $doc = do { local $/; <$xslt> };
    close $xslt
        or warn "xsltproc failed on $File::Find::name (status $?)\n";
    $doc = '' unless defined $doc;

    $doc =~ s/^[^\n]*\n//;                  # discard the first output line
    my ($title) = $doc =~ /^([^\n]*)\n/;    # the next line is the title
    $doc =~ s/^[^\n]*\n//;                  # ... and is removed from the body
    $doc =~ s/\n//g;                        # join everything on one line
    $doc =~ s/\s{2,}//g;                    # delete runs of 2+ whitespace chars
    $doc =~ s/<!--.*?-->//g;                # delete HTML comments
    $title = '' unless defined $title;      # transform produced no title line

    # Writing the result back over the input file is still disabled, as in
    # the original (open FILE, ">$_" ... close FILE were commented out).
    print "$title\n$doc";
    return;
}
|
40
remove_files.sh
Executable file
40
remove_files.sh
Executable file
|
@ -0,0 +1,40 @@
|
||||||
|
#!/bin/bash
#
# Delete all entries in the current directory whose names start with one of
# the namespace prefixes below followed by "~".
#
# The original quoted the whole pattern (rm "Bild~*"), which suppresses
# globbing and only ever targets a file literally named "Bild~*". The
# pattern is now matched by find, which also avoids the "argument list too
# long" limit when a prefix matches a very large number of files.

# Prefixes to delete. Kategorie, Spezial and Wikipedia were disabled
# (commented out) in the original list and are therefore not included.
prefixes=(
    Kategorie_Diskussion
    Bild
    Bild_Diskussion
    Portal
    Portal_Diskussion
    Diskussion
    Vorlage
    Vorlage_Diskussion
    Benutzer
    Benutzer_Diskussion
    Wikipedia_Diskussion
    MediaWiki
    MediaWiki_Diskussion
    Hilfe
    Hilfe_Diskussion
    WP
)

for prefix in "${prefixes[@]}"; do
    echo "lösche \"${prefix}~*\""
    # -maxdepth 1: current directory only; directories are never removed.
    find . -maxdepth 1 ! -type d -name "${prefix}~*" -delete
done
|
11
test.pl
Normal file
11
test.pl
Normal file
|
@ -0,0 +1,11 @@
|
||||||
|
# Print every line of html.lst.3 that contains the string "aaaaaaaa".
open my $list, '<', "html.lst.3" or die "cannot open html.lst.3: $!\n";
while (my $line = <$list>) {
    print $line if $line =~ /aaaaaaaa/;
}
close $list;
|
||||||
|
|
||||||
|
# Shell command: copy html.lst to html.lst.1, leaving out every line that matches the namespace pattern below.
perl -ne 'print unless /\/.{1,9}\/.{1,9}\/.{1,9}\/(Category_Discussion|Image|Image_Discussion|Portal|Portal_Discussion|Diskussion|Template|Template_Discussion|User|User_Discussion|Wikipedia_Discussion|MediaWiki|MediaWiki_Diskussion|Help|Help_Discussion|WP)~/;' - < html.lst > html.lst.1
|
||||||
|
|
63
timings
Normal file
63
timings
Normal file
|
@ -0,0 +1,63 @@
|
||||||
|
- download database dump
|
||||||
|
wget http://download.wikimedia.org/enwiki/20080103/enwiki-20080103-pages-articles.xml.bz2
|
||||||
|
|
||||||
|
- install a whole LAMP environment with apache2, php5 and mysql5
|
||||||
|
|
||||||
|
- checkout the mediawiki to /var/www/
|
||||||
|
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/phase3 /var/www
|
||||||
|
|
||||||
|
- remove the extensions dir from the checkout
|
||||||
|
|
||||||
|
- checkout the extensions to /var/www/extensions
|
||||||
|
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions /var/www/extensions
|
||||||
|
|
||||||
|
- patch mediawiki with mediawikipatch.diff
|
||||||
|
patch -p0 < mediawikipatch.diff
|
||||||
|
|
||||||
|
- configure /etc/apache2/sites-enabled/000-default so that mediawiki loads when you access localhost
|
||||||
|
|
||||||
|
- go to http://localhost and finish the mediawiki install
|
||||||
|
|
||||||
|
- add ParserFunctions Extension by adding
|
||||||
|
require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" );
|
||||||
|
to LocalSettings.php
|
||||||
|
|
||||||
|
- download xml2sql from <url to be added>, then run ./configure and make to build it
|
||||||
|
|
||||||
|
- convert the xml to sql
|
||||||
|
time cat enwiki.xml | ./xml2sql-0.5/xml2sql -o sqldump/ -v -m
|
||||||
|
|
||||||
|
- import into sql database
|
||||||
|
time mysql -f -u root -p mediawiki < sqldump/page.sql
|
||||||
|
time mysql -f -u root -p mediawiki < sqldump/revision.sql
|
||||||
|
time mysql -f -u root -p mediawiki < sqldump/text.sql
|
||||||
|
|
||||||
|
- dump everything
|
||||||
|
time php maintenance/dumpHTML.php -s <startid> -e <endid>
|
||||||
|
|
||||||
|
apt-get install ocaml imagemagick gs cjk-latex tetex-extra php4-imagick binutils gcc
|
||||||
|
cd math/ && make
|
||||||
|
$wgUseTeX = true;
|
||||||
|
|
||||||
|
putcolumn(&rev_tbl, "NULL", 0);
|
||||||
|
putcolumn(&rev_tbl, "NULL", 0);
|
||||||
|
|
||||||
|
require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" );
|
||||||
|
|
||||||
|
|
||||||
|
downloading: 40m32.418s
|
||||||
|
|
||||||
|
extracting: 7m7.119s
|
||||||
|
|
||||||
|
xml2sql: 15m12.989s
|
||||||
|
time cat ndswiki-20080109-pages-articles.xml | ./xml2sql-0.5/xml2sql -o sqldump.nds/ -v -m
|
||||||
|
|
||||||
|
insert: 119m35.751s 135m12.662s 283m11.183s
|
||||||
|
time mysql -f -u root -p mediawiki < sqldump/page.sql
|
||||||
|
|
||||||
|
dumpHTML:
|
||||||
|
time php maintenance/dumpHTML.php
|
||||||
|
|
||||||
|
removefiles:
|
||||||
|
transform:
|
||||||
|
|
50
transform.xslt
Normal file
50
transform.xslt
Normal file
|
@ -0,0 +1,50 @@
|
||||||
|
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Reduce an HTML page to its heading text plus the cleaned-up content of the
  div with id "bodyContent". Everything outside that div is dropped because
  the root rule only applies templates to it.
-->
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
>

<!-- Entry point: process only the bodyContent div. -->
<xsl:template match="/">
    <xsl:apply-templates select="/html/body/div/div/div/div[@id='bodyContent']" />
</xsl:template>

<!--
  bodyContent div: emit the text of the h1 with class "firstHeading", then
  the processed children. The div element itself is not copied, so the
  wrapper is dropped while its content is preserved.
-->
<xsl:template match="/html/body/div/div/div/div[@id='bodyContent']">
    <xsl:value-of select="/html/body/div/div/div/h1[@class='firstHeading']" />
    <xsl:apply-templates/>
</xsl:template>

<!--
  A second rule for the same div used to follow here, written with a
  "select" attribute. xsl:template only accepts "match" and/or "name", so
  that rule was not a valid template rule, and the rule above already
  removes the parent div while preserving its content. It has been removed.
-->

<!-- Delete the siteSub heading. -->
<xsl:template match="h3[@id='siteSub']" />

<!-- Delete the editsection span. -->
<xsl:template match="span[@id='editsection']" />

<!-- Delete every other span tag but preserve its content. -->
<xsl:template match="span">
    <xsl:apply-templates />
</xsl:template>

<!-- Replace TeX formula images with their alt attribute wrapped in <code>. -->
<xsl:template match="img[@class='tex']">
    <code><xsl:value-of select="./@alt" /></code>
</xsl:template>

<!-- Delete script nodes and all remaining img nodes. -->
<xsl:template match="script" />
<xsl:template match="img" />

<!-- Delete thumbnail boxes. -->
<xsl:template match="div[@class='thumb tright']" />
<xsl:template match="div[@class='thumb tleft']" />

<!-- Delete the footer note. -->
<xsl:template match="div[@class='printfooter']" />

<!-- Delete class, style, title and rel attributes. -->
<xsl:template match="@class" />
<xsl:template match="@style" />
<xsl:template match="@title" />
<xsl:template match="@rel" />

<!-- Identity rule: copy every other attribute and node unchanged. -->
<xsl:template match="@*|node()">
    <xsl:copy>
        <xsl:apply-templates select="@*|node()"/>
    </xsl:copy>
</xsl:template>

</xsl:stylesheet>
|
70
wokopedia.svg
Normal file
70
wokopedia.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 29 KiB |
Loading…
Reference in a new issue