initial commit
This commit is contained in:
commit
dd2a79722f
11 changed files with 514 additions and 0 deletions
31
create_linkindex.pl
Executable file
31
create_linkindex.pl
Executable file
|
@ -0,0 +1,31 @@
|
|||
#!/usr/bin/perl -w
#
# Build an index of redirect stubs found below ./static.bak.
#
# Every regular file smaller than 2048 bytes is read. If its fifth line is a
# <meta http-equiv="Refresh" ...> tag, the file's base name and the
# percent-decoded target URL are recorded. All records are finally written
# to static.bak/links.list, one "<file name> <target>" pair per line.

use strict;
use warnings;
use File::Find;

my $folder = $ENV{PWD} . "/static.bak";
my $links  = '';

find(\&filehandler, $folder);

# File::Find callback. $_ holds the entry's base name (find() has already
# changed into the entry's directory); $File::Find::name is the full path.
# Appends to the file-level $links accumulator.
sub filehandler {
    # Only regular files can be redirect stubs; skip directories etc.
    return unless -f $_;

    # -s yields a false value for empty files, so normalise to 0.
    my $size = (-s _) || 0;
    return unless $size < 2048;

    open(my $fh, '<', $_) or do {
        warn "cannot open $File::Find::name: $!\n";
        return;
    };
    my @lines = <$fh>;
    close($fh);

    # The refresh tag is expected on line five; shorter files are ignored.
    return if $#lines < 4;

    if (my ($href) = $lines[4] =~ /<meta http-equiv="Refresh" content="0;url=([^"]+)" \/>/) {
        # Undo percent-encoding; hex digits may be upper or lower case.
        $href =~ s/%([0-9A-Fa-f]{2})/chr(hex($1))/eg;
        $links .= "$_ $href\n";
    }
    else {
        print "no match in $File::Find::name\n\$lines[4]: $lines[4]\n\n";
    }
    return;
}

open(my $list, '>', "$folder/links.list")
    or die "cannot write $folder/links.list: $!\n";
print {$list} $links;
close($list)
    or die "cannot close $folder/links.list: $!\n";
|
11
filecoun.pl
Normal file
11
filecoun.pl
Normal file
|
@ -0,0 +1,11 @@
|
|||
#!/usr/bin/perl -w
#
# Remove every entry of the directory given as the first argument.
#
# Entries are unlinked one at a time while reading the directory, so the
# script also works when the directory holds too many files for a shell
# glob. Entries that cannot be removed (e.g. sub-directories) are reported
# on standard output and skipped.
use strict;
use warnings;

my $dir = shift;
die "Usage: $0 directory" unless defined $dir;

opendir(my $dh, $dir) or die "Could not open $dir: $!\n";
while (defined(my $file = readdir $dh)) {
    # "." and ".." are directory links, not files; trying to unlink them
    # only produced two bogus error messages per run.
    next if $file eq '.' or $file eq '..';

    unlink "$dir/$file" or print "Could not remove $dir/$file: $! \n";
}
closedir $dh;
|
55
mediawikipatch.diff
Normal file
55
mediawikipatch.diff
Normal file
|
@ -0,0 +1,55 @@
|
|||
Index: maintenance/dumpHTML.inc
|
||||
===================================================================
|
||||
--- maintenance/dumpHTML.inc (revision 29586)
|
||||
+++ maintenance/dumpHTML.inc (working copy)
|
||||
@@ -24,7 +24,7 @@
|
||||
var $interwiki = true;
|
||||
|
||||
# Depth of HTML directory tree
|
||||
- var $depth = 3;
|
||||
+ var $depth = 0;
|
||||
|
||||
# Directory that commons images are copied into
|
||||
var $sharedStaticDirectory;
|
||||
@@ -835,30 +835,29 @@
|
||||
return 'index.html';
|
||||
}
|
||||
|
||||
- return $this->getHashedDirectory( $title ) . '/' .
|
||||
- $this->getFriendlyName( $dbkey ) . '.html';
|
||||
+ return $this->getFriendlyName( $dbkey );
|
||||
}
|
||||
|
||||
function getFriendlyName( $name ) {
|
||||
global $wgLang;
|
||||
# Replace illegal characters for Windows paths with underscores
|
||||
- $friendlyName = strtr( $name, '/\\*?"<>|~', '_________' );
|
||||
+ $friendlyName = str_replace( '/', '_', $name );
|
||||
|
||||
# Work out lower case form. We assume we're on a system with case-insensitive
|
||||
# filenames, so unless the case is of a special form, we have to disambiguate
|
||||
- if ( function_exists( 'mb_strtolower' ) ) {
|
||||
- $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) );
|
||||
- } else {
|
||||
- $lowerCase = ucfirst( strtolower( $name ) );
|
||||
- }
|
||||
+ #if ( function_exists( 'mb_strtolower' ) ) {
|
||||
+ # $lowerCase = $wgLang->ucfirst( mb_strtolower( $name ) );
|
||||
+ #} else {
|
||||
+ # $lowerCase = ucfirst( strtolower( $name ) );
|
||||
+ #}
|
||||
|
||||
# Make it mostly unique
|
||||
- if ( $lowerCase != $friendlyName ) {
|
||||
- $friendlyName .= '_' . substr(md5( $name ), 0, 4);
|
||||
- }
|
||||
+ #if ( $lowerCase != $friendlyName ) {
|
||||
+ # $friendlyName .= '_' . substr(md5( $name ), 0, 4);
|
||||
+ #}
|
||||
# Handle colon specially by replacing it with tilde
|
||||
# Thus we reduce the number of paths with hashes appended
|
||||
- $friendlyName = str_replace( ':', '~', $friendlyName );
|
||||
+ #$friendlyName = str_replace( ':', '~', $friendlyName );
|
||||
|
||||
return $friendlyName;
|
||||
}
|
19
mokopedia.pl
Normal file
19
mokopedia.pl
Normal file
|
@ -0,0 +1,19 @@
|
|||
# Minimal HTTP responder listening on localhost:7070.
#
# Connections are served one at a time: the first line of the request is
# echoed to standard output and a fixed HTML snippet is sent back.
use strict;
use warnings;
use IO::Socket;

my $sock = IO::Socket::INET->new(
    LocalHost => 'localhost',
    LocalPort => '7070',
    Proto     => 'tcp',
    Listen    => 1,
    ReuseAddr => 1,    # "Reuse" is the deprecated spelling of this option
);
die "Could not create socket: $!\n" unless $sock;

while (my $new_sock = $sock->accept()) {
    # Request line only; headers and body of the request are ignored.
    my $get = <$new_sock>;
    print $get if defined $get;    # client may disconnect without sending

    my $body = "<b>j0!</b>";

    # HTTP requires CRLF line endings; the length and "close" headers tell
    # the client where the response ends.
    print $new_sock "HTTP/1.1 200 OK\r\n",
        "Content-Type: text/html\r\n",
        "Content-Length: ", length($body), "\r\n",
        "Connection: close\r\n",
        "\r\n",
        $body;
    close $new_sock;
}
close $sock;
|
141
mokopedia_logo.svg
Normal file
141
mokopedia_logo.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 42 KiB |
23
remove_everything.pl
Executable file
23
remove_everything.pl
Executable file
|
@ -0,0 +1,23 @@
|
|||
#!/usr/bin/perl -w
#
# Run every regular file larger than 2048 bytes below the current directory
# through xsltproc with ./transform.xslt and print the condensed result
# ("<title>\n<document on one line>") to standard output.
#
# Writing the result back to the file is still disabled (see the
# commented-out open/close in filehandler).

use strict;
use warnings;
use File::Find;

my $transform = $ENV{PWD} . "/transform.xslt";
my $folder    = $ENV{PWD} . "/";

find(\&filehandler, $folder);

# File::Find callback; $_ is the base name of the current entry and the
# working directory is the entry's directory.
sub filehandler {
    # Directories can be larger than 2048 bytes too; only files are pages.
    return unless -f $_;
    return unless ((-s _) || 0) > 2048;

    # List-form pipe open: xsltproc is started without a shell, so file
    # names need no quoting at all.
    open(my $xslt, '-|', 'xsltproc', '--html', $transform, $_) or do {
        warn "cannot run xsltproc on $File::Find::name: $!\n";
        return;
    };
    my $doc = do { local $/; <$xslt> };
    close($xslt);    # exit status deliberately ignored, as before
    $doc = '' unless defined $doc;

    $doc =~ s/^[^\n]*\n//;                  # drop the first output line
    my ($title) = $doc =~ /^([^\n]*)\n/;    # second line is taken as title
    $title = '' unless defined $title;      # short/empty xsltproc output
    $doc =~ s/^[^\n]*\n//;                  # ...and removed from the body
    $doc =~ s/\n//g;                        # join everything on one line
    $doc =~ s/\s{2,}//g;                    # remove runs of 2+ whitespace chars
    $doc =~ s/<!--.*?-->//g;                # strip HTML comments

    #open FILE, ">$_";
    print "$title\n$doc";
    #close FILE;
    return;
}
|
40
remove_files.sh
Executable file
40
remove_files.sh
Executable file
|
@ -0,0 +1,40 @@
|
|||
#!/bin/bash
#
# Delete dumped pages of unwanted namespaces from the current directory.
# A page file is named "<namespace>~<title>"; every file whose name starts
# with one of the prefixes below followed by "~" is removed.
#
# The original commands quoted the pattern (rm "Prefix~*"), which stops the
# shell from expanding the glob, so nothing was ever deleted. find is used
# instead of a bare glob so that a directory with a very large number of
# matches cannot overflow the command-line length limit.

prefixes=(
    # Kategorie              # disabled
    Kategorie_Diskussion
    Bild
    Bild_Diskussion
    Portal
    Portal_Diskussion
    Diskussion
    Vorlage
    Vorlage_Diskussion
    Benutzer
    Benutzer_Diskussion
    # Spezial                # disabled
    # Wikipedia              # disabled
    Wikipedia_Diskussion
    MediaWiki
    MediaWiki_Diskussion
    Hilfe
    Hilfe_Diskussion
    WP
)

for prefix in "${prefixes[@]}"; do
    echo "lösche \"${prefix}~*\""
    # Current directory only, never directories; "--" protects against
    # file names that start with a dash.
    find . -maxdepth 1 ! -type d -name "${prefix}~*" -exec rm -f -- {} +
done
|
11
test.pl
Normal file
11
test.pl
Normal file
|
@ -0,0 +1,11 @@
|
|||
# Scratch script: print every line of html.lst.3 that contains "aaaaaaaa".
use strict;
use warnings;

open(my $fh, '<', 'html.lst.3') or die "cannot open html.lst.3: $!\n";
while (my $line = <$fh>) {
    print $line if $line =~ /aaaaaaaa/;
}
close($fh);

__END__

The following is a shell command, not Perl. It was pasted into this file as
a note and is kept here, after __END__, so that the script above compiles.
It filters the unwanted namespaces out of html.lst:

cat html.lst | perl -ne "print unless /\/.{1,9}\/.{1,9}\/.{1,9}\/(Category_Discussion|Image|Image_Discussion|Portal|Portal_Discussion|Diskussion|Template|Template_Discussion|User|User_Discussion|Wikipedia_Discussion|MediaWiki|MediaWiki_Diskussion|Help|Help_Discussion|WP)~/;" - > html.lst.1
|
||||
|
63
timings
Normal file
63
timings
Normal file
|
@ -0,0 +1,63 @@
|
|||
- download database dump
|
||||
wget http://download.wikimedia.org/enwiki/20080103/enwiki-20080103-pages-articles.xml.bz2
|
||||
|
||||
- install a whole LAMP environment with apache2, php5 and mysql5
|
||||
|
||||
- checkout the mediawiki to /var/www/
|
||||
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/phase3 /var/www
|
||||
|
||||
- remove the extension dir from the checkout
|
||||
|
||||
- checkout the extensions to /var/www/extensions
|
||||
svn checkout http://svn.wikimedia.org/svnroot/mediawiki/trunk/extensions /var/www/extensions
|
||||
|
||||
- patch mediawiki with mediawikipatch.diff
|
||||
patch -p0 < mediawikipatch.diff
|
||||
|
||||
- configure /etc/apache2/sites-enabled/000-default so that mediawiki loads when you access localhost
|
||||
|
||||
- go to http://localhost and finish the mediawiki install
|
||||
|
||||
- add ParserFunctions Extension by adding
|
||||
require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" );
|
||||
to LocalSettings.php
|
||||
|
||||
- download xml2sql from <url to be added>, then run ./configure and make to build it
|
||||
|
||||
- convert the xml to sql
|
||||
time cat enwiki.xml | ./xml2sql-0.5/xml2sql -o sqldump/ -v -m
|
||||
|
||||
- import into sql database
|
||||
time mysql -f -u root -p mediawiki < sqldump/page.sql
|
||||
time mysql -f -u root -p mediawiki < sqldump/revision.sql
|
||||
time mysql -f -u root -p mediawiki < sqldump/text.sql
|
||||
|
||||
- dump everything
|
||||
time php maintenance/dumpHTML.php -s <startid> -e <endid>
|
||||
|
||||
apt-get install ocaml imagemagick gs cjk-latex tetex-extra php4-imagick binutils gcc
|
||||
cd math/ && make
|
||||
$wgUseTeX = true;
|
||||
|
||||
putcolumn(&rev_tbl, "NULL", 0);
|
||||
putcolumn(&rev_tbl, "NULL", 0);
|
||||
|
||||
require_once( "$IP/extensions/ParserFunctions/ParserFunctions.php" );
|
||||
|
||||
|
||||
downloading: 40m32.418s
|
||||
|
||||
extracting: 7m7.119s
|
||||
|
||||
xml2sql: 15m12.989s
|
||||
time cat ndswiki-20080109-pages-articles.xml | ./xml2sql-0.5/xml2sql -o sqldump.nds/ -v -m
|
||||
|
||||
insert: 119m35.751s 135m12.662s 283m11.183s
|
||||
time mysql -f -u root -p mediawiki < sqldump/page.sql
|
||||
|
||||
dumpHTML:
|
||||
time php maintenance/dumpHTML.php
|
||||
|
||||
removefiles:
|
||||
transform:
|
||||
|
50
transform.xslt
Normal file
50
transform.xslt
Normal file
|
@ -0,0 +1,50 @@
|
|||
<?xml version="1.0" encoding="UTF-8" ?>
<!--
  Reduce a dumped MediaWiki HTML page to its article content.

  Output: the text of the h1 "firstHeading" element followed by the children
  of the "bodyContent" div, with scripts, images, thumbnail boxes, the print
  footer, span wrappers and presentational attributes removed.
-->
<xsl:stylesheet version="1.0"
    xmlns:xsl="http://www.w3.org/1999/XSL/Transform"
>
    <!-- Entry point: process nothing but the bodyContent div. -->
    <xsl:template match="/">
        <xsl:apply-templates select="/html/body/div/div/div/div[@id='bodyContent']" />
    </xsl:template>

    <!-- bodyContent div: emit the page heading, then the div's children.
         The div element itself is not copied. -->
    <xsl:template match="/html/body/div/div/div/div[@id='bodyContent']">
        <xsl:value-of select="/html/body/div/div/div/h1[@class='firstHeading']" />
        <xsl:apply-templates/>
    </xsl:template>

    <!-- A further template for the same bodyContent pattern used to follow
         here. It was declared with a "select" attribute, which xsl:template
         does not have (a template needs "match" or "name"), and it repeated
         the pattern handled above, so it has been removed. -->

    <xsl:template match="h3[@id='siteSub']" /> <!-- delete siteSub heading -->

    <xsl:template match="span[@id='editsection']" /> <!-- delete editsection -->

    <!-- delete span tag but preserve content -->
    <xsl:template match="span">
        <xsl:apply-templates />
    </xsl:template>

    <!-- replace TeX formula images with their alt attribute -->
    <xsl:template match="img[@class='tex']">
        <code><xsl:value-of select="./@alt" /></code>
    </xsl:template>

    <xsl:template match="script" /> <!-- delete script nodes -->
    <xsl:template match="img" /> <!-- delete all other img nodes -->

    <!-- delete thumbnail boxes -->
    <xsl:template match="div[@class='thumb tright']" />
    <xsl:template match="div[@class='thumb tleft']" />

    <xsl:template match="div[@class='printfooter']" /> <!-- delete footer note -->

    <xsl:template match="@class" /> <!-- delete class attributes -->
    <xsl:template match="@style" /> <!-- delete style attributes -->
    <xsl:template match="@title" /> <!-- delete title attributes -->
    <xsl:template match="@rel" /> <!-- delete rel attributes -->

    <!-- identity transform: copy everything not handled above -->
    <xsl:template match="@*|node()">
        <xsl:copy>
            <xsl:apply-templates select="@*|node()"/>
        </xsl:copy>
    </xsl:template>
</xsl:stylesheet>
|
70
wokopedia.svg
Normal file
70
wokopedia.svg
Normal file
File diff suppressed because one or more lines are too long
After Width: | Height: | Size: 29 KiB |
Loading…
Reference in a new issue