initial commit

This commit is contained in:
josch 2014-06-26 14:00:38 +02:00
commit f142c0068a
4 changed files with 138 additions and 0 deletions

25
explosm.pl Executable file
View file

@@ -0,0 +1,25 @@
#!/usr/bin/perl
# Scrape "Cyanide and Happiness" strips from explosm.net by following each
# page's "Next >" link starting at comic id 942; every image is written to
# explosm/<seq>-<date>-<author>.
use strict;
use warnings;
use LWP::Simple;
use File::Path;

my $base_url = "http://www.explosm.net/comics/";
my $next = 942;   # comic id of the page to fetch next
my $i    = 802;   # running sequence number used in the output file name
mkpath "explosm";
while ($next) {
    my $page = get $base_url.$next or die "cannot access ".$base_url.$next;
    # Extract date, author, id of the next comic and the image URL from the
    # page markup.
    my ($date, $author, $img);
    ($date, $author, $next, $img) = $page =~ /<nobr>(\d\d.\d\d.\d\d\d\d) <b>by (?:<a href=\"http:\/\/www.explosm.net\/comics\/author\/[^\"]+\/\">)?([^<]+)(?:<\/a>)?<\/b>.*?<\/nobr>.+?<a href=\"\/comics\/(\d+)\/\">Next ><\/a>.+?<img alt=\"Cyanide and Happiness, a daily webcomic\" src=\"(http:\/\/(?:www.)?explosm.net\/db\/files\/[^\"]+)\">/;
    # If the layout changed (or we hit the newest comic, which presumably has
    # no "Next >" link) the match fails and every capture is undef; fail
    # loudly instead of dying later on a "cannot access " with no URL.
    defined $img or die "cannot parse comic page (last good id: ".($i - 1).")";
    print $i, " ", $next, " ", $date, "\n";
    my $image = get $img or die "cannot access ".$img;
    # BUG FIX: the original re-declared "my $content" in the same scope and
    # wrote through an unchecked bareword 2-arg open without binmode.
    open my $fh, '>', "explosm/$i-$date-$author"
        or die "cannot write explosm/$i-$date-$author: $!";
    binmode $fh;   # comic images are binary data
    print {$fh} $image;
    close $fh or die "cannot close explosm/$i-$date-$author: $!";
    $i++;
    $|++;          # autoflush STDOUT so progress output appears immediately
}

28
omdl.py Normal file
View file

@@ -0,0 +1,28 @@
from urllib import urlretrieve
import urllib2
from lxml import etree
import os
from sys import argv
base_url = "http://www.onemanga.com"
manga = argv[1]
chapter = argv[2]
path = "%s/%s"%(manga, chapter)
if not os.path.exists(path):
os.makedirs(path)
f = urllib2.urlopen("%s/%s/%s/"%(base_url, manga, chapter))
tree = etree.parse(f, etree.HTMLParser())
firstpage = "%s%s"%(base_url, tree.find(".//div[@id='chapter-cover']//ul/li/a").attrib['href'])
f = urllib2.urlopen(firstpage)
tree = etree.parse(f, etree.HTMLParser())
for page in [p.attrib['value'] for p in tree.findall(".//select[@id='id_page_select']/option")]:
print manga, chapter, page
tree2 = etree.parse(urllib2.urlopen("%s/%s/%s/%s"%(base_url, manga, chapter, page)), etree.HTMLParser())
urlretrieve(tree2.find(".//div[@class='one-page']//img[@class='manga-page']").attrib['src'], "%s/%s.jpg"%(path,page))

29
oots.pl Executable file
View file

@@ -0,0 +1,29 @@
#!/usr/bin/perl
# Scrape "The Order of the Stick" from giantitp.com starting at comic 590,
# saving each strip as oots/<number>.gif.  Stops after the comic whose
# "next" navigation link points at "#" (i.e. the newest comic).
use strict;
use warnings;
use LWP::Simple;
use XML::LibXML;
use File::Path;

my $base_url = "http://www.giantitp.com";
my $next = 1;     # loop flag: cleared once the newest comic is reached
my $i    = 590;   # comic number to fetch
mkpath "oots";
while ($next) {
    my $url  = sprintf("%s/comics/oots%04d.html", $base_url, $i);
    my $page = get $url or die "cannot access ".$url;
    my $parser = XML::LibXML->new();
    # libxml's HTML parser tolerates the site's non-well-formed markup
    my $doc = $parser->parse_html_string($page);
    # BUG FIX: the original appended "\n" to the scraped image path, which
    # produced a malformed download URL; it also re-declared "my $content"
    # in the same scope and referenced undeclared variables in die messages.
    my $img = $doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[2]/td/img/\@src");
    my $image = get $base_url.$img or die "cannot access ".$base_url.$img;
    open my $fh, '>', "oots/$i.gif" or die "cannot write oots/$i.gif: $!";
    binmode $fh;   # GIF data is binary
    print {$fh} $image;
    close $fh or die "cannot close oots/$i.gif: $!";
    # the sixth navigation anchor is "next"; it is "#" on the newest comic
    if ($doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[1]/td/table/tr/td/a[6]/\@href") eq "#") {
        $next = 0;
    }
    $i++;
}

56
scraper.pl Executable file
View file

@@ -0,0 +1,56 @@
#!/usr/bin/perl
# Download every chapter of a manga from onemanga.com.
# Usage: ./scraper.pl <Manga_Name>     e.g.  ./scraper.pl Fairy_Tail
# Pages already present on disk are skipped, so an interrupted run resumes.
use strict;
use warnings;
use LWP::Simple;
use XML::LibXML;
use File::Path;

my $base_url = "http://www.onemanga.com";
my $url = $ARGV[0];
if (not $url) {
    print "usage: ./scraper.pl Fairy_Tail\n";
    exit;
}
my $content = get $base_url."/".$url or die "cannot access ".$base_url."/".$url;
$content =~ s/&/&amp;/g;    # escape bare ampersands so libxml accepts the page
my $parser = XML::LibXML->new();
my $doc = $parser->parse_html_string($content, $parser->recover(2));
# iterate through all chapters, oldest first (the index lists newest first)
my @chapters = reverse($doc->findnodes("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content-main']/table[\@class='ch-table']/tr/td[\@class='ch-subject']/a/\@href"));
foreach my $node (@chapters) {
    print "doing ", $node->value, "\n";
    mkpath ".".$node->value;
    # the chapter cover page links to the chapter's first comic page
    my $chapter1_url = $base_url.$node->value;
    $content = get $chapter1_url or die "cannot access ".$chapter1_url;
    $content =~ s/&/&amp;/g;
    my $doc_chap = $parser->parse_html_string($content, $parser->recover(2));
    $chapter1_url = $base_url.$doc_chap->findvalue("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content2']/div[\@id='chapter-cover']/ul/li[1]/a/\@href");
    # the first page's <select> widget enumerates every page of the chapter
    $content = get $chapter1_url or die "cannot access ".$chapter1_url;
    $content =~ s/&/&amp;/g;
    $doc_chap = $parser->parse_html_string($content, $parser->recover(2));
    # iterate through all pages
    foreach my $node_chap ($doc_chap->findnodes("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='chapter-navigation']/select[\@name='page']/option/\@value")) {
        my $file = ".".$node->value.$node_chap->value.".jpg";
        if (-f $file) {
            print $file, " already there\n";
        } else {
            my $page_url = $base_url.$node->value.$node_chap->value."/";
            # extract the image URL from the page
            $content = get $page_url or die "cannot access ".$page_url;
            $content =~ s/&/&amp;/g;
            my $doc_page = $parser->parse_html_string($content, $parser->recover(2));
            my $img_url = $doc_page->findvalue("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='one-page']/a/img[\@class='manga-page']/\@src");
            # download the image
            print "saving image $img_url\n";
            my $img = get $img_url or die "cannot access ".$img_url;
            # BUG FIX: the original used an unchecked bareword 2-arg open
            # without binmode for binary JPEG data.
            open my $fh, '>', $file or die "cannot write $file: $!";
            binmode $fh;
            print {$fh} $img;
            close $fh or die "cannot close $file: $!";
        }
    }
}