initial commit

This commit is contained in:
josch 2014-06-26 14:00:38 +02:00
commit f142c0068a
4 changed files with 138 additions and 0 deletions

25
explosm.pl Executable file
View file

@@ -0,0 +1,25 @@
#!/usr/bin/perl
# Scrape "Cyanide and Happiness" strips from explosm.net by following each
# page's "Next >" link starting at comic id 942; every image is written to
# explosm/<seq>-<date>-<author>.
use strict;
use warnings;
use LWP::Simple;
use File::Path;

my $base_url = "http://www.explosm.net/comics/";
my $next = 942;   # comic id of the page to fetch next
my $i    = 802;   # running sequence number used in the output file name
mkpath "explosm";
while ($next) {
    my $page = get $base_url.$next or die "cannot access ".$base_url.$next;
    # Extract date, author, id of the next comic and the image URL from the
    # page markup.
    my ($date, $author, $img);
    ($date, $author, $next, $img) = $page =~ /<nobr>(\d\d.\d\d.\d\d\d\d) <b>by (?:<a href=\"http:\/\/www.explosm.net\/comics\/author\/[^\"]+\/\">)?([^<]+)(?:<\/a>)?<\/b>.*?<\/nobr>.+?<a href=\"\/comics\/(\d+)\/\">Next ><\/a>.+?<img alt=\"Cyanide and Happiness, a daily webcomic\" src=\"(http:\/\/(?:www.)?explosm.net\/db\/files\/[^\"]+)\">/;
    # If the layout changed (or we hit the newest comic, which presumably has
    # no "Next >" link) the match fails and every capture is undef; fail
    # loudly instead of dying later on a "cannot access " with no URL.
    defined $img or die "cannot parse comic page (last good id: ".($i - 1).")";
    print $i, " ", $next, " ", $date, "\n";
    my $image = get $img or die "cannot access ".$img;
    # BUG FIX: the original re-declared "my $content" in the same scope and
    # wrote through an unchecked bareword 2-arg open without binmode.
    open my $fh, '>', "explosm/$i-$date-$author"
        or die "cannot write explosm/$i-$date-$author: $!";
    binmode $fh;   # comic images are binary data
    print {$fh} $image;
    close $fh or die "cannot close explosm/$i-$date-$author: $!";
    $i++;
    $|++;          # autoflush STDOUT so progress output appears immediately
}

28
omdl.py Normal file
View file

@@ -0,0 +1,28 @@
from urllib import urlretrieve
import urllib2
from lxml import etree
import os
from sys import argv
base_url = "http://www.onemanga.com"
manga = argv[1]
chapter = argv[2]
path = "%s/%s"%(manga, chapter)
if not os.path.exists(path):
os.makedirs(path)
f = urllib2.urlopen("%s/%s/%s/"%(base_url, manga, chapter))
tree = etree.parse(f, etree.HTMLParser())
firstpage = "%s%s"%(base_url, tree.find(".//div[@id='chapter-cover']//ul/li/a").attrib['href'])
f = urllib2.urlopen(firstpage)
tree = etree.parse(f, etree.HTMLParser())
for page in [p.attrib['value'] for p in tree.findall(".//select[@id='id_page_select']/option")]:
print manga, chapter, page
tree2 = etree.parse(urllib2.urlopen("%s/%s/%s/%s"%(base_url, manga, chapter, page)), etree.HTMLParser())
urlretrieve(tree2.find(".//div[@class='one-page']//img[@class='manga-page']").attrib['src'], "%s/%s.jpg"%(path,page))

29
oots.pl Executable file
View file

@@ -0,0 +1,29 @@
#!/usr/bin/perl
# Scrape "The Order of the Stick" from giantitp.com starting at comic 590,
# saving each strip as oots/<number>.gif.  Stops after the comic whose
# "next" navigation link points at "#" (i.e. the newest comic).
use strict;
use warnings;
use LWP::Simple;
use XML::LibXML;
use File::Path;

my $base_url = "http://www.giantitp.com";
my $next = 1;     # loop flag: cleared once the newest comic is reached
my $i    = 590;   # comic number to fetch
mkpath "oots";
while ($next) {
    my $url  = sprintf("%s/comics/oots%04d.html", $base_url, $i);
    my $page = get $url or die "cannot access ".$url;
    my $parser = XML::LibXML->new();
    # libxml's HTML parser tolerates the site's non-well-formed markup
    my $doc = $parser->parse_html_string($page);
    # BUG FIX: the original appended "\n" to the scraped image path, which
    # produced a malformed download URL; it also re-declared "my $content"
    # in the same scope and referenced undeclared variables in die messages.
    my $img = $doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[2]/td/img/\@src");
    my $image = get $base_url.$img or die "cannot access ".$base_url.$img;
    open my $fh, '>', "oots/$i.gif" or die "cannot write oots/$i.gif: $!";
    binmode $fh;   # GIF data is binary
    print {$fh} $image;
    close $fh or die "cannot close oots/$i.gif: $!";
    # the sixth navigation anchor is "next"; it is "#" on the newest comic
    if ($doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[1]/td/table/tr/td/a[6]/\@href") eq "#") {
        $next = 0;
    }
    $i++;
}

56
scraper.pl Executable file
View file

@@ -0,0 +1,56 @@
#!/usr/bin/perl
# Download every chapter of a manga from onemanga.com.
# Usage: ./scraper.pl <Manga_Name>     e.g.  ./scraper.pl Fairy_Tail
# Pages already present on disk are skipped, so an interrupted run resumes.
use strict;
use warnings;
use LWP::Simple;
use XML::LibXML;
use File::Path;

my $base_url = "http://www.onemanga.com";
my $url = $ARGV[0];
if (not $url) {
    print "usage: ./scraper.pl Fairy_Tail\n";
    exit;
}
my $content = get $base_url."/".$url or die "cannot access ".$base_url."/".$url;
$content =~ s/&/&amp;/g;    # escape bare ampersands so libxml accepts the page
my $parser = XML::LibXML->new();
my $doc = $parser->parse_html_string($content, $parser->recover(2));
# iterate through all chapters, oldest first (the index lists newest first)
my @chapters = reverse($doc->findnodes("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content-main']/table[\@class='ch-table']/tr/td[\@class='ch-subject']/a/\@href"));
foreach my $node (@chapters) {
    print "doing ", $node->value, "\n";
    mkpath ".".$node->value;
    # the chapter cover page links to the chapter's first comic page
    my $chapter1_url = $base_url.$node->value;
    $content = get $chapter1_url or die "cannot access ".$chapter1_url;
    $content =~ s/&/&amp;/g;
    my $doc_chap = $parser->parse_html_string($content, $parser->recover(2));
    $chapter1_url = $base_url.$doc_chap->findvalue("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content2']/div[\@id='chapter-cover']/ul/li[1]/a/\@href");
    # the first page's <select> widget enumerates every page of the chapter
    $content = get $chapter1_url or die "cannot access ".$chapter1_url;
    $content =~ s/&/&amp;/g;
    $doc_chap = $parser->parse_html_string($content, $parser->recover(2));
    # iterate through all pages
    foreach my $node_chap ($doc_chap->findnodes("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='chapter-navigation']/select[\@name='page']/option/\@value")) {
        my $file = ".".$node->value.$node_chap->value.".jpg";
        if (-f $file) {
            print $file, " already there\n";
        } else {
            my $page_url = $base_url.$node->value.$node_chap->value."/";
            # extract the image URL from the page
            $content = get $page_url or die "cannot access ".$page_url;
            $content =~ s/&/&amp;/g;
            my $doc_page = $parser->parse_html_string($content, $parser->recover(2));
            my $img_url = $doc_page->findvalue("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='one-page']/a/img[\@class='manga-page']/\@src");
            # download the image
            print "saving image $img_url\n";
            my $img = get $img_url or die "cannot access ".$img_url;
            # BUG FIX: the original used an unchecked bareword 2-arg open
            # without binmode for binary JPEG data.
            open my $fh, '>', $file or die "cannot write $file: $!";
            binmode $fh;
            print {$fh} $img;
            close $fh or die "cannot close $file: $!";
        }
    }
}