initial commit

2014-06-26 14:00:38 +02:00 · 2014-06-26 14:00:38 +02:00 · f142c0068a
commit f142c0068a
4 changed files with 138 additions and 0 deletions
--- a/explosm.pl
+++ b/explosm.pl
@ -0,0 +1,60 @
+#!/usr/bin/perl
+#
+# explosm.pl - download Cyanide & Happiness strips from explosm.net.
+#
+# Starting at a hard-coded comic id, the script fetches the comic page, saves
+# the strip as explosm/<counter>-<date>-<author> and follows the "Next >" link
+# until a page without one is reached.
+
+use strict;
+use warnings;
+
+use LWP::Simple;
+use File::Path;
+
+my $base_url = "http://www.explosm.net/comics/";
+my $out_dir  = "explosm";
+
+# Id of the first comic page to fetch, and the running counter that prefixes
+# the saved file names.
+my $next = 942;
+my $i    = 802;
+
+$| = 1;    # show progress immediately instead of when the buffer fills
+mkpath $out_dir;
+
+while ($next) {
+    my $page_url = $base_url . $next;
+    my $page     = get($page_url);
+    defined $page or die "cannot access " . $page_url;
+
+    # Date and author are required; a page without them is not a comic page.
+    my ($date, $author) = $page =~ m{<nobr>(\d\d.\d\d.\d\d\d\d) <b>by (?:<a href="http://www\.explosm\.net/comics/author/[^"]+/">)?([^<]+)(?:</a>)?</b>.*?</nobr>}
+        or die "cannot find date and author on " . $page_url;
+
+    my ($img) = $page =~ m{<img alt="Cyanide and Happiness, a daily webcomic" src="(http://(?:www\.)?explosm\.net/db/files/[^"]+)">}
+        or die "cannot find comic image on " . $page_url;
+
+    # The "Next >" link is matched on its own: a page without it (presumably
+    # the newest comic) is still saved, and the undefined $next then ends the
+    # loop instead of failing the whole match.
+    ($next) = $page =~ m{<a href="/comics/(\d+)/">Next ></a>};
+
+    print $i, " ", (defined $next ? $next : "-"), " ", $date, "\n";
+
+    my $image = get($img);
+    defined $image or die "cannot access " . $img;
+
+    # Date and author come from the remote page; keep them from introducing
+    # extra path components.
+    (my $file_name = "$i-$date-$author") =~ s{[/\\]}{_}g;
+    my $file = "$out_dir/$file_name";
+
+    open my $fh, '>', $file or die "cannot write " . $file . ": $!";
+    binmode $fh;    # image data must not go through newline translation
+    print {$fh} $image;
+    close $fh or die "cannot write " . $file . ": $!";
+
+    $i++;
+}
+
--- a/omdl.py
+++ b/omdl.py
@ -0,0 +1,28 @@
+from urllib import urlretrieve
+import urllib2
+from lxml import etree
+import os
+from sys import argv
+
+base_url = "http://www.onemanga.com"
+
+manga = argv[1]
+
+chapter = argv[2]
+
+path = "%s/%s"%(manga, chapter)
+
+if not os.path.exists(path):
+	os.makedirs(path)
+
+f = urllib2.urlopen("%s/%s/%s/"%(base_url, manga, chapter))
+tree = etree.parse(f, etree.HTMLParser())
+
+firstpage = "%s%s"%(base_url, tree.find(".//div[@id='chapter-cover']//ul/li/a").attrib['href'])
+
+f = urllib2.urlopen(firstpage)
+tree = etree.parse(f, etree.HTMLParser())
+for page in [p.attrib['value'] for p in tree.findall(".//select[@id='id_page_select']/option")]:
+	print manga, chapter, page
+	tree2 = etree.parse(urllib2.urlopen("%s/%s/%s/%s"%(base_url, manga, chapter, page)), etree.HTMLParser())
+	urlretrieve(tree2.find(".//div[@class='one-page']//img[@class='manga-page']").attrib['src'], "%s/%s.jpg"%(path,page))
--- a/oots.pl
+++ b/oots.pl
@ -0,0 +1,59 @
+#!/usr/bin/perl
+#
+# oots.pl - download Order of the Stick strips from giantitp.com.
+#
+# Starting at a hard-coded strip number, the script fetches each comic page,
+# saves the strip image as oots/<number>.gif and stops after the page whose
+# sixth link in the first table row points to "#" (presumably the disabled
+# "next" button of the latest strip).
+
+use strict;
+use warnings;
+
+use LWP::Simple;
+use XML::LibXML;
+use File::Path;
+
+my $base_url = "http://www.giantitp.com";
+my $out_dir  = "oots";
+
+# Locations of the strip image and of the sixth header link inside the
+# nested layout tables of a comic page.
+my $img_xpath  = '//body/table/tr/td/table/tr/td/table/tr/td/table/tr[2]/td/img/@src';
+my $next_xpath = '//body/table/tr/td/table/tr/td/table/tr/td/table/tr[1]/td/table/tr/td/a[6]/@href';
+
+my $next = 1;
+my $i    = 590;    # number of the first strip to fetch
+
+mkpath $out_dir;
+
+my $parser = XML::LibXML->new();    # one parser serves every page
+
+while ($next) {
+    my $page_url = sprintf("%s/comics/oots%04d.html", $base_url, $i);
+    my $page     = get($page_url);
+    defined $page or die "cannot access " . $page_url;
+
+    my $doc = $parser->parse_html_string($page); #we are lucky libxml can read html crap
+
+    # An empty result means the image was not found; fetching $base_url alone
+    # would silently store some other page as a .gif.
+    my $img = $doc->findvalue($img_xpath);
+    length $img or die "cannot find comic image on " . $page_url;
+
+    my $img_url = $base_url . $img;
+    my $image   = get($img_url);
+    defined $image or die "cannot access " . $img_url;
+
+    my $file = "$out_dir/$i.gif";
+    open my $fh, '>', $file or die "cannot write " . $file . ": $!";
+    binmode $fh;    # image data must not go through newline translation
+    print {$fh} $image;
+    close $fh or die "cannot write " . $file . ": $!";
+
+    if ($doc->findvalue($next_xpath) eq "#") {
+        $next = 0;
+    }
+    $i++;
+}
+
--- a/scraper.pl
+++ b/scraper.pl
@ -0,0 +1,113 @
+#!/usr/bin/perl
+#
+# scraper.pl - download every chapter of one manga from onemanga.com.
+#
+# usage: ./scraper.pl Fairy_Tail
+#
+# Pages are stored below the current directory under the same path the site
+# uses for the chapter, one <page>.jpg per page; pages that already exist are
+# skipped, so an interrupted run can be resumed.
+
+use strict;
+use warnings;
+
+use LWP::Simple;
+use XML::LibXML;
+use File::Path;
+
+my $base_url = "http://www.onemanga.com";
+
+my $url = $ARGV[0];
+if (not $url) {
+    print "usage: ./scraper.pl Fairy_Tail\n";
+    exit;
+}
+
+# XPath locations of the pieces of the site's pages that are needed.
+my $chapters_xpath   = q{//body/div[@id='wrap']/div[@id='content']/div[@id='content-main']/table[@class='ch-table']/tr/td[@class='ch-subject']/a/@href};
+my $first_page_xpath = q{//body/div[@id='wrap']/div[@id='content']/div[@id='content2']/div[@id='chapter-cover']/ul/li[1]/a/@href};
+my $pages_xpath      = q{//body/div[@id='wrap2']/div[@id='content']/div[@id='content2']/div[@class='chapter-navigation']/select[@name='page']/option/@value};
+my $image_xpath      = q{//body/div[@id='wrap2']/div[@id='content']/div[@id='content2']/div[@class='one-page']/a/img[@class='manga-page']/@src};
+
+my $parser = XML::LibXML->new();
+$parser->recover(2);    # keep parsing, silently, on malformed markup
+
+# Fetch $address and return it as a parsed HTML document; dies when the
+# address cannot be retrieved.
+sub fetch_doc {
+    my ($address) = @_;
+
+    my $content = get($address);
+    defined $content and length $content or die "cannot access " . $address;
+    $content =~ s/&/&amp;/g; #clean up this junk...
+
+    return $parser->parse_html_string($content); #we are lucky libxml can read html crap
+}
+
+# Map a path taken from the site to a path below the current directory.
+# The value comes from the remote page, so ".." components are refused
+# rather than allowed to write outside the download tree.
+sub local_path {
+    my ($site_path) = @_;
+
+    die "refusing suspicious path " . $site_path
+        if $site_path =~ m{(?:\A|/)\.\.(?:/|\z)};
+
+    return "." . $site_path;
+}
+
+# Download $address into $file as binary data; dies on any failure.
+sub save_image {
+    my ($address, $file) = @_;
+
+    my $image = get($address);
+    defined $image or die "cannot access " . $address;
+
+    open my $fh, '>', $file or die "cannot write " . $file . ": $!";
+    binmode $fh;
+    print {$fh} $image;
+    close $fh or die "cannot write " . $file . ": $!";
+    return;
+}
+
+my $doc = fetch_doc($base_url . "/" . $url);
+
+#iterate through all chapters
+my @chapters = reverse($doc->findnodes($chapters_xpath));
+warn "no chapters found for " . $url . "\n" unless @chapters;
+
+foreach my $node (@chapters) {
+    my $chapter_path = $node->value;
+    print "doing ", $chapter_path, "\n";
+    my $chapter_dir = local_path($chapter_path);
+    mkpath $chapter_dir;
+
+    #get the first page's address
+    my $cover_url  = $base_url . $chapter_path;
+    my $first_page = fetch_doc($cover_url)->findvalue($first_page_xpath);
+    if (not length $first_page) {
+        warn "cannot find first page on " . $cover_url . "\n";
+        next;
+    }
+
+    #get all pages in this chapter
+    my $doc_chap = fetch_doc($base_url . $first_page);
+
+    #iterate through all pages
+    foreach my $node_chap ($doc_chap->findnodes($pages_xpath)) {
+        my $file = local_path($chapter_path . $node_chap->value . ".jpg");
+        if (-f $file) {
+            print $file . " already there\n";
+            next;
+        }
+
+        #get image url
+        my $page_url = $base_url . $chapter_path . $node_chap->value . "/";
+        my $img_url  = fetch_doc($page_url)->findvalue($image_xpath);
+        length $img_url or die "cannot find image on " . $page_url;
+
+        #download image
+        print "saving image $img_url\n";
+        save_image($img_url, $file);
+    }
+}