From f142c0068afc1beee45e47c15d44006d8c974b8f Mon Sep 17 00:00:00 2001
From: josch
Date: Thu, 26 Jun 2014 14:00:38 +0200
Subject: [PATCH] initial commit

---
 explosm.pl | 28 ++++++++++++++++++++++++++++
 omdl.py    | 29 +++++++++++++++++++++++++++++
 oots.pl    | 32 ++++++++++++++++++++++++++++++++
 scraper.pl | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 4 files changed, 147 insertions(+)
 create mode 100755 explosm.pl
 create mode 100644 omdl.py
 create mode 100755 oots.pl
 create mode 100755 scraper.pl

diff --git a/explosm.pl b/explosm.pl
new file mode 100755
index 0000000..d1e71e5
--- /dev/null
+++ b/explosm.pl
@@ -0,0 +1,28 @@
+#!/usr/bin/perl
+
+use LWP::Simple;
+use File::Path;
+
+my $base_url = "http://www.explosm.net/comics/";
+
+$next = 942;
+$i = 802;
+$| = 1; #print progress output right away
+mkpath "explosm";
+
+while($next) {
+    my $content = get $base_url.$next or die "cannot access ".$base_url.$next;
+
+    #capture date, author, id of the next comic and the image URL; the
+    #href/src attribute patterns are an assumption about explosm's markup
+    ($date, $author, $next, $img) = $content =~ /(\d\d\.\d\d\.\d\d\d\d) by (?:<a href=\"[^\"]*\">)?([^<]+)(?:<\/a>)?<\/b>.*?<\/nobr>.+?<a href=\"\/comics\/(\d+)\/\">Next ><\/a>.+?<img src=\"([^\"]+)\" alt=\"Cyanide/;
+
+    print $i, " ", $next, " ", $date, "\n";
+    $content = get $img or die "cannot access ".$img;
+    open FILE, ">explosm/$i-$date-$author" or die "cannot open output file: $!";
+    binmode FILE; #the comic image is binary data
+    print FILE $content;
+    close FILE;
+    $i++;
+}
+
diff --git a/omdl.py b/omdl.py
new file mode 100644
index 0000000..055b58a
--- /dev/null
+++ b/omdl.py
@@ -0,0 +1,29 @@
+from urllib import urlretrieve
+import urllib2
+from lxml import etree
+import os
+from sys import argv
+
+base_url = "http://www.onemanga.com"
+
+manga = argv[1]
+chapter = argv[2]
+
+path = "%s/%s" % (manga, chapter)
+
+if not os.path.exists(path):
+    os.makedirs(path)
+
+# the chapter overview page links to the first page of the chapter
+f = urllib2.urlopen("%s/%s/%s/" % (base_url, manga, chapter))
+tree = etree.parse(f, etree.HTMLParser())
+
+firstpage = "%s%s" % (base_url, tree.find(".//div[@id='chapter-cover']//ul/li/a").attrib['href'])
+
+# the page select box on the first page lists all pages of the chapter
+f = urllib2.urlopen(firstpage)
+tree = etree.parse(f, etree.HTMLParser())
+for page in [p.attrib['value'] for p in tree.findall(".//select[@id='id_page_select']/option")]:
+    print manga, chapter, page
+    tree2 = etree.parse(urllib2.urlopen("%s/%s/%s/%s" % (base_url, manga, chapter, page)), etree.HTMLParser())
+    urlretrieve(tree2.find(".//div[@class='one-page']//img[@class='manga-page']").attrib['src'], "%s/%s.jpg" % (path, page))
diff --git a/oots.pl b/oots.pl
new file mode 100755
index 0000000..345cfb2
--- /dev/null
+++ b/oots.pl
@@ -0,0 +1,32 @@
+#!/usr/bin/perl
+
+use LWP::Simple;
+use XML::LibXML;
+use File::Path;
+
+my $base_url = "http://www.giantitp.com";
+
+$next = 1;
+$i = 590;
+
+mkpath "oots";
+
+while($next) {
+    my $url = sprintf($base_url."/comics/oots%04d.html", $i);
+    my $content = get $url or die "cannot access ".$url;
+
+    my $parser = XML::LibXML->new();
+    my $doc = $parser->parse_html_string($content); #we are lucky libxml can read html crap
+    my $img = $doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[2]/td/img/\@src");
+    $content = get $base_url.$img or die "cannot access ".$base_url.$img;
+    open FILE, ">oots/$i.gif" or die "cannot open output file: $!";
+    binmode FILE; #the comic image is binary data
+    print FILE $content;
+    close FILE;
+    #the 6th navigation link is the "next" arrow; it points at "#" on the newest comic
+    if ($doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[1]/td/table/tr/td/a[6]/\@href") eq "#") {
+        $next = 0;
+    }
+    $i++;
+}
+
diff --git a/scraper.pl b/scraper.pl
new file mode 100755
index 0000000..4b2530d
--- /dev/null
+++ b/scraper.pl
@@ -0,0 +1,58 @@
+#!/usr/bin/perl
+
+use LWP::Simple;
+use XML::LibXML;
+use File::Path;
+
+my $base_url = "http://www.onemanga.com";
+
+$url = $ARGV[0];
+if(not $url) {
+    print "usage: ./scraper.pl Fairy_Tail\n";
+    exit;
+}
+
+my $content = get $base_url."/".$url or die "cannot access ".$base_url."/".$url;
"cannot access ".$base_url."/".$url; + +$content =~ s/&/&/g; #clean up this junk... + +my $parser = XML::LibXML->new(); +my $doc = $parser->parse_html_string($content, $parser->recover(2)); #we are lucky libxml can read html crap + +#iterate through all chapters +@chapters = reverse($doc->findnodes("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content-main']/table[\@class='ch-table']/tr/td[\@class='ch-subject']/a/\@href")); +foreach my $node (@chapters) { + print "doing ", $node->value, "\n"; + mkpath ".".$node->value; + #get the first page's address + $chapter1_url = $base_url.$node->value; + $content = get $chapter1_url or die "cannot access ".$chapter1_url; + $content =~ s/&/&/g; #clean up this junk... + $doc_chap = $parser->parse_html_string($content, $parser->recover(2)); #we are lucky libxml can read html crap + $chapter1_url = $base_url.$doc_chap->findvalue("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content2']/div[\@id='chapter-cover']/ul/li[1]/a/\@href"); + #get all pages in this chapter + $content = get $chapter1_url or die "cannot access ".$chapter1_url; + $content =~ s/&/&/g; #clean up this junk... + $doc_chap = $parser->parse_html_string($content, $parser->recover(2)); #we are lucky libxml can read html crap + #iterate throug all pages + foreach $node_chap ($doc_chap->findnodes("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='chapter-navigation']/select[\@name='page']/option/\@value")) { + if(-f ".".$node->value.$node_chap->value.".jpg") { + print ".".$node->value.$node_chap->value.".jpg already there\n"; + } else { + $page_url = $base_url.$node->value.$node_chap->value."/"; + #get image url + $content = get $page_url or die "cannot access ".$page_url; + $content =~ s/&/&/g; #clean up this junk... + $doc_page = $parser->parse_html_string($content, $parser->recover(2)); #we are lucky libxml can read html crap + $img_url = $doc_page->findvalue("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='one-page']/a/img[\@class='manga-page']/\@src"); + #download image + print "saving image $img_url\n"; + $content = get $img_url or die "cannot access ".$img_url; + open FILE, ">.".$node->value.$node_chap->value.".jpg"; + print FILE $content; + close FILE; + } + } +}