initial commit
This commit is contained in:
commit
f142c0068a
4 changed files with 138 additions and 0 deletions
25
explosm.pl
Executable file
25
explosm.pl
Executable file
|
@ -0,0 +1,25 @@
|
||||||
|
#!/usr/bin/perl
# Scrape Cyanide & Happiness comics from explosm.net.
# Starting at site comic id 942, follow each page's "Next >" link and save
# every strip image as explosm/<counter>-<date>-<author>.
use strict;
use warnings;

use LWP::Simple;
use File::Path;

my $base_url = "http://www.explosm.net/comics/";

my $next = 942;   # site comic id to fetch next; loop stops when it is false
my $i    = 802;   # local sequence counter used in the saved filename

mkpath "explosm";

$| = 1;           # unbuffer STDOUT once, so progress shows immediately

while ($next) {
    my $page = get($base_url . $next) or die "cannot access " . $base_url . $next;

    # Extract date, author, the next comic's id and the image URL from the markup.
    my ($date, $author, $img);
    ($date, $author, $next, $img) = $page =~ /<nobr>(\d\d.\d\d.\d\d\d\d) <b>by (?:<a href=\"http:\/\/www.explosm.net\/comics\/author\/[^\"]+\/\">)?([^<]+)(?:<\/a>)?<\/b>.*?<\/nobr>.+?<a href=\"\/comics\/(\d+)\/\">Next ><\/a>.+?<img alt=\"Cyanide and Happiness, a daily webcomic\" src=\"(http:\/\/(?:www.)?explosm.net\/db\/files\/[^\"]+)\">/;
    defined $img or die "page layout changed, cannot parse comic page at " . $base_url;

    print $i, " ", $next, " ", $date, "\n";

    my $image = get($img) or die "cannot access " . $img;

    # Write the raw image bytes; binmode keeps them intact on all platforms.
    open my $fh, '>', "explosm/$i-$date-$author"
        or die "cannot write explosm/$i-$date-$author: $!";
    binmode $fh;
    print {$fh} $image;
    close $fh or die "close explosm/$i-$date-$author: $!";

    $i++;
}
|
||||||
|
|
28
omdl.py
Normal file
28
omdl.py
Normal file
|
@ -0,0 +1,28 @@
|
||||||
|
from urllib import urlretrieve
|
||||||
|
import urllib2
|
||||||
|
from lxml import etree
|
||||||
|
import os
|
||||||
|
from sys import argv
|
||||||
|
|
||||||
|
base_url = "http://www.onemanga.com"
|
||||||
|
|
||||||
|
manga = argv[1]
|
||||||
|
|
||||||
|
chapter = argv[2]
|
||||||
|
|
||||||
|
path = "%s/%s"%(manga, chapter)
|
||||||
|
|
||||||
|
if not os.path.exists(path):
|
||||||
|
os.makedirs(path)
|
||||||
|
|
||||||
|
f = urllib2.urlopen("%s/%s/%s/"%(base_url, manga, chapter))
|
||||||
|
tree = etree.parse(f, etree.HTMLParser())
|
||||||
|
|
||||||
|
firstpage = "%s%s"%(base_url, tree.find(".//div[@id='chapter-cover']//ul/li/a").attrib['href'])
|
||||||
|
|
||||||
|
f = urllib2.urlopen(firstpage)
|
||||||
|
tree = etree.parse(f, etree.HTMLParser())
|
||||||
|
for page in [p.attrib['value'] for p in tree.findall(".//select[@id='id_page_select']/option")]:
|
||||||
|
print manga, chapter, page
|
||||||
|
tree2 = etree.parse(urllib2.urlopen("%s/%s/%s/%s"%(base_url, manga, chapter, page)), etree.HTMLParser())
|
||||||
|
urlretrieve(tree2.find(".//div[@class='one-page']//img[@class='manga-page']").attrib['src'], "%s/%s.jpg"%(path,page))
|
29
oots.pl
Executable file
29
oots.pl
Executable file
|
@ -0,0 +1,29 @@
|
||||||
|
#!/usr/bin/perl
# Scrape "The Order of the Stick" comics from giantitp.com.
# Starting at comic 590, download each strip to oots/<n>.gif until the
# page's "next" navigation link points nowhere (href "#").
use strict;
use warnings;

use LWP::Simple;
use XML::LibXML;
use File::Path;

my $base_url = "http://www.giantitp.com";

my $next = 1;     # loop flag: cleared once the "next" link is a dead "#" anchor
my $i    = 590;   # comic number to fetch, also used as the output filename

mkpath "oots";

while ($next) {
    my $page_url = sprintf "%s/comics/oots%04d.html", $base_url, $i;
    my $html = get($page_url) or die "cannot access " . $page_url;

    # libxml's HTML parser copes with the site's non-wellformed markup.
    my $parser = XML::LibXML->new();
    my $doc    = $parser->parse_html_string($html);

    # Fixed: the original appended "\n" to the image path, corrupting the URL.
    my $img = $doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[2]/td/img/\@src");
    my $image = get($base_url . $img) or die "cannot access " . $base_url . $img;

    open my $fh, '>', "oots/$i.gif" or die "cannot write oots/$i.gif: $!";
    binmode $fh;
    print {$fh} $image;
    close $fh or die "close oots/$i.gif: $!";

    # The 6th navigation anchor is the "next" arrow; href "#" marks the
    # latest published comic, so stop after this one.
    if ($doc->findvalue("//body/table/tr/td/table/tr/td/table/tr/td/table/tr[1]/td/table/tr/td/a[6]/\@href") eq "#") {
        $next = 0;
    }

    $i++;
}
|
||||||
|
|
56
scraper.pl
Executable file
56
scraper.pl
Executable file
|
@ -0,0 +1,56 @@
|
||||||
|
#!/usr/bin/perl
# Scrape every chapter of a manga from onemanga.com.
# Usage: ./scraper.pl Fairy_Tail
# Pages are written beneath the current directory, mirroring the site's
# URL layout: ./<chapter-path>/<page>.jpg. Already-downloaded pages are
# skipped, so the script can resume an interrupted run.
use strict;
use warnings;

use LWP::Simple;
use XML::LibXML;
use File::Path;

my $base_url = "http://www.onemanga.com";

my $url = $ARGV[0];
if (not $url) {
    print "usage: ./scraper.pl Fairy_Tail\n";
    exit;
}

# One shared parser; recover mode lets libxml read the site's broken HTML.
my $parser = XML::LibXML->new();
$parser->recover(2);

# Fetch a page, normalize entities, and return it as a parsed document.
sub fetch_doc {
    my ($page_url) = @_;
    my $content = get($page_url) or die "cannot access " . $page_url;
    # NOTE(review): this line was garbled to the no-op "s/&/&/g" in the
    # available copy; decoding &amp; entities is the plausible intent — confirm.
    $content =~ s/&amp;/&/g;
    return $parser->parse_html_string($content);
}

my $doc = fetch_doc($base_url . "/" . $url);

# Iterate through all chapters, oldest first (the site lists newest first).
my @chapters = reverse $doc->findnodes("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content-main']/table[\@class='ch-table']/tr/td[\@class='ch-subject']/a/\@href");

foreach my $node (@chapters) {
    print "doing ", $node->value, "\n";
    mkpath "." . $node->value;

    # The chapter index page links to the chapter's first real page.
    my $doc_chap = fetch_doc($base_url . $node->value);
    my $chapter1_url = $base_url . $doc_chap->findvalue("//body/div[\@id='wrap']/div[\@id='content']/div[\@id='content2']/div[\@id='chapter-cover']/ul/li[1]/a/\@href");

    # The first page lists every page of the chapter in a <select>.
    $doc_chap = fetch_doc($chapter1_url);

    foreach my $node_chap ($doc_chap->findnodes("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='chapter-navigation']/select[\@name='page']/option/\@value")) {
        my $file = "." . $node->value . $node_chap->value . ".jpg";
        if (-f $file) {
            # Resume support: skip pages already on disk.
            print $file, " already there\n";
            next;
        }

        # Fetch the page and locate the strip image URL.
        my $page_url = $base_url . $node->value . $node_chap->value . "/";
        my $doc_page = fetch_doc($page_url);
        my $img_url  = $doc_page->findvalue("//body/div[\@id='wrap2']/div[\@id='content']/div[\@id='content2']/div[\@class='one-page']/a/img[\@class='manga-page']/\@src");

        print "saving image $img_url\n";
        my $image = get($img_url) or die "cannot access " . $img_url;

        open my $fh, '>', $file or die "cannot write $file: $!";
        binmode $fh;
        print {$fh} $image;
        close $fh or die "close $file: $!";
    }
}
|
Loading…
Reference in a new issue