-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget-images.pl
executable file
·125 lines (107 loc) · 2.85 KB
/
get-images.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
#!/usr/bin/env perl
use 5.014;
use strict;
use warnings;
use URI;
use LWP::UserAgent;
use Web::Scraper;
use Data::Dumper;
# HTTP client configured with a desktop Firefox user-agent string.
my $browser = LWP::UserAgent->new(
    agent => join(
        q{},
        'Mozilla/5.0',
        ' (Windows; U; Windows NT 6.1; en-US; rv:1.9.2b1)',
        ' Gecko/20091014 Firefox/3.6b1 GTB5',
    ),
);

# Running counter used to number saved image files.
my $file_num = 0;

=pod
my ( $start_page_num, $last_page_num ) = @ARGV;
$start_page_num ||= 1;
$last_page_num ||= 1;
=cut

# Page scraper: collects the href attribute of each matched <a> element
# into the "link" array of the scrape result.
my $page_scrap = scraper {
    process( 'a', 'link[]' => '@href' );
};

# BBS scraper: collects the src attribute of each matched <img> element
# into the "imglink" array of the scrape result.
my $bbs_scrap = scraper {
    process( 'img', 'imglink[]' => '@src' );
};
# Number of the current polling pass; printed at the top of every pass.
my $page_count = 1;

# Endless polling loop: scrape the listing page, dump the links that were
# found, wait five seconds, and start over.
while (1) {
    #for my $current_page_num ( $start_page_num .. $last_page_num ) {
    print "current page : $page_count\n";

=pod
my $g_name = sprintf(
'http://comic.naver.com/webtoon/list.nhn?titleId=25455&weekday=tue&page=%s',
$current_page_num,
);
=cut

    my $g_name
        = 'http://comics.nate.com/webtoon/detail.php?btno=31337&category';
    my $links = get_image_links($g_name);

    # get_image_links() appends the last-round link and then the
    # first-round link as the final two elements; pop both off so that
    # only the plain link list remains for the dump below.
    my $first_round = pop @{$links};
    my $last_round  = pop @{$links};

    print Dumper \$links;

    #download($links);
    sleep 5;
    $page_count += 1;
}
# Scrape $url with the file-level $page_scrap scraper and collect the
# episode links found on the page.
#
# Parameters:
#   $url - address of the page to scrape.
#
# Returns an array reference containing every distinct href that matches
# /31337&bsno/, in the order the scraper returned them, followed by two
# extra trailing elements: the "last round" link and then the "first
# round" link (the greatest and the smallest matching link under string
# comparison).  Both trailing elements are empty strings when the scrape
# failed or no link matched, so callers can always pop two elements.
#
# Side effects: warns if the scrape dies, and prints a Data::Dumper dump
# of the result to STDOUT.
sub get_image_links {
    my $url = shift;

    my $response;
    eval { $response = $page_scrap->scrape( URI->new($url) ); };
    warn $@ if $@;

    # A failed scrape leaves $response undefined, and a page without
    # anchors may carry no "link" entry; treat both as an empty list.
    my @hrefs = $response ? @{ $response->{link} || [] } : ();

    my @links;
    my %seen;
    my $first_round;
    my $last_round;

    for my $link (@hrefs) {

        # Skip anchors that yielded no href at all.
        next unless defined $link;

        # Work on the plain string form from here on.
        my $href = "$link";
        next unless $href =~ /31337&bsno/;

        # De-duplicate with a hash instead of the experimental smartmatch
        # operator; a repeated link changes nothing.
        next if $seen{$href}++;

        push @links, $href;

        # NOTE(review): this ordering is lexical, so a link ending in
        # "bsno=9" would sort after one ending in "bsno=10".  If the value
        # after bsno is a plain number, compare numerically - TODO confirm.
        $last_round  = $href if !defined $last_round  || $last_round le $href;
        $first_round = $href if !defined $first_round || $first_round ge $href;
    }

    # Trailing sentinels: last round first, then first round.
    push @links, $last_round // q{}, $first_round // q{};

    print Dumper \@links;
    return \@links;
}
=pod
# Disabled helper, kept inside POD so that it is not compiled.
#
# For each article URL in the array reference $links, scrape the page with
# the file-level $bbs_scrap scraper and save every image whose URL matches
# m|http://dcimg| as img_NNNN.jpg in the current directory.  Files are
# numbered with the file-level $file_num counter.  Failures are reported
# with warn and the affected article or image is skipped.
sub download {
    my $links = shift;

    # One client is enough for the whole run.
    my $ua = LWP::UserAgent->new();

    for my $article_link ( @{$links} ) {
        my $response;
        eval { $response = $bbs_scrap->scrape( URI->new($article_link) ); };
        if ($@) {
            warn $@;
            next;
        }

        for my $img_link ( @{ $response->{imglink} || [] } ) {
            next unless defined $img_link;
            next unless $img_link =~ m|http://dcimg|;
            print $img_link . "\n";

            my $res;
            eval { $res = $ua->get($img_link); };
            if ($@) {
                warn $@;
                next;
            }

            # get() reports HTTP failures through the response object
            # instead of dying, so check it before writing to disk.
            if ( !$res->is_success ) {
                warn "failed to fetch $img_link: " . $res->status_line . "\n";
                next;
            }

            my $file = sprintf 'img_%04d.jpg', ++$file_num;
            my $fh;
            if ( !open $fh, '>', $file ) {
                warn "cannot open $file: $!\n";
                next;
            }
            binmode $fh;
            print {$fh} $res->content;

            # Buffered write errors only surface when the handle is closed.
            close $fh or warn "cannot close $file: $!\n";
        }
    }
}
=cut