-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler
executable file
·113 lines (96 loc) · 3.69 KB
/
crawler
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#!/usr/bin/env php
<?php
require 'vendor/autoload.php';
use \Us\Crawler\Storage\RDBStorage;
use \Us\Crawler\Storage\DummyStorage;
use \Us\Crawler\Engine\PttCrawler;
// ---------------------------------------------------------------------------
// Command-line parsing.
// Only long options are accepted. In $longopts a trailing ":" means the option
// requires a value (--name=value); "debug" and "help" are plain flags.
// ---------------------------------------------------------------------------
$board_name = null;
$shortopts = "";
$longopts = array(
    "board:",
    "sleep-between-list:",
    "sleep-between-article:",
    "sleep-between-retry:",
    "timeout:",
    "start-page:",
    "start-date:",
    "stop-date:",
    "stop-on-duplicate:",
    "storage:",
    "db-username:",
    "db-password:",
    "db-host:",
    "db-port:",
    "debug",
    "help"
);
$options = getopt($shortopts, $longopts);
if ($options === false) {
    // getopt() signals failure with false instead of an array. Treat that as
    // "nothing was given" so the usage text below is printed rather than
    // passing a non-array to array_key_exists().
    $options = array();
}

// Print the usage text (to stderr, exit status 1) when --help is given or the
// mandatory --board option is missing.
if (isset($options['help']) || !array_key_exists('board', $options)) {
    $fe = fopen('php://stderr', 'w');
    $help_msg = <<<EOF
Usage: php crawler.php --board=<board name> {options}
--board : Board (forum) name. (required, case sensitive)
--sleep-between-list=INT : Seconds to sleep between fetching different index pages. (default: 2)
--sleep-between-article=INT : Seconds to sleep between fetching articles. (default: 2)
--sleep-between-retry=INT : Seconds to sleep when error occurrs. (default: 2)
--timeout=INT : Seconds of the http timeout. (default: 10)
--start-page=INT : On which page the crawler should start crawling (default: latest page)
--start-date=DATE : On which date the crawler should start crawling (format: YYYY-MM-DD, ex. 2014-11-30) (default, {today})
--stop-date=DATE : Stop the program when articles older than the specific date. (format: YYYY-MM-DD, ex. 2014-11-30) (default: {today})
--stop-on-duplicate=BOOL : Stop crawling when articles are duplicated. (default: true)
--storage=STRING : Available storage: "dummy" and "rdb"
--db-username=STRING : Database username (required when using '--storage=rdb')
--db-password=STRING : Database password
--db-host=STRING : Database host
--db-port=INT : Database connect port
--help : Show this dialog
--debug : Enable debug (not implement yet)
EOF;
    fwrite($fe, "$help_msg\n");
    fclose($fe);
    exit(1);
}
// board name (presence of --board was verified before reaching this point)
$board_name = $options['board'];

// ---------------------------------------------------------------------------
// Storage backend selection.
// "dummy" is used unless --storage names another backend. "rdb" builds an
// RDBStorage from the --db-* options.
// ---------------------------------------------------------------------------
$storage = 'dummy';
$Db = null;
if (array_key_exists('storage', $options)) {
    $storage = $options['storage'];
}
switch ($storage) {
    case 'rdb':
        // --db-username has no default. Fail with a clear message instead of
        // reading an undefined index and handing null to RDBStorage.
        if (!array_key_exists('db-username', $options)) {
            $fe = fopen('php://stderr', 'w');
            fwrite($fe, "--db-username is required when using --storage=rdb\n");
            fclose($fe);
            exit(1);
        }
        $db_username = $options['db-username'];

        // The remaining connection settings are optional and fall back to
        // an empty password, 127.0.0.1 and port 3306.
        $db_password = '';
        if (array_key_exists('db-password', $options)) {
            $db_password = $options['db-password'];
        }
        $db_host = '127.0.0.1';
        if (array_key_exists('db-host', $options)) {
            $db_host = $options['db-host'];
        }
        $db_port = '3306';
        if (array_key_exists('db-port', $options)) {
            $db_port = $options['db-port'];
        }
        $Db = new RDBStorage($db_username, $db_password, $db_host, $db_port);
        break;
    case 'dummy':
        $Db = new DummyStorage();
        break;
    default:
        // Keep the historical fallback to DummyStorage for unrecognised
        // values, but say so: a mistyped --storage would otherwise go
        // unnoticed.
        $fe = fopen('php://stderr', 'w');
        fwrite($fe, "Warning: unknown --storage value, falling back to \"dummy\".\n");
        fclose($fe);
        $Db = new DummyStorage();
}
// ---------------------------------------------------------------------------
// Build the crawler for the selected board/storage, forward the command-line
// settings to it, run it and use its return value as the process exit status.
// ---------------------------------------------------------------------------
$PttCrawler = new PttCrawler($Db, $board_name);

// Interpret --stop-on-duplicate as a real boolean. A plain truthiness test
// would treat the strings "false", "no" and "off" as true. Values that
// FILTER_VALIDATE_BOOLEAN does not recognise keep the previous truthiness
// semantics so existing invocations behave the same.
// NOTE(review): when the option is omitted this script passes false, while the
// usage text advertises "default: true" -- confirm the intended default
// against PttCrawler::setConfig().
$stop_on_duplicate = false;
if (isset($options['stop-on-duplicate'])) {
    $raw_flag = $options['stop-on-duplicate'];
    $parsed_flag = filter_var($raw_flag, FILTER_VALIDATE_BOOLEAN, FILTER_NULL_ON_FAILURE);
    $stop_on_duplicate = ($parsed_flag === null) ? (bool) $raw_flag : $parsed_flag;
}

// Options that were not supplied are passed as null.
$PttCrawler->setConfig(
    array(
        "list_sleep" => isset($options['sleep-between-list']) ? $options['sleep-between-list'] : null,
        "article_sleep" => isset($options['sleep-between-article']) ? $options['sleep-between-article'] : null,
        "error_sleep" => isset($options['sleep-between-retry']) ? $options['sleep-between-retry'] : null,
        "timeout" => isset($options['timeout']) ? $options['timeout'] : null,
        "start-date" => isset($options['start-date']) ? $options['start-date'] : null,
        "stop-date" => isset($options['stop-date']) ? $options['stop-date'] : null,
        "stop-on-duplicate" => $stop_on_duplicate,
        "start-page" => isset($options['start-page']) ? $options['start-page'] : null,
    )
);
$return_code = $PttCrawler->run();
exit($return_code);