This repository has been archived by the owner on Apr 25, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathcrawler-callback.js
116 lines (93 loc) · 2.59 KB
/
crawler-callback.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
const http = require("http")
const fs = require("fs")
const cheerio = require("cheerio")
const iconv = require('iconv-lite')
// Site origin that gets prepended to the image paths scraped from each page.
// (The identifier's spelling is kept because getPicsUrl reads it under this name.)
const domian = 'http://www.3wtu.com'

// Crawler settings:
//   dirPath  - directory, next to this script, that receives the downloaded images
//   interval - spacing in milliseconds between the scheduled start times of
//              consecutive page requests (before random jitter is added)
const config = {
  dirPath: `${__dirname}/imagesByCallback/`,
  interval: 300,
}

// Create the output directory up front when it is not there yet.
if (isExit(config.dirPath) === false) {
  fs.mkdirSync(config.dirPath)
}
// Page URLs look like http://www.3wtu.com/picture/10.html; pages 10 through 182 are crawled.
// `let` gives every iteration its own `index` binding, so each delayed callback
// works with the page number it was scheduled for.
for (let index = 10; index < 183; index++) {
  // Stagger the requests: one `config.interval` step per page plus up to 100 ms of jitter.
  const delay = (index - 10) * config.interval + Math.random() * 100
  const pageUrl = 'http://www.3wtu.com/picture/' + index + '.html'

  // Pipeline for a single page: scrape the image link -> fetch the image data -> write the file.
  const crawlPage = () => {
    getPicsUrl(pageUrl, (picLink) => {
      getPicData(picLink, (picData) => {
        download(picData.data, picData.name, (err) => {
          if (err) {
            console.log(err)
            return
          }
          console.log(picData.name + ' downloaded successfully')
        })
      })
    })
  }

  setTimeout(crawlPage, delay)
}
// Fetch one gallery page and extract the image link and the image title from it.
//
// url      - address of the HTML page to scrape
// callback - called once with { url, name } on success: `url` is the site origin
//            followed by the src of the `.detailed-pic img` element, `name` is the
//            inner HTML of the `.detailed-title h4` element.
//            It is NOT called when the request fails, the server answers with a
//            non-200 status, or the page lacks the expected elements; those
//            cases are only logged.
function getPicsUrl(url, callback) {
  function logRequestFailure(e) {
    console.log('请求失败:')
    console.log(e)
  }

  http.get(url, function (res) {
    // Error pages and redirects do not carry the picture markup, so skip them
    // instead of scraping a page that cannot contain an image.
    if (res.statusCode !== 200) {
      console.log('Unexpected status ' + res.statusCode + ' for ' + url)
      res.resume() // discard the body so the connection is released
      return
    }

    // Keep the raw Buffers: the page is decoded as gb2312 below, so chunks must not
    // be concatenated as strings first (`chunks += chunk` would corrupt the text).
    // See https://github.com/ashtuchkin/iconv-lite/wiki/Use-Buffers-when-decoding
    var chunks = []
    res.on('data', function (chunk) {
      chunks.push(chunk)
    })
    res.on('error', logRequestFailure)
    res.on('end', function () {
      // HTML converted from the page's encoding
      var decodedBody = iconv.decode(Buffer.concat(chunks), 'gb2312')
      // Server-side, jQuery-like document handle
      var $ = cheerio.load(decodedBody, { decodeEntities: false })
      var src = $('.detailed-pic img').attr('src')
      var name = $('.detailed-title h4').html()

      // Without this guard a missing element would produce a bogus link such as
      // "<origin>undefined", which would then be requested and saved.
      if (!src || name == null) {
        console.log('No picture found on ' + url)
        return
      }

      // Absolute address of the image plus its title
      callback({ url: domian + src, name: name })
    })
  })
  .on('error', logRequestFailure)
}
// Fetch the content of one image and work out the local path it should be saved under.
// (Nothing is written to disk here; the caller does that with the result.)
//
// pic      - { url, name }: address of the image and its title
// callback - called once with { name, data } on success: `name` is the destination
//            path inside config.dirPath, `data` is the response body as a
//            'binary'-encoded string (write it with the 'binary' encoding).
//            It is NOT called when the request fails or the server answers with a
//            non-200 status; those cases are only logged.
function getPicData(pic, callback) {
  // File extension: text after the last dot, ignoring any query string or fragment
  var fileType = toSafeFileNamePart(pic.url.split(/[?#]/)[0].split('.').pop())
  // Last digits of the millisecond timestamp (3 for a 13-digit value), to make
  // name clashes less likely
  var diff = new Date().getTime().toString().substring(10)
  // The title is scraped from a remote page, so it is cleaned before being used
  // as part of a path
  var name = config.dirPath + toSafeFileNamePart(pic.name) + '#' + diff + '.' + fileType

  // Request the image data
  http.get(pic.url, function (res) {
    // Do not save an error page or a redirect body as if it were the image.
    if (res.statusCode !== 200) {
      console.log('Unexpected status ' + res.statusCode + ' for ' + pic.url)
      res.resume() // discard the body so the connection is released
      return
    }

    var data = ''
    // 'binary' maps every byte to one character, so string concatenation is lossless
    res.setEncoding('binary')
    res.on('data', function (chunk) {
      data += chunk
    })
    res.on('error', function (err) {
      console.log(err)
    })
    res.on('end', function () {
      callback({ name: name, data: data })
    })
  })
  .on('error', function (err) {
    console.log(err)
  })
}

// Turn arbitrary text into something usable as a single file-name component:
// surrounding whitespace is dropped; path separators, characters that are
// reserved in file names on common platforms, and control characters become '_'.
function toSafeFileNamePart(text) {
  return String(text).trim().replace(/[\\/:*?"<>|\x00-\x1f]/g, '_')
}
// Write image data to disk.
//
// data     - content to write; a string is interpreted with the 'binary' encoding
// name     - destination path
// callback - handed straight to fs.writeFile; receives the write error, if any
function download(data, name, callback) {
  var writeOptions = { encoding: 'binary' }
  fs.writeFile(name, data, writeOptions, callback)
}
// Synchronously report whether a file or directory is accessible at `path`.
// Returns true when fs.accessSync succeeds and false when it throws for any reason.
function isExit(path) {
  var accessible = true
  try {
    fs.accessSync(path)
  } catch (accessError) {
    accessible = false
  }
  return accessible
}