-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathProImage.php
349 lines (291 loc) · 13.2 KB
/
ProImage.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
<?php
require_once 'YqBase.php';
/**
 * Removes "noise" from a piece of text so that only meaningful characters remain.
 *
 * Every occurrence of the following is deleted: a fixed set of ASCII and CJK
 * punctuation marks, the space character, all ASCII letters and digits, the
 * percent sign, a list of common Chinese function characters/words, and
 * newline and tab characters.
 *
 * The needles are applied one after another in the order listed below, so a
 * multi-character needle (e.g. "多少") also matches when its characters only
 * become adjacent after an earlier needle has been removed. Repeated entries
 * are intentional: they keep the sequence of replacements exactly as it has
 * always been.
 *
 * @param string $title Text to clean.
 * @return string The text with all listed characters removed.
 */
function clear_unmeaningful_char($title){
    static $noise = null;

    if ($noise === null) {
        $noise = array_merge(
            // Punctuation, brackets, operators and the space character.
            array(
                "·", "?", "?", "!", "!", ",", ",", "。", ".", "、",
                "“", "”", "\"", "/", "&", "=", ";", ";", "_", "-",
                "……", "——", "|", "【", "】", "《", "》",
                "(", ")", "(", ")", "「", "」", "<", ">",
                ":", ":", "-", "+", " ",
            ),
            // ASCII letters and digits, one needle per character.
            str_split('abcdefghijklmnopqrstuvwxyz'),
            str_split('ABCDEFGHIJKLMNOPQRSTUVWXYZ'),
            str_split('0123456789'),
            // Percent sign, Chinese function characters/words, line breaks and tabs.
            array(
                "%",
                "的", "了", "和", "与", "或", "于", "这", "那", "你", "我",
                "们", "是", "不", "在", "再", "就", "为", "吗", "啊", "哪",
                "要", "么", "什", "怎", "还", "谁", "没", "有", "年", "月",
                "日", "啥", "又", "只", "为", "以", "够", "更", "给", "但",
                "而", "千", "万", "亿", "百", "元", "很", "到", "无",
                "多少", "如何",
                "\n", "\t",
            )
        );
    }

    return str_replace($noise, "", $title);
}
// ---------------------------------------------------------------------------
// ProImage batch job.
//
// For every Proseed document written during the last 24 hours that still has
// an empty seed_text and the status 'uncompleted', this script downloads
// seed_link, converts the page to UTF-8, cuts out the article body with the
// markers stored on the matching Prosource document, picks the first image
// link, runs the industry parser and saves the enriched seed.
// ---------------------------------------------------------------------------

/**
 * Downloads a page over HTTP(S).
 *
 * @param string $url Address to fetch.
 * @return string|false Response body, or false when the transfer failed, the
 *                      body is empty, or the final status is not 2xx.
 */
function proimage_fetch_html($url)
{
    $ch = curl_init($url);
    if ($ch === false) {
        return false;
    }
    curl_setopt($ch, CURLOPT_RETURNTRANSFER, true);
    // Follow redirects so that a moved article is still fetched instead of
    // processing the (empty) 301/302 response.
    curl_setopt($ch, CURLOPT_FOLLOWLOCATION, true);
    curl_setopt($ch, CURLOPT_MAXREDIRS, 5);
    // One hanging server must not eat the whole execution-time budget.
    curl_setopt($ch, CURLOPT_CONNECTTIMEOUT, 10);
    curl_setopt($ch, CURLOPT_TIMEOUT, 30);

    $body = curl_exec($ch);
    $status = (int) curl_getinfo($ch, CURLINFO_HTTP_CODE);
    curl_close($ch);

    if ($body === false || $body === '' || $status < 200 || $status >= 300) {
        return false;
    }
    return $body;
}

/**
 * Converts a downloaded page to UTF-8 when it is detected as another encoding.
 *
 * After a successful conversion, encoding="..." declarations naming one of
 * the handled charsets are rewritten to utf-8 so they match the new content.
 *
 * @param string $html Raw page content.
 * @return string UTF-8 content, or the input unchanged when detection or
 *                conversion fails.
 */
function proimage_to_utf8($html)
{
    $encoding = mb_detect_encoding($html, array('ASCII', 'UTF-8', 'GB2312', 'GBK', 'EUC-CN', 'CP936'));
    if ($encoding === false || $encoding === 'UTF-8') {
        return $html;
    }

    $converted = iconv($encoding, 'UTF-8//IGNORE', $html);
    if ($converted === false) {
        return $html;
    }

    $rewritten = preg_replace('/encoding="(?:gb2312|ascii|gbk|euc-cn|cp936)"/i', 'encoding="utf-8"', $converted);
    return $rewritten === null ? $converted : $rewritten;
}

/**
 * Returns the part of $haystack that starts at $openMarker (inclusive) and
 * ends right before the first $closeMarker found after it.
 *
 * Offsets are byte offsets throughout (strpos/substr), so multibyte text is
 * cut at the right place. A marker that is null, empty or not found is
 * ignored: the cut then starts at the beginning / runs to the end.
 *
 * @param string      $haystack
 * @param string|null $openMarker
 * @param string|null $closeMarker
 * @return string
 */
function proimage_cut_between($haystack, $openMarker, $closeMarker)
{
    $start = 0;
    if (is_string($openMarker) && $openMarker !== '') {
        $found = strpos($haystack, $openMarker);
        if ($found !== false) {
            $start = $found;
        }
    }

    $end = strlen($haystack);
    if (is_string($closeMarker) && $closeMarker !== '') {
        $found = strpos($haystack, $closeMarker, $start);
        if ($found !== false) {
            $end = $found;
        }
    }

    return (string) substr($haystack, $start, $end - $start);
}

/**
 * Removes <script>...</script>, <iframe>...</iframe> and <link ...> elements.
 *
 * @param string $html
 * @return string The cleaned markup, or the input unchanged if PCRE fails.
 */
function proimage_strip_embedded_tags($html)
{
    // Explicit '#' delimiters: with "<...>" as the pattern string PCRE treats
    // the angle brackets as delimiters and never matches them in the text.
    $patterns = array(
        '#<script\b.*?</script\s*>#is',
        '#<iframe\b.*?</iframe\s*>#is',
        '#<link\b[^>]*>#i',
    );
    $stripped = preg_replace($patterns, '', $html);
    return $stripped === null ? $html : $stripped;
}

/**
 * Turns an image link taken from the article into the link that is stored.
 *
 * @param string $link   Value of the src / data-url attribute ('' if none).
 * @param array  $source Prosource document (source_homeURL, source_name are read).
 * @return string
 */
function proimage_normalize_image_link($link, $source)
{
    $home = isset($source['source_homeURL']) ? $source['source_homeURL'] : '';

    // Only links that are not already absolute get the home URL prepended.
    if ($link !== '' && !preg_match('#^(?:https?://|data:)#i', $link)) {
        if (strncmp($link, '//', 2) === 0) {
            // Protocol-relative link: borrow the scheme of the source site.
            $scheme = parse_url($home, PHP_URL_SCHEME);
            $link = ($scheme ? $scheme : 'http') . ':' . $link;
        } else {
            $link = $home . $link;
        }
    }

    // Site-specific clean-up of relative path segments.
    if (isset($source['source_name']) && $source['source_name'] == '趋势网') {
        $link = str_replace("uploads/../../", "", $link);
    } else {
        $link = str_replace("../", "", $link);
    }
    return $link;
}

/**
 * Fetches and parses the article behind a seed and returns the completed seed.
 *
 * @param object $sources Prosource collection (findOne() is called on it).
 * @param array  $seed    Proseed document.
 * @return array|false The seed with text, image and industry fields filled in
 *                     and status 'completed', or false when the source
 *                     document or the page could not be obtained.
 */
function proimage_complete_seed($sources, array $seed)
{
    if (empty($seed['seed_link']) || !isset($seed['seed_sourceID'])) {
        return false;
    }

    $source = $sources->findOne(array('_id' => new MongoId($seed['seed_sourceID'])));
    if (!is_array($source)) {
        return false;
    }

    $html = proimage_fetch_html($seed['seed_link']);
    if ($html === false) {
        return false;
    }

    $html = proimage_to_utf8($html);
    $flattened = preg_replace('/[\t\n\r]+/', '', $html);
    if ($flattened !== null) {
        $html = $flattened;
    }

    // First cut: the article container; second cut: the optional inner text markers.
    $markers = (isset($source['source_tag']) && is_array($source['source_tag'])) ? $source['source_tag'] : array();
    $article = proimage_cut_between(
        $html,
        isset($markers[0]) ? $markers[0] : null,
        isset($markers[1]) ? $markers[1] : null
    );
    $article = proimage_cut_between(
        $article,
        isset($source['text_startingTag']) ? $source['text_startingTag'] : null,
        isset($source['text_closingTag']) ? $source['text_closingTag'] : null
    );

    // Crude removal of inline sizing/styling keywords from the markup.
    $text = str_replace(array("style=", "width", "height", "font-size"), "", $article);
    $text = proimage_strip_embedded_tags($text);

    // The text is already UTF-8 here; this pass only drops invalid byte
    // sequences. Converting from the detected page encoding a second time
    // would garble pages that were transcoded above.
    $valid = iconv('UTF-8', 'UTF-8//IGNORE', $text);
    if ($valid !== false) {
        $text = $valid;
    }

    $textLen = mb_strlen(clear_unmeaningful_char($text), 'utf-8');

    // Parse industries
    $protext = new Protext;
    $titleLower = isset($seed['seed_titleLower']) ? strtolower($seed['seed_titleLower']) : '';
    $parserResult = $protext->parseIndustry($text, $titleLower);

    $imgMatches = array();
    $imgCount = (int) preg_match_all('#<img\b[^>]*?(?:src|data-url)="([^"]*)"[^>]*>#i', $text, $imgMatches);
    $imageLink = $imgCount > 0 ? $imgMatches[1][0] : '';
    $imageLink = proimage_normalize_image_link($imageLink, $source);

    if ($imgCount > 5) {
        $seed['seed_title'] = $seed['seed_title'].'(多图)';
    }

    $seed['seed_text'] = $text;
    $seed['seed_textLen'] = $textLen;
    $seed['seed_imageLink'] = $imageLink;
    $seed['seed_imageCount'] = $imgCount;
    $seed['seed_completeStatus'] = 'completed';
    $seed['seed_textIndustryWords'] = isset($parserResult['seed_textIndustryWords'])
        ? $parserResult['seed_textIndustryWords']
        : null;

    if (!isset($seed['seed_industry']) || !is_array($seed['seed_industry'])) {
        $seed['seed_industry'] = array();
    }
    $parsedIndustries = (isset($parserResult['seed_industryParsed']) && is_array($parserResult['seed_industryParsed']))
        ? $parserResult['seed_industryParsed']
        : array();
    foreach ($parsedIndustries as $industry) {
        if (!in_array($industry, $seed['seed_industry'])) {
            array_push($seed['seed_industry'], $industry);
        }
        if (!isset($seed['seed_industryHotness'][$industry])) {
            $seed['seed_industryHotness'][$industry] = 0;
        }
    }

    return $seed;
}

ini_set("max_execution_time", 2400);

// NOTE(review): connection settings and credentials are hard-coded in source;
// consider loading them from configuration kept outside version control.
$dbname = 'yiquan';
$host = 'localhost';
$port = '27017';
$user = 'test';
$pwd = 'yiquanTodo';
$mongoClient = new MongoClient("mongodb://{$host}:{$port}", array(
    'username' => $user,
    'password' => $pwd,
    'db' => $dbname
));
$db = $mongoClient->yiquan;
$prosource = $db->Prosource;

// Find all news items from the last 24 hours that have no body text yet.
$uncompleteSeeds = $db->Proseed->find(array(
    'seed_dbWriteTime' => array('$gt' => (time() - 86400)),
    'seed_text' => '',
    'seed_completeStatus' => 'uncompleted'
));

foreach ($uncompleteSeeds as $seed) {
    // Mark the seed as taken before the slow download starts.
    $seed['seed_completeStatus'] = 'inProcess';
    $db->Proseed->save($seed);

    try {
        $completed = proimage_complete_seed($prosource, $seed);
        if ($completed !== false) {
            echo $completed['seed_textLen'];
            $db->Proseed->save($completed);
            echo $completed['seed_source'].','.$completed['seed_title'].','.$completed['seed_imageLink'];
            continue;
        }
    } catch (Exception $e) {
        // Fall through to the reset below so the seed is not left 'inProcess'.
        error_log('ProImage: seed could not be completed: ' . $e->getMessage());
    }

    // Hand the seed back so a later run can try again.
    $seed['seed_completeStatus'] = 'uncompleted';
    $db->Proseed->save($seed);
}
?>