-
Notifications
You must be signed in to change notification settings - Fork 32
/
Copy pathextract_functions.py
351 lines (301 loc) · 19.7 KB
/
extract_functions.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
#coding=utf-8
import os
import time
import re
from collections import Counter
def puncfilter(line):
    """Return *line* with punctuation/symbol characters and backslashes removed.

    Every run of characters listed in the character class below is deleted.
    The class does not contain the backslash itself (``\\]`` only escapes the
    closing bracket), so backslashes are stripped in a second pass.
    """
    punct_run = re.compile(u'[’!"#$%&\'()*+,-./:;<=>?@;;:.|~\≧▽—°❄×🍀🐾🍓🐋▲♥♀☀●巜「」☕/↓→<=>?@⁄•ω★💊🙈☕💰😂·、…★、…【】《》『』()?“”‘’![\\]^_`{|}~]+')
    without_punct = punct_run.sub('', line)
    return without_punct.replace('\\', '')
# iPhone model names, most specific first so that e.g. "iPhone 6s Plus" is
# matched before "iPhone 6s" and "iPhone 6"; the bare "iPhone" is the fallback.
_IPHONE_MODELS = (
    "iPhone 5s", "iPhone 5c", "iPhone 5", "iPhone 6 Plus", "iPhone 6s Plus",
    "iPhone 6s", "iPhone 6", "iPhone 7 Plus", "iPhone 7", "iPhone SE",
    "iPhone",
)

# Ordered (marker, replacement prefix, keep_tail) rules; the first marker found
# in the platform string wins. When keep_tail is true the text that follows
# the marker is appended to the replacement prefix.
_PLATFORM_RULES = (
    ('iOS', "iPhone", False),
    ('Android', "Android", True),
    ('iPad', "iPad", True),
    ('360手机', "360手机", False),
    ('魅族', "魅族", True),
    ('MEIZU', "魅族", True),
    ('魅蓝', "魅族 魅蓝", True),
    ('Galaxy', "三星 Galaxy", True),
    ('GALAXY', "三星 Galaxy", True),
    ('Samsung', "三星", True),
    ('360', "360", True),
    ('小米', "小米", True),
    ('红米', "小米 红米", True),
    ('xiaomi', "小米", True),
    ('荣耀', "华为荣耀", True),
    ('vivo', "vivo", True),
    ('HUAWEI', "华为", True),
    ('OnePlus', "一加", True),
    ('Smartisan', "锤子", True),
    ('坚果', "锤子 坚果", True),
    ('Xperia', "索尼 Xperia", True),
)


def platformUni(platform):
    """Normalise a client/device string to a unified brand + model name.

    Strings containing "iPhone" are reduced to the first matching entry of
    ``_IPHONE_MODELS``. Otherwise the first rule in ``_PLATFORM_RULES`` whose
    marker occurs in *platform* decides the result: the rule's prefix,
    optionally followed by ``platform.split(marker)[1]`` (the text between
    the first and any second occurrence of the marker). Strings that match
    no rule are returned unchanged.
    """
    if 'iPhone' in platform:
        for model in _IPHONE_MODELS:
            if model in platform:
                return model

    for marker, prefix, keep_tail in _PLATFORM_RULES:
        if marker not in platform:
            continue
        if not keep_tail:
            return prefix
        # Split on the marker that actually matched. The original 'xiaomi'
        # branch split on '小米' instead, which always raised IndexError.
        return prefix + platform.split(marker)[1]

    return platform
def platformSimp(platform):
    """Collapse a platform string to the first known brand it mentions.

    The brands are tested in the order listed; if none occurs in *platform*
    the input is returned unchanged.
    """
    known_brands = ("iPhone", "iPad", "秒拍", "三星", "华为", "小米", "OPPO",
                    "vivo", "魅族", "索尼", "锤子", "一加", "Android")
    return next((brand for brand in known_brands if brand in platform),
                platform)
# print platformUni(p1)
# print platformSimp(p1)
# print platformSimp(p4)
def removepeople(peopleline):
    """Drop everything up to the last ":" in each "//@"-separated segment.

    The line is split on "//@"; from every segment only the text after its
    final ":" is kept (the whole segment when it has no ":"), and the kept
    pieces are concatenated without a separator.
    Presumably this strips "//@user:" repost prefixes -- confirm with callers.
    """
    segments = peopleline.split("//@")
    return "".join(segment.split(":")[-1] for segment in segments)
# print str(removepeople(line1)).decode('string_escape')
# Compiled once at import time. Matches http:// and https:// links followed by
# any run of the characters a-z, A-Z, 0-9 and . ? / & = :
_URL_PATTERN = re.compile(r'https?://[a-zA-Z0-9.?/&=:]*')


def removeurl(urlline):
    """Return *urlline* with every http:// or https:// link removed.

    A link ends at the first character outside ``[a-zA-Z0-9.?/&=:]``, so URLs
    containing other characters (e.g. "-", "_", "%") are only removed up to
    that character, exactly as before. The original pattern handled only
    "http://"; "https://" links are now stripped as well.
    """
    return _URL_PATTERN.sub("", urlline)
def removeEmoji(text):
    """Return *text* with runs of emoji/symbol code points deleted.

    Removed ranges: U+1F300-U+1F5FF, U+1F600-U+1F64F, U+1F680-U+1F6FF,
    U+2600-U+26FF and U+2700-U+27BF.
    """
    emoji_class = (
        u'['
        u'\U0001F300-\U0001F5FF'
        u'\U0001F600-\U0001F64F'
        u'\U0001F680-\U0001F6FF'
        u'\u2600-\u26FF\u2700-\u27BF]+'
    )
    return re.sub(emoji_class, r'', text, flags=re.UNICODE)
def get_files_from_folder(rootDir):
    """Walk *rootDir* recursively and print every file name found.

    Each name is printed twice: once bare and once prefixed with
    "like you:". Nothing is returned.
    """
    for _root, _dirs, names in os.walk(rootDir):
        for name in names:
            # Single-argument print(...) produces the same output on
            # Python 2 and Python 3.
            print(name)
            print("like you: %s" % name)
            # print(os.path.join(_root, name))
# NOTE(review): module-level call -- this runs every time the file is imported
# or executed and prints the names of all files found under
# ../WBTestdata/proxy (a relative path). Consider moving it under an
# `if __name__ == "__main__":` guard once importers are confirmed not to
# rely on it.
get_files_from_folder("../WBTestdata/proxy")
def creat_date_list(month, i, j):
    """Return ["<month>-DD", ...] for every day number in ``range(i, j)``.

    *month* is a string prefix; each day is zero-padded to two digits.
    *j* is exclusive.
    """
    return [month + "-" + str(day).zfill(2) for day in range(i, j)]
# print creat_date_list("04",01,15)
def process_time(input, starttime, year="2017"):
    """Expand a relative or partial timestamp to "YYYY-MM-DD HH:MM:SS".

    Three input shapes are handled:

    * contains "今天" ("today"), e.g. "今天 08:15": the date comes from
      *starttime* (local time) and the second space-separated token is used
      as HH:MM, with ":00" appended.
    * contains "分钟前" ("minutes ago"), e.g. "5分钟前": that many minutes
      are subtracted from *starttime*.
    * anything else is assumed to already be "MM-DD HH:MM" and is prefixed
      with *year* and suffixed with ":00".

    Args:
        input: the raw time text (name kept for caller compatibility even
            though it shadows the builtin).
        starttime: reference time as seconds since the epoch (number or
            numeric string).
        year: year used for the "MM-DD HH:MM" shape. Defaults to "2017",
            the value that used to be hard-coded.
    """
    if "今天" in input:
        day = time.strftime("%Y-%m-%d", time.localtime(float(starttime)))
        return day + " " + input.split(" ")[1] + ":00"

    if "分钟前" in input:
        # strip() removes the suffix characters, leaving the minute count.
        seconds_ago = 60 * float(input.strip("分钟前"))
        moment = time.localtime(float(starttime) - seconds_ago)
        return time.strftime("%Y-%m-%d %H:%M:%S", moment)

    return str(year) + "-" + input + ":00"
def generate_insert(month, i, j,
                    data_dir="C:/Users/kaidi/Documents/GitHub/WBDatabase/time/"):
    """Build one "load data local infile" SQL statement per day.

    For every day number in ``range(i, j)`` a statement is produced that
    loads ``<data_dir><month>-<DD>.txt`` into table ``wbdata`` with rows
    terminated by the literal text ``\\r\\n``.

    Args:
        month: month prefix used in the file name, e.g. "04".
        i: first day number (inclusive).
        j: last day number (exclusive).
        data_dir: directory prefix of the data files, including the trailing
            slash. Defaults to the path that used to be hard-coded.

    Returns:
        The list of SQL statements, in day order.
    """
    statements = []
    for day in range(i, j):
        date = month + "-" + str(day).zfill(2)
        statements.append(
            "load data local infile '" + data_dir + date
            + ".txt' into table wbdata lINES TERMINATED BY '" + r'\r\n' + "';"
        )
    return statements
def turn_tags_tostring(sql_result):
    """Flatten the first column of each row into one comma-terminated string.

    Each row's first field is split on single spaces; the first token is
    skipped and every remaining token is emitted followed by ",". The
    result therefore ends with a trailing comma whenever any token was
    emitted, and is "" otherwise.
    """
    return "".join(
        token + ","
        for row in sql_result
        for token in row[0].split(" ")[1:]
    )
def linear_scale(inputmin, inputmax, outputmin, outputmax, item):
    """Map *item* linearly from [inputmin, inputmax] onto [outputmin, outputmax].

    The mapping is the straight line through (inputmin, outputmin) and
    (inputmax, outputmax); values outside the input range are extrapolated.

    When the input range is empty (``inputmax == inputmin``) the slope is
    undefined. The original code raised ZeroDivisionError in that case; now
    ``float(outputmax)`` is returned, which is the value the formula is
    anchored to at ``inputmax``.

    Returns:
        The scaled value as a float.
    """
    span = inputmax - inputmin
    if span == 0:
        return float(outputmax)
    slope = (outputmax - outputmin) / float(span)
    intercept = outputmax - slope * inputmax
    return slope * item + intercept
def wordscounter(text, n):
    """Rank the *n* most frequent comma-separated words and size them.

    *text* is split on ","; the words in the stop list below are discarded
    and the remaining words are ranked by frequency. Each ranked word gets a
    'size' obtained by scaling its count linearly onto 10..110, where the
    upper anchor is the count of the SECOND most frequent word. The most
    frequent word is then pinned to size 120.

    Progress lines ("count max", "count min", "linear ...") are printed to
    stdout as before.

    Args:
        text: comma-separated words.
        n: maximum number of ranked words to return.

    Returns:
        A list of ``{'text': word, 'size': size}`` dicts, most frequent
        first; empty when no word survives the stop list.
    """
    counts = Counter(text.split(","))

    stop_words = ["秒拍", "视频", "网页", "分享","全文","链接","00","1","2","3","4","5","6","7","8","9","10","11","12","13","14","15","21","22","23","24","26","27","28","100","25","20","30","40","50","60","70","80"]
    for word in stop_words:
        # pop with a default replaces the old try/del/except-pass.
        counts.pop(word, None)

    rank = counts.most_common()[:n]
    if not rank:
        # Nothing left to rank; the original raised IndexError here.
        return []

    # The top word is treated as an outlier (it is pinned to 120 below), so
    # the scale is anchored on the runner-up. With a single ranked word there
    # is no runner-up and its own count is used instead.
    countmax = rank[1][1] if len(rank) > 1 else rank[0][1]
    countmin = rank[-1][1]
    # Single-argument print(...) gives identical output on Python 2 and 3.
    print("count max %s" % countmax)
    print("count min %s" % countmin)

    diclist = []
    for word, count in rank:
        if countmax == countmin:
            # Degenerate range: avoid dividing by zero in the scaling.
            size = 110.0
        else:
            size = linear_scale(countmin, countmax, 10, 110, count)
        print("linear %s  is  %s" % (count, size))
        diclist.append({'text': word, 'size': size})

    diclist[0]['size'] = 120
    return diclist
def dot_coor(keywordlist):
    """Look up the stored coordinate pair for each keyword.

    The table below maps a Chinese country/region name to two coordinate
    strings (presumably x/y positions on a map image -- confirm with the
    consumer). For every keyword, in order, each table entry whose name
    equals the keyword contributes one ``[first, second]`` pair; keywords
    with no entry contribute nothing. Coordinates are returned as the
    strings stored in the table, not converted to numbers.
    """
    country_points = [["南苏丹", "1547.18017578125", "785.9016723632812"], ["秘鲁", "628.4868774414062", "970.151611328125"],
        ["布基纳法索", "1270.7194213867188", "756.82666015625"], ["利比亚", "1433.2128295898438", "617.1016540527344"],
        ["白俄罗斯", "1505.9412231445312", "348.0906066894531"], ["巴基斯坦", "1866.8802490234375", "576.3596038818359"],
        ["玻利维亚", "731.4933471679688", "1040.8766174316406"], ["科特迪瓦", "1235.5302124023438", "803.7808837890625"],
        ["阿尔及利亚", "1300.9364624023438", "599.9543609619141"], ["瑞士", "1358.2608032226562", "413.7236022949219"],
        ["喀麦隆", "1392.3614501953125", "805.1266479492188"], ["马其顿", "1465.7857666015625", "464.7266540527344"],
        ["博茨瓦纳", "1497.7301635742188", "1100.9960327148438"], ["乌克兰", "1533.9552001953125", "399.18902587890625"],
        ["肯尼亚", "1617.3302001953125", "878.8703918457031"], ["约旦", "1599.5941772460938", "567.6516418457031"],
        ["马里", "1250.480224609375", "703.9266357421875"], ["刚果布", "1474.8217163085938", "918.9766235351562"],
        ["索马里", "1688.9496459960938", "827.5516662597656"], ["阿富汗", "1847.9052124023438", "541.2960815429688"],
        ["加纳", "1275.3489379882812", "799.95166015625"], ["奥地利", "1397.5364990234375", "405.50164794921875"],
        ["乌干达", "1567.30517578125", "865.0833435058594"], ["哥伦比亚", "642.3857116699219", "838.04541015625"],
        ["苏丹", "1547.18017578125", "725.9016723632812"], ["伊拉克", "1654.7052001953125", "548.4210968017578"],
        ["尼日尔", "1354.8746337890625", "703.5101928710938"], ["拉脱维亚", "1477.6051635742188", "318.1016540527344"],
        ["罗马尼亚", "1489.2547607421875", "422.4322052001953"], ["赞比亚", "1527.4864501953125", "1009.882568359375"],
        ["埃塞俄比亚", "1638.6051635742188", "787.8766479492188"], ["危地马拉", "499.53016662597656", "721.7516479492188"],
        ["苏里南", "792.5490417480469", "839.5322265625"], ["捷克", "1413.8975219726562", "385.37664794921875"],
        ["乍得", "1447.7052001953125", "724.64404296875"], ["阿尔巴尼亚", "1453.6119384765625", "469.9845886230469"],
        ["叙利亚", "1611.0051879882812", "532.3348999023438"], ["吉尔吉斯", "1893.4739379882812", "469.32666015625"],
        ["哥斯达黎加", "547.6864013671875", "783.1156311035156"], ["巴拉圭", "784.1551513671875", "1111.7989501953125"],
        ["波兰", "1440.8941650390625", "365.10789489746094"], ["纳米比亚", "1445.4052124023438", "1107.0016479492188"],
        ["南非", "1496.5802001953125", "1162.0579223632812"], ["埃及", "1544.1878662109375", "613.6516418457031"],
        ["格鲁吉亚", "1638.605224609375", "458.00592041015625"], ["波斯尼亚", "1432.18017578125", "442.62109375"],
        ["萨尔瓦多", "509.4618682861328", "741.62109375"], ["圭亚那", "766.3301696777344", "830.4266662597656"],
        ["比利时", "1329.8302001953125", "378.3937072753906"], ["莱索托", "1525.9052124023438", "1173.6563720703125"],
        ["保加利亚", "1494.4239501953125", "454.1384735107422"], ["布隆迪", "1546.6051635742188", "912.63427734375"],
        ["吉布提", "1656.5615234375", "761.3627624511719"], ["乌拉圭", "821.9441528320312", "1202.4515991210938"],
        ["刚果金", "1414.93017578125", "885.2002868652344"], ["卢旺达", "1546.268310546875", "898.1609497070312"],
        ["亚美尼亚", "1655.2801513671875", "480.0759582519531"], ["塞内加尔", "1159.0552368164062", "734.4845886230469"],
        ["多哥", "1291.3052368164062", "793.0516357421875"], ["匈牙利", "1445.0857543945312", "410.6592712402344"],
        ["马拉维", "1583.261474609375", "1010.9766235351562"], ["塔吉克斯坦", "1871.7183227539062", "492.0933532714844"],
        ["冰岛", "1178.7619018554688", "244.93289184570312"], ["尼加拉瓜", "540.6126098632812", "750.9328918457031"],
        ["摩洛哥", "1227.1823120117188", "562.7322082519531"], ["利比里亚", "1201.2857666015625", "814.9016418457031"],
        ["中非", "1467.8302001953125", "812.6016540527344"], ["斯洛伐克", "1446.1369018554688", "396.30165100097656"],
        ["立陶宛", "1473.5802001953125", "334.2190399169922"], ["津巴布韦", "1537.5450439453125", "1068.1397705078125"],
        ["以色列", "1580.6739501953125", "566.5016479492188"], ["老挝", "2185.09326171875", "697.6016540527344"],
        ["朝鲜", "2317.68017578125", "477.95164489746094"], ["土库曼斯坦", "1773.730224609375", "491.3203887939453"],
        ["贝宁", "1304.5302124023438", "786.4884643554688"], ["斯洛文尼亚", "1410.9052124023438", "420.4122772216797"],
        ["摩尔多瓦", "1514.4052124023438", "412.702392578125"], ["尼泊尔", "2001.2691650390625", "595.9703979492188"],
        ["斯威士兰", "1555.2301635742188", "1142.8897705078125"], ["蒙古", "2098.3435668945312", "414.12664794921875"],
        ["不丹", "2056.3107299804688", "605.6016540527344"], ["法国", "1307.8234252929688", "415.20445251464844"],
        ["印尼", "2318.830078125", "903.45166015625"], ["也门", "1707.0301513671875", "723.4766540527344"],
        ["马达加斯加", "1690.3551635742188", "1065.6016540527344"], ["台湾", "2313.6551513671875", "644.1266479492188"],
        ["墨西哥", "411.55516052246094", "644.1266479492188"], ["阿联酋", "1747.8551635742188", "636.6516723632812"],
        ["伯利兹", "516.7801666259766", "707.3766479492188"], ["巴西", "807.4107360839844", "1020.2284851074219"],
        ["塞拉利昂", "1180.9051513671875", "794.7766418457031"], ["意大利", "1393.0801391601562", "462.4266357421875"],
        ["孟加拉", "2060.0802001953125", "642.9766540527344"], ["多米尼加", "678.93017578125", "691.2766418457031"],
        ["几内亚比绍", "1152.1551513671875", "761.4266357421875"], ["瑞典", "1421.8302001953125", "270.95164489746094"],
        ["土耳其", "1578.8051147460938", "491.1766357421875"], ["莫桑比克", "1593.1801147460938", "1064.45166015625"],
        ["日本", "2399.3302001953125", "532.5766448974609"], ["新西兰", "2652.9051513671875", "1301.9266967773438"],
        ["古巴", "601.30517578125", "664.82666015625"], ["委内瑞拉", "701.3551635742188", "814.32666015625"],
        ["葡萄牙", "1226.81982421875", "485.0898132324219"], ["毛里塔尼亚", "1191.8301391601562", "670.0016479492188"],
        ["安哥拉", "1440.8051147460938", "990.8516540527344"], ["德国", "1375.255126953125", "372.7266540527344"],
        ["泰国", "2165.8802490234375", "748.7766418457031"], ["澳大利亚", "2418.880126953125", "1146.6766662597656"],
        ["新几内亚", "2585.6302490234375", "943.1266479492188"], ["克罗地亚", "1422.9801635742188", "436.55165100097656"],
        ["丹麦", "1383.3051147460938", "325.00164794921875"], ["伊朗", "1736.93017578125", "556.1516418457031"],
        ["缅甸", "2118.1551513671875", "687.82666015625"], ["芬兰", "1478.755126953125", "246.22664642333984"],
        ["所罗门群岛", "2698.3302001953125", "970.7266235351562"], ["阿曼", "1768.55517578125", "664.8266296386719"],
        ["巴拿马", "582.3301391601562", "795.3516540527344"], ["阿根廷", "761.7301635742188", "1258.8016967773438"],
        ["英国", "1274.05517578125", "332.4766387939453"], ["几内亚", "1184.3551635742188", "779.8266296386719"],
        ["爱尔兰", "1234.9551391601562", "350.87664794921875"], ["尼日利亚", "1360.30517578125", "789.0266723632812"],
        ["突尼斯", "1368.3551025390625", "542.3516540527344"], ["坦桑尼亚", "1589.1551513671875", "941.9766540527344"],
        ["沙特", "1672.5301513671875", "637.8016357421875"], ["越南", "2201.5302734375", "720.0266418457031"],
        ["俄罗斯", "2004.30517578125", "291.65164947509766"], ["海地", "652.4801635742188", "690.1266479492188"],
        ["印度", "1992.2301635742188", "668.2766418457031"], ["加拿大", "653.6301574707031", "283.6016616821289"],
        ["赤道几内亚", "1370.0801391601562", "855.7266235351562"],
        ["阿塞拜疆", "1675.9801635742188", "479.67665100097656"], ["马来西亚", "2245.80517578125", "835.6016540527344"],
        ["菲律宾", "2351.605224609375", "751.6516418457031"], ["塞尔维亚", "1458.05517578125", "441.7266540527344"],
        ["黑山共和国", "1444.8302001953125", "454.37664794921875"],
        ["爱沙尼亚", "1478.1801147460938", "302.57664489746094"],
        ["西班牙", "1267.8739013671875", "481.9766540527344"], ["加蓬", "1386.18017578125", "886.7766723632812"],
        ["柬埔寨", "2199.80517578125", "755.6766357421875"], ["韩国", "2341.8302001953125", "522.8016662597656"],
        ["洪都拉斯", "534.0301666259766", "732.6766357421875"], ["智利", "700.9551696777344", "1232.3516845703125"],
        ["荷兰", "1336.1551513671875", "363.5266418457031"], ["斯里兰卡", "1992.2301635742188", "801.1016540527344"],
        ["希腊", "1486.2301635742188", "497.50164794921875"], ["厄瓜多尔", "597.8551635742188", "896.5516357421875"],
        ["挪威", "1380.2225341796875", "260.33458709716797"], ["黎巴嫩", "1586.8551635742188", "541.7766723632812"],
        ["厄立特里亚", "1631.7051391601562", "728.6516723632812"], ["美国", "524.3471527099609", "508.9126739501953"],
        ["哈萨克斯坦", "1809.3801879882812", "403.20164489746094"],
        ["乌兹别克斯坦", "1807.6551513671875", "467.6016540527344"], ["斐济", "2831.730224609375", "1051.8016357421875"],
        ["科威特", "1689.7801513671875", "587.20166015625"], ["东帝汶", "2384.3802490234375", "967.8516540527344"],
        ["巴哈马", "636.9551391601562", "640.6766662597656"], ["瓦努阿图", "2742.605224609375", "1047.7766418457031"],
        ["冈比亚", "1151.0051879882812", "744.7516479492188"], ["卡塔尔", "1723.130126953125", "626.3016357421875"],
        ["牙买加", "615.6801452636719", "697.6016540527344"], ["塞浦路斯", "1565.5801391601562", "530.8516540527344"],
        ["巴勒斯坦", "1585.130126953125", "561.3266296386719"], ["文莱", "2288.355224609375", "840.20166015625"],
        ["特立尼达和多巴哥", "747.93017578125", "773.5016479492188"], ["佛得角", "1076.255126953125", "719.4516296386719"],
        ["萨摩亚", "2921.43017578125", "1013.8516540527344"], ["卢森堡", "1341.9051513671875", "387.1016540527344"],
        ["科摩罗", "1667.3551635742188", "997.7516479492188"], ["毛里求斯", "1778.9051513671875", "1078.2516479492188"],
        ["圣多美和普林西比", "1344.2051391601562", "870.1016540527344"],
        ["多米尼克", "757.1301879882812", "724.0516662597656"], ["汤加", "2878.880126953125", "1088.6016845703125"],
        ["基里巴斯", "1359.1551361083984", "859.7516479492188"],
        ["密克罗尼西亚", "2672.4552001953125", "805.7016296386719"], ["巴林", "1715.0802001953125", "615.9516296386719"],
        ["安道尔", "1303.9551391601562", "456.1016540527344"], ["帕劳", "2459.130126953125", "799.9516296386719"],
        ["塞舌尔", "1772.5802001953125", "919.5516662597656"], ["安提瓜和巴布达", "755.4051513671875", "709.1016540527344"],
        ["巴巴多斯", "770.93017578125", "748.2016296386719"],
        ["圣文森特和格林纳丁斯", "754.2551574707031", "751.6516418457031"],
        ["圣卢西亚", "748.5051574707031", "737.8516540527344"], ["格林纳达", "743.3301391601562", "759.7016296386719"],
        ["马耳他", "1409.7551879882812", "522.8016662597656"], ["马尔代夫", "1874.3551635742188", "844.8016662597656"],
        ["圣基茨和尼维斯", "732.4051818847656", "713.70166015625"], ["马绍尔群岛", "2710.4051513671875", "825.2516479492188"],
        ["列支敦士登", "1367.2051391601562", "411.25164794921875"], ["圣马力诺", "1391.93017578125", "443.45164489746094"],
        ["图瓦卢", "2845.5302734375", "959.8016662597656"], ["瑙鲁", "2735.130126953125", "857.4516296386719"],
        ["摩纳哥", "1348.8051147460938", "443.45164489746094"], ["梵蒂冈", "1391.93017578125", "463.00164794921875"],
        ["新加坡", "2197.505126953125", "865.5016479492188"]]
    # Linear scan per keyword keeps the original comparison and result order.
    return [
        [first, second]
        for keyword in keywordlist
        for name, first, second in country_points
        if keyword == name
    ]