-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathprocess_crisislex.py
76 lines (66 loc) · 3.11 KB
/
process_crisislex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
"""
Create datasets from CrisisLex datasets
"""
import collections
import csv
import glob
import os
import re

import numpy as np
# Paths to the CrisisLex T26 / T6 corpora on disk, and the (commented-out)
# aggregated tab-separated output file. Relative to the working directory.
CrisisLexT26Folder = "./data/CrisisLex/data/CrisisLexT26"
CrisisLexT6Folder = "./data/CrisisLex/data/CrisisLexT6"
CL_data = "./data/CrisisLex/CrisisLex27K.csv"
def clean_line(row):
    """Normalize one raw tweet string.

    Removes retweet/modified-tweet markers (``RT @user`` / ``MT @user``),
    @-mentions, hyperlinks, and every character that is not alphanumeric,
    space, or tab; collapses whitespace runs to single spaces and lowercases.

    Parameters
    ----------
    row : str
        Raw tweet text.

    Returns
    -------
    str
        Cleaned, lowercased tweet text.
    """
    row = re.sub(r"RT @\S+", "", row)
    row = re.sub(r"MT @\S+", "", row)
    # Strip @-mentions, URLs (e.g. http://...), and non-alphanumeric chars,
    # then collapse all remaining whitespace to single spaces.
    # Raw string avoids invalid-escape warnings for \w and the needless \/.
    row = ' '.join(re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+://\S+)", " ", row).split())
    row = row.lower()
    # BUG FIX: str.replace returns a new string; the original discarded the
    # result, making the call a no-op. Assign it back.
    row = row.replace('\t', ' ')
    return row
# data = np.loadtxt(CrisisLexT6Folder + "/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv", dtype='str', delimiter=',', skiprows = 1)
if False:
    # One-off preprocessing (disabled): split the CrisisLexT6 Sandy Hurricane
    # CSV into on-topic.txt / off-topic.txt, one tweet per line.
    # FIX: use context managers so the output handles are closed even if an
    # exception is raised mid-loop (the original leaked them on error).
    with open(CrisisLexT6Folder + "/2012_Sandy_Hurricane/2012_Sandy_Hurricane-ontopic_offtopic.csv") as f, \
         open(CrisisLexT6Folder + "/2012_Sandy_Hurricane/on-topic.txt", "w") as on_topic, \
         open(CrisisLexT6Folder + "/2012_Sandy_Hurricane/off-topic.txt", "w") as off_topic:
        # Skip the header row. Naive comma split: rows whose tweet text
        # contains commas fall through to the else branch and are dropped.
        for line in f.readlines()[1:]:
            vals = line.strip().split(',')
            if len(vals) == 3:
                if vals[2] == 'on-topic':
                    on_topic.write(vals[1].strip('\"') + "\n")
                elif vals[2] == 'off-topic':
                    off_topic.write(vals[1].strip('\"') + "\n")
            else:
                # Row had embedded commas; kept only for debugging.
                length = len(vals)
                # print length, line
if False:
    # One-off preprocessing (disabled): aggregate every CrisisLexT26
    # *labeled.csv into tab-separated records:
    #   tweet_id \t label1 \t label2 \t label3 \t tweet_text
    all_labeled_data = []
    # 'path' instead of 'file' — the original shadowed the builtin.
    for path in glob.glob(CrisisLexT26Folder + "/*/*labeled.csv"):
        with open(path) as f:
            for line in f.readlines()[1:]:  # skip header row
                vals = line.strip().split(',')
                if len(vals) == 5:
                    # Clean split: id, tweet text, then three label columns.
                    all_labeled_data.append('\t'.join([vals[0].strip('\"'), vals[2].strip(), vals[3], vals[4], re.sub(r"\s+", " ", vals[1].strip('\"'))]))
                elif len(vals) > 5:
                    # Tweet text contained commas: the last three fields are
                    # labels; everything between the id and them is rejoined
                    # as the tweet text.
                    length = len(vals)
                    all_labeled_data.append('\t'.join([vals[0].strip('\"'), vals[length-3], vals[length-2], vals[length-1], re.sub(r"\s+", " ", ','.join(vals[1:length-3]))]))
    # FIX: print() function — the original Python 2 print statement is a
    # syntax error under Python 3 (output is identical).
    print('Number of training tweets:', len(all_labeled_data))
# For each CrisisLexT26 event, write <event>.txt containing the tweets
# labeled both "Related and informative" and "NGOs".
# 'path' instead of 'file' — the original shadowed the builtin.
for path in glob.glob(CrisisLexT26Folder + "/*/*labeled.csv"):
    # FIX: the event name is the parent directory of the labeled CSV;
    # os.path works on both Windows and POSIX (the original split on '\\'
    # and silently produced the wrong name on POSIX systems).
    output_name = os.path.basename(os.path.dirname(path))
    # FIX: print() function — the Python 2 print statement is a syntax
    # error under Python 3.
    print(output_name)
    # FIX: context-manage the output handle too, so it is closed even if
    # an exception is raised mid-loop.
    with open(path) as f, open(CrisisLexT26Folder + '/' + output_name + '.txt', "w") as output:
        for line in f.readlines()[1:]:  # skip header row
            vals = line.strip().split(',')
            if len(vals) == 5:
                # Clean split: labels sit in fixed columns 2 and 4.
                if vals[4].strip() == "Related and informative" and vals[2].strip() == "NGOs":
                    output.write(vals[1].strip('\"') + "\n")
            elif len(vals) > 5:
                # Tweet text contained commas: the last three fields are
                # labels; rejoin the middle fields as the tweet text.
                length = len(vals)
                if vals[length - 1].strip() == "Related and informative" and vals[length - 3].strip() == "NGOs":
                    tweet = re.sub(r"\s+", " ", ','.join(vals[1:length - 3])).strip('\"')
                    output.write(tweet + "\n")
# for line in all_labeled_data:
# print line
# np.savetxt(CL_data, all_labeled_data, fmt='%s', delimiter='\t')