"""
split data into training data and testing data in two separated files
"""
import numpy as np
import re
"""
Split Ryan's data into two parts: TRAIN and TEST with k_fold cross validation
"""
def split_data_ryan(INPUT,TRAIN,TEST,k_fold = 5):
    OUTPUT = './data/Ryan/10KLabeledTweets_formatted.txt'
    # Reformat the raw data so that every tweet occupies exactly one line.
    with open(OUTPUT, 'w') as output:
        output.write('Tweet ID\tInformation Source\tInformation Type\tDisaster-related\tTweet Text')
        tweets_count = 0
        with open(INPUT) as f:
            for line in f:
                # A properly formatted tweet line has exactly five tab-separated fields.
                if len(line.split('\t')) == 5:
                    output.write('\n')
                    output.write(line.strip())
                    tweets_count += 1
                else:
                    # Continuation of the previous tweet: fold it onto the same line.
                    output.write(' ' + line.strip().replace('\t', ' '))
    # Hold out 1/k_fold of the tweets for testing (integer division for a line count).
    test_set_count = tweets_count // k_fold
    with open(OUTPUT) as f:
        data = f.readlines()
    # data[0] is the header line; skip it in both splits.
    with open(TEST, 'w') as test_output, open(TRAIN, 'w') as train_output:
        for line in data[1:test_set_count + 1]:
            test_output.write(line)
        for line in data[test_set_count + 1:]:
            train_output.write(line)
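
# A minimal usage sketch for the function above; the paths here are
# hypothetical and not part of the original script:
# split_data_ryan('./data/Ryan/10KLabeledTweets.txt',
#                 './data/Ryan/training_tweets.txt',
#                 './data/Ryan/testing_tweets.txt',
#                 k_fold=5)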
"""
Split CrisisLex's data into two parts: TRAIN and TEST with k_fold cross validation
"""
def split_data_cl(INPUT,TRAIN,TEST,k_fold = 5):
# properly format the data
test_output = open(TEST, 'w')
train_output = open(TRAIN, 'w')
with open(INPUT) as f:
data = f.readlines()
test_set_count = len(data) / k_fold
for line in data[1:test_set_count]:
test_output.write(line)
for line in data[test_set_count:]:
train_output.write(line)
test_output.close()
train_output.close()
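
# A minimal usage sketch for split_data_cl; the input path is hypothetical,
# while the output paths match the commented invocations at the bottom of
# this file:
# split_data_cl('./data/CrisisLex/labeled_tweets.txt',
#               './data/training_tweets.txt',
#               './data/testing_tweets.txt',
#               k_fold=5)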
"""
split both training and testing data into various files, one for each label
"""
def split_data_cl_label(FILE, prefix, labels):
data = {}
data_size = {}
with open(FILE) as f:
for line in f.readlines():
label = line.split('\t')[3]
tweet = line.split('\t')[4]
if label in labels.keys():
if label not in data:
data[label] = tweet
data_size[label] = 1
else: # append
data[label] = data[label] + tweet
data_size[label] = data_size[label] + 1
else:
print "label does not exist", label
print data_size
for file, content in data.items():
f = open('./data/CrisisLex/' + prefix + '_' + labels[file] + '.txt', 'w')
f.write(content)
f.close()
labels = {'Not related': 'not_related',
          'Related and informative': 'informative',
          'Related - but not informative': 'not_informative',
          'Not applicable': 'not_applicable'}
# split_data_cl_label('./data/training_tweets.txt', 'train', labels)
# split_data_cl_label('./data/testing_tweets.txt', 'test', labels)
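
# A minimal end-to-end sketch combining the two CrisisLex steps above,
# assuming a hypothetical input path with the same tab-separated layout:
# if __name__ == '__main__':
#     split_data_cl('./data/CrisisLex/labeled_tweets.txt',
#                   './data/training_tweets.txt',
#                   './data/testing_tweets.txt',
#                   k_fold=5)
#     split_data_cl_label('./data/training_tweets.txt', 'train', labels)
#     split_data_cl_label('./data/testing_tweets.txt', 'test', labels)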