-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmain.py
95 lines (68 loc) · 3.7 KB
/
main.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
"""
"""
import os
import platform
from create_vocab_from_tweets import read_data, save_vocab_tweets, make_vocab
from svm_preprocess import get_vocab, read_train_data, read_test_data, save_data
from prepare_data import split_data_ryan, split_data_cl
# Stage 0: run the word2vec tweet filter as a child process.
#
# The two notes below appear to describe the trailing arguments of the longer,
# disabled word2vec_tweet_filter.py command kept further down in this file
# (TODO confirm); the call made here passes no arguments at all.
#   - second-to-last argument: classification filter = 0, hash tag filter = 1
#   - last argument: sentiment analyzer, Sentistrength = 0,
#     word2vec_sentifier = 1
#
# Disabled alternative filters, kept for reference:
#print os.system("python affected_unaffected_filter.py")
#print os.system("python hash_tag.py")
filter_status = os.system("python word2vec_tweet_filter.py")
print(filter_status)  # echo the exit status reported by os.system
print("done")
# Pick the SVM executables that match the host OS: Windows uses the .exe
# builds under .\lib, every other system uses the binaries under ./lib.
# The OS name lives in its own variable so that the imported ``platform``
# module is not overwritten by a string.
system_name = platform.system()
if system_name == 'Windows':
    # Raw strings: "\l" and "\s" are not valid escape sequences, so the
    # backslashes must be spelled literally.
    svm_learn = r'.\lib\svm_learn.exe'
    svm_classify = r'.\lib\svm_classify.exe'
else:
    svm_learn = './lib/svm_learn'
    svm_classify = './lib/svm_classify'
# PARAMETERS ##############################################
# Data set to run on: "ryan" (./data/Ryan) or "cl" (./data/CrisisLex).
DATASET = "ryan"

# merge crisislex datasets into one file ###############################
# os.system("python process_crisislex.py")
# print "Created training data from CrisisLex26"

# prepare training and testing data ###############################
# Disabled word2vec invocations, kept for reference:
#print os.system("python word2vec.py ./data/CrisisLex/CrisisLex27K.csv 1000")
#print os.system("python word2vec_fast.py ./data/CrisisLex/CrisisLex27K.csv 1000")
#print os.system("python word2vec_fast.py ./data/Ryan/10KLabeledTweets_confidence.csv 335")
#print os.system("python word2vec_tweet_filter.py ./data/Ryan/10KLabeledTweets_confidence.csv 295 ./data/earthquake_sentiment/2014-08-24_06.txt ./data/earthquake_sentiment/logistic_pred_output.txt ./data/earthquake_sentiment/output.txt 0")
TRAIN, TEST = "./data/training_tweets.txt", "./data/testing_tweets.txt"

# Each data set has its own source file, its own split routine and its own
# mapping from label text to the +1 / -1 classes.
if DATASET == 'cl':
    INPUT = './data/CrisisLex/CrisisLex27K.csv'
    split_data_cl(INPUT, TRAIN, TEST)
    labels_map = {'Not related': -1, 'Related and informative': 1, 'Related - but not informative': 1}
elif DATASET == 'ryan':
    INPUT = './data/Ryan/10KLabeledTweets.txt'
    split_data_ryan(INPUT, TRAIN, TEST)
    labels_map = {'Relevant': 1, 'Not Relevant': -1}
else:
    # Fail here with a clear message: without this branch an unknown name
    # left labels_map undefined and the script only died later on a NameError.
    raise ValueError("Unknown DATASET %r; expected 'cl' or 'ryan'" % (DATASET,))
print("Prepared training and testing data")
# create vocabulary ###############################
# Paths handed to save_vocab_tweets and make_vocab below.
output_filename = "./data/refined_training_tweets.txt"
vocab_filename = "./data/Tweets.vocab"

# read_data returns a pair: the first item goes to make_vocab, the second to
# save_vocab_tweets.  tweet_index=4 presumably selects the tweet-text column
# of the training file -- TODO confirm against create_vocab_from_tweets.
(refined_tweets, save_tweets) = read_data(TRAIN, tweet_index=4)
save_vocab_tweets(save_tweets, output_filename)
make_vocab(refined_tweets, vocab_filename)

print("Created refined training data and vocabulary file")
# prepare training and testing inputs ###############################
# Index arguments passed to the readers: text = 4 and label = 3 for both sets.
train_text_index = 4
train_label_index = 3
test_text_index = 4
test_label_index = 3

# Train/test files handed to save_data, plus the model path defined alongside.
train_file = "./data/Tweet_Train.txt"
test_file = "./data/Tweet_Test.txt"
tweet_model = "./output/tweet.model"

tweet_vocab = get_vocab(vocab_filename)
train_data = read_train_data(
    TRAIN, tweet_vocab, labels_map,
    train_text_index, train_label_index, delimiter='\t'
)
test_data = read_test_data(
    TEST, tweet_vocab, labels_map,
    test_text_index, test_label_index, delimiter='\t'
)
save_data(train_data, test_data, train_file, test_file)

print("Preprocess train and test data and output two files in SVM format: " + train_file + "\t" + test_file)
## train and classify tweets ###############################
def _run_step(command, step):
    """Run *command* through the shell and stop the script if it fails.

    The exit status returned by ``os.system`` is echoed, as the script did
    before.  A non-zero status raises ``SystemExit`` naming *step*, so a
    failed run is not reported as a success and the following steps do not
    run against files that were never produced.
    """
    status = os.system(command)
    print(status)
    if status != 0:
        raise SystemExit("%s failed with status %s: %s" % (step, status, command))


# Train: svm_learn <training file> <model file>
_run_step(svm_learn + " " + train_file + " " + tweet_model, "svm_learn")
# Report the path that was actually passed to svm_learn.
print("Created model " + tweet_model)

# Classify: svm_classify <test file> <model file> <output file>
svm_output = "./output/svm_output.txt"
_run_step(svm_classify + " " + test_file + " " + tweet_model + " " + svm_output, "svm_classify")
print("Output classifier result " + svm_output)

# post-processing steps (disabled) ###############################
# print os.system("python svm_postprocess.py")
# print os.system("python categorization.py")
#
# print os.system("python sentiment_analysis.py")
print("Finished")