prolife-prochoice.py
# -*- coding: utf-8 -*-
#------------------------------------------------------------------------------+
#
#
# 2018-June.
# Contributors:
# 1. Nam Le -- University College Dublin
# 2.
# 3.
#
#------------------------------------------------------------------------------+
#------ Processing the labels of the raw abortion data --------+
import os
from sentence_permutation import vary_by_noun
import numpy as np
np.random.seed(10000)
abortion_dir = 'data_files/abortion'
train_dir = os.path.join(abortion_dir, 'train2')
labels = []
texts = []
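# Assumed directory layout (not shown in this file):
#   data_files/abortion/train2/pro/*.txt      -> label 1
#   data_files/abortion/train2/against/*.txt  -> label 0
# with one sentence per line in each file.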
for label_type in ['pro', 'against']:
    dir_name = os.path.join(train_dir, label_type)
    for fname in os.listdir(dir_name):
        if fname.endswith('.txt'):
            with open(os.path.join(dir_name, fname), encoding="utf-8") as f:
                for line in f:
                    texts.append(line)
                    if label_type == 'against':
                        labels.append(0)
                    else:
                        labels.append(1)
print("length of the texts ", len(texts))
print("length of the labels ", len(labels))
# Augment the corpus with two noun-varied copies of every sentence.
# range(len(texts)) is evaluated once, so appending inside the loop
# does not re-vary the newly added sentences.
for i in range(len(texts)):
    varied_sentence1 = vary_by_noun(texts[i])
    varied_sentence2 = vary_by_noun(texts[i])
    texts.append(varied_sentence1)
    labels.append(labels[i])
    texts.append(varied_sentence2)
    labels.append(labels[i])
print("length of the texts ", len(texts))
print("length of the labels ", len(labels))
#------------ TOKENIZING THE DATA ------------+
"""Tokenizing the text of the raw IMDB data
"""
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, SimpleRNN, Dense
# cut off texts after 500 tokens
maxlen = 500
# train on the full (augmented) corpus
training_samples = len(texts)
#validation_samples = 10183
# cap for the vocabulary size; unused while num_words is commented out below
max_words = 20000
tokenizer = Tokenizer()  # Tokenizer(num_words=max_words) to keep only the top words
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)
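# Toy sanity check (independent of the corpus above): Tokenizer indexes words
# by descending frequency starting at 1, and pad_sequences left-pads with 0.
# The exact indices below assume a particular tie-break order, hence the "e.g.".
demo = Tokenizer()
demo.fit_on_texts(['the cat sat', 'the dog sat down'])
print(demo.texts_to_sequences(['the cat sat down']))  # e.g. [[1, 3, 2, 5]]
print(pad_sequences([[1, 3, 2, 5]], maxlen=6))        # [[0 0 1 3 2 5]]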
# Split the data into a training set and a validation set.
# But first, shuffle the data, since we started from data where
# samples are ordered (all 'pro' first, then all 'against').
indices = np.arange(data.shape[0])
#print(indices)
np.random.shuffle(indices)
#print(indices)
data = data[indices]
labels = labels[indices]
# training_samples == len(texts), so this keeps the whole shuffled corpus;
# the validation set is carved out later via validation_split in model.fit.
x_train = data[:training_samples]
y_train = labels[:training_samples]
#x_val = data[training_samples: training_samples + validation_samples]
#y_val = labels[training_samples: training_samples + validation_samples]
from keras.layers import LSTM
model = Sequential()
# input_dim must cover every token index produced by the tokenizer
# (indices run from 1 to len(word_index)), not the number of texts.
model.add(Embedding(len(word_index) + 1, 32))
model.add(LSTM(32))
#model.add(LSTM(32))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['acc'])
history = model.fit(x_train, y_train,
                    epochs=10,
                    batch_size=128,
                    validation_split=0.2)
import matplotlib.pyplot as plt
acc = history.history['acc']
val_acc = history.history['val_acc']
loss = history.history['loss']
val_loss = history.history['val_loss']
epochs = range(len(acc))
plt.plot(epochs, acc, 'bo', label='Training acc')
plt.plot(epochs, val_acc, 'b', label='Validation acc')
plt.title('Training and validation accuracy')
plt.legend()
plt.figure()
plt.plot(epochs, loss, 'bo', label='Training loss')
plt.plot(epochs, val_loss, 'b', label='Validation loss')
plt.title('Training and validation loss')
plt.legend()
plt.show()
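# A minimal inference sketch using the objects defined above. The 'pro'/'against'
# strings mirror the label encoding from the loading loop (1 = pro, 0 = against);
# the 0.5 threshold is an assumption, not part of the original script.
def predict_stance(sentence, threshold=0.5):
    """Score one sentence with the trained model via the fitted tokenizer."""
    seq = tokenizer.texts_to_sequences([sentence])
    padded = pad_sequences(seq, maxlen=maxlen)
    score = float(model.predict(padded)[0][0])  # sigmoid output in [0, 1]
    return ('pro' if score >= threshold else 'against'), score

print(predict_stance("Every person should decide this question for themselves."))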