#!/usr/bin/python3.4
import numpy as np
from keras import regularizers
from keras.layers import Input, Dense
from keras.models import Model
import csv
import pickle
import os.path
# each row is an entity vector
# TODO: don't hardcode these values
# 13870 entity vectors of 1000 dimensions each
fname_entity_row_dict='entity_row_dict.pkl'
fname_entity_vecs='entity_vecs.npy'
# GET TABLE OF word2vec VECTORS (maps entity name -> row of the word2vec matrix)
if not os.path.isfile(fname_entity_row_dict) or not os.path.isfile(fname_entity_vecs):
    entities = np.zeros((13870, 1000))
    #entities = np.array([])
    entity_row_dict = {}
    entity_cnt = 0
    with open('data/pruned_vec.csv', newline='') as csvfile:
        #entities_reader = csv.reader(csvfile, delimiter=', ') # use this if space separated only
        # normalize ', ' separators to single spaces before parsing
        entities_reader = csv.reader((line.replace(', ', ' ') for line in csvfile), delimiter=' ')
        for line in entities_reader:
            # extract the entity vector from the line (first field is the entity name)
            entity = np.asarray(list(map(float, line[1:])))
            # store the entity vector as a row of the entities matrix
            entities[entity_cnt] = entity
            # remember which row belongs to this entity
            entity_row_dict[line[0]] = entity_cnt
            entity_cnt = entity_cnt + 1
    np.save(fname_entity_vecs, entities)
    with open(fname_entity_row_dict, 'wb') as f:
        pickle.dump(entity_row_dict, f)
    #print(entity_cnt)
else:
    entities = np.load(fname_entity_vecs)
    with open(fname_entity_row_dict, 'rb') as f:
        entity_row_dict = pickle.load(f)
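# Illustrative sanity check (the entity name is picked arbitrarily, nothing is
# assumed beyond the structures built above): look up one entity's row and
# confirm its word2vec vector has 1000 dimensions.
example_entity = next(iter(entity_row_dict))
print(example_entity, entities[entity_row_dict[example_entity]].shape)  # expected: (1000,)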
# GET TABLE OF ONE-HOT VECTORS FOR EACH RELATIONSHIP
# TODO: don't hardcode these values
# 1311 relation vectors of 1311 dimensions each (one-hot vectors) -- each row is a relationship vector
fname_relationship_row_dict='relationship_row_dict.pkl'
fname_relationship_vecs='relationship_vecs.npy'
if not os.path.isfile(fname_relationship_row_dict) or not os.path.isfile(fname_relationship_vecs):
    relationships = np.zeros((1311, 1311))
    relationship_row_dict = {}
    relationship_cnt = 0
    with open('data/pruned_relations.txt', newline='') as rfile:
        for line in rfile:
            # initialize relationship vector to all zeros
            relationship = np.zeros((1311))
            # set the one-hot element at the current line number
            relationship[relationship_cnt] = 1
            # store the relationship vector as a row of the relationships matrix
            relationships[relationship_cnt] = relationship
            # remember which row belongs to this relationship (strip the trailing newline)
            relationship_row_dict[line[:-1]] = relationship_cnt
            relationship_cnt = relationship_cnt + 1
    np.save(fname_relationship_vecs, relationships)
    with open(fname_relationship_row_dict, 'wb') as f:
        pickle.dump(relationship_row_dict, f)
    #print(relationship_cnt)
else:
    relationships = np.load(fname_relationship_vecs)
    with open(fname_relationship_row_dict, 'rb') as f:
        relationship_row_dict = pickle.load(f)
# GET HEAD, RELATIONSHIP TRAINING VECTORS
# TODO: don't hardcode these values
# 388770 training triples each with a head (1000-dimensional word2vec vector) and relationship (1311-dimensional one-hot vector)
# each row represents one head, relation vector
fname_head_relation_training_vecs='head_relation_training_vecs.npy'
if not os.path.isfile(fname_head_relation_training_vecs):
    head_relation_vecs = np.zeros((388770, 2311))
    head_relation_vec_cnt = 0
    with open('data/train_pruned.txt', newline='') as tfile:
        training_reader = csv.reader(tfile, delimiter='\t')
        for line in training_reader:
            # look up the head entity's word2vec vector
            head_entity = entities[entity_row_dict[line[0]]]
            # look up the relationship's one-hot vector
            relationship = relationships[relationship_row_dict[line[1]]]
            # concatenate head and relationship into one 2311-dimensional training vector
            head_relation_vecs[head_relation_vec_cnt] = np.concatenate([head_entity, relationship])
            head_relation_vec_cnt = head_relation_vec_cnt + 1
    np.save(fname_head_relation_training_vecs, head_relation_vecs)
    print(head_relation_vec_cnt)
else:
    head_relation_vecs = np.load(fname_head_relation_training_vecs)
    print(head_relation_vecs.shape)
encoding_dim = 200
# 1311 unique relations in training set
# relations represented as 1311-dimensional one-hot vectors
# entities represented as 1000-dimensional word2vec vectors
# input layer therefore has 2311 dimensions
# AUTOENCODER
input_layer = Input(shape=(2311,))
# Dense bottleneck layer; an L1 activity regularizer (see the regularizers
# import above) could be added here, but none is applied in this script
encoded = Dense(encoding_dim, activation='sigmoid')(input_layer)
# decode back to the original 2311 dimensions
decoded = Dense(2311, activation='sigmoid')(encoded)
# this model maps an input to its reconstruction
autoencoder = Model(input_layer, decoded)
# ENCODER
# this model maps an input to its encoded representation
encoder = Model(input_layer, encoded)
# DECODER
# create a placeholder for an encoded (200-dimensional) input
encoded_input = Input(shape=(encoding_dim,))
# retrieve the last layer of the autoencoder model
decoder_layer = autoencoder.layers[-1]
# create the decoder model
decoder = Model(encoded_input, decoder_layer(encoded_input))
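# Sketch of a quick architecture check (summary() is the standard Keras call;
# this line is purely illustrative): confirm the 2311 -> 200 -> 2311 layout
# described in the comments above.
autoencoder.summary()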
# TRAIN
autoencoder.compile(optimizer='adadelta', loss='binary_crossentropy')
# train on the 2311-dimensional head-relation vectors built above
x_train = head_relation_vecs.astype('float32')
print(x_train.shape)