# comparable_model1.py
import theano
import theano.tensor as T
import lasagne
import lasagne.layers
from lasagne.layers import InputLayer, get_output
import numpy as np
import re
import random
import sys

from helpers import SimpleMaxingLayer, SimpleAverageLayer
from wordvecs import WordVectors, EmbeddingLayer, WordTokenizer
from wikireader import WikiRegexes
# import json

theano.config.floatX = 'float32'
# theano.config.linker = 'cvm_nogc'
theano.config.openmp = True
theano.config.openmp_elemwise_minsize = 20000

# this module expects to be imported by a driver script that defines these in __main__
from __main__ import baseModel, featureNames as featuresNames, disable_convs
class EntityVectorLinkExp(baseModel):

    batch_size = 250  # 20000
    num_training_items = 500000  # 200000
    dim_compared_vec = 150  # 100

    def __init__(self):
        self.sentence_length = self.wordvecs.sentence_length
        self.sentence_length_short = 10
        self.document_length = 100
        self.num_words_to_use_conv = 5
        self.enable_boosting = False
        self.num_negative_target_samples = 0  # 1
        # self.enable_match_surface = False
        # self.enable_link_counts = True
        self.enable_train_wordvecs = False
        self.enable_cap_boosting = True
        self.num_indicator_features = len(featuresNames)
        self.main_nl = lasagne.nonlinearities.softmax  # leaky_rectify
        self.impossible_query = featuresNames.index('Impossible')
        self._setup()

    def _setup(self):
        self.all_params = []
        self.all_conv_results = []
        self.all_conv_pool_results = []
        self.all_conv_names = []

        self.x_document_input = T.imatrix('x_doc')  # words from the source document
        self.x_document_id = T.ivector('x_doc_id')  # index of which source document this is from
        self.x_surface_text_input = T.imatrix('x_surface_link')  # text of the surface link
        self.x_surface_context_input = T.imatrix('x_surface_cxt')  # words surrounding the surface link
        self.x_target_input = T.ivector('x_target')  # id of the target vector
        self.x_target_words = T.imatrix('x_target_words')  # words from the target title link
        self.x_matches_surface = T.ivector('x_match_surface')  # indicator of whether the target title matches the surface
        self.x_matches_counts = T.imatrix('x_matches_counts')  # info about the link counts
        self.x_target_document_words = T.imatrix('x_target_document_words')  # words from the body of the target document
        self.x_link_id = T.ivector('x_link_id')  # index of which link to compare to in the matrix
        self.x_denotaiton_features = T.matrix('x_denotation_ind_feats', dtype='int8')  # the joint denotation query features
        self.x_query_featurs = T.matrix('x_query_ind_feats', dtype='int8')  # the query features
        self.x_query_link_id = T.ivector('x_match_query')  # the query that a denotation links to
        self.x_denotation_ranges = T.imatrix('x_denotation_ranges')  # the range of joint denotations to sum over
        self.x_target_link_id = T.ivector('x_match_target')  # the target document that matches with a given denotation
        self.y_isgold = T.vector('y_gold', dtype='int8')  # 1 for the gold item, 0 otherwise
        self.y_grouping = T.imatrix('y_grouping')  # matrix containing [start_idx, end_idx, gold_idx]

        self.embedding_W = theano.shared(self.wordvecs.get_numpy_matrix().astype(theano.config.floatX), name='embedding_W')
        self.embedding_W_docs = theano.shared(self.documentvecs.get_numpy_matrix().astype(theano.config.floatX), name='embedding_W_docs')

        def augRectify(x):
            # if x is zero, then the gradient fails due to the computation x / |x|
            return T.maximum(x, -.01 * x)
        simpleConvNonLin = augRectify
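        # augRectify is just a leaky rectifier (slope 0.01 on the negative side):
        # e.g. augRectify(2.0) = 2.0 and augRectify(-2.0) = 0.02, so it keeps a
        # small nonzero gradient everywhere instead of the flat region of a plain ReLU.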
        # self.document_l = lasagne.layers.InputLayer(
        #     (None, self.document_length),
        #     input_var=self.x_document_input
        # )
        # self.document_embedding_l = EmbeddingLayer(
        #     self.document_l,
        #     W=self.embedding_W,
        #     add_word_params=self.enable_train_wordvecs,
        # )
        # self.document_simple_conv1_l = lasagne.layers.Conv2DLayer(
        #     self.document_embedding_l,
        #     num_filters=self.dim_compared_vec,
        #     filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        #     name='document_simple_conv',
        #     nonlinearity=simpleConvNonLin,
        # )
        # self.all_conv_names.append('document_conv')
        # self.all_conv_results.append(lasagne.layers.get_output(self.document_simple_conv1_l))
        # self.document_simple_sum_l = lasagne.layers.Pool2DLayer(
        #     self.document_simple_conv1_l,
        #     name='document_simple_pool',
        #     pool_size=(self.document_length - self.num_words_to_use_conv, 1),
        #     mode='sum',
        # )
        # self.all_conv_pool_results.append(lasagne.layers.get_output(self.document_simple_sum_l))
        # self.document_output = lasagne.layers.get_output(
        #     lasagne.layers.reshape(self.document_simple_sum_l, ([0], -1)))
        # self.all_params += lasagne.layers.get_all_params(self.document_simple_sum_l)

        # ##########################################
        # ## surface text
        # self.surface_context_l = lasagne.layers.InputLayer(
        #     (None, self.sentence_length),
        #     input_var=self.x_surface_context_input,
        # )
        # self.surface_context_embedding_l = EmbeddingLayer(
        #     self.surface_context_l,
        #     W=self.embedding_W,
        #     add_word_params=self.enable_train_wordvecs,
        # )
        # self.surface_context_conv1_l = lasagne.layers.Conv2DLayer(
        #     self.surface_context_embedding_l,
        #     num_filters=self.dim_compared_vec,
        #     filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        #     name='surface_cxt_conv1',
        #     nonlinearity=simpleConvNonLin,
        # )
        # self.all_conv_names.append('surface_context_conv')
        # self.all_conv_results.append(lasagne.layers.get_output(self.surface_context_conv1_l))
        # self.surface_context_pool1_l = lasagne.layers.Pool2DLayer(
        #     self.surface_context_conv1_l,
        #     name='surface_cxt_pool1',
        #     pool_size=(self.sentence_length - self.num_words_to_use_conv, 1),
        #     mode='sum',  # WAS 'MAX' FOR SOME REASON
        # )
        # self.all_conv_pool_results.append(lasagne.layers.get_output(self.surface_context_pool1_l))
        # self.surface_output = lasagne.layers.get_output(
        #     lasagne.layers.reshape(self.surface_context_pool1_l, ([0], -1))
        # )
        # self.all_params += lasagne.layers.get_all_params(self.surface_context_pool1_l)
        self.surface_input_l = lasagne.layers.InputLayer(
            (None, self.sentence_length_short),
            input_var=self.x_surface_text_input
        )
        self.surface_embedding_l = EmbeddingLayer(
            self.surface_input_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )
        # self.surface_conv1_l = lasagne.layers.Conv2DLayer(
        #     self.surface_embedding_l,
        #     num_filters=self.dim_compared_vec,
        #     filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        #     name='surface_conv1',
        #     nonlinearity=simpleConvNonLin,
        # )
        # self.all_conv_names.append('surface_conv')
        # self.all_conv_results.append(lasagne.layers.get_output(self.surface_conv1_l))
        self.surface_pool1_l = lasagne.layers.Pool2DLayer(
            lasagne.layers.reshape(self.surface_embedding_l, ([0], [3], [2], [1])),
            # self.surface_conv1_l,
            name='surface_pool1',
            pool_size=(self.sentence_length_short, 1),
            mode='average_exc_pad',  # 'sum'
        )
        self.all_conv_pool_results.append(lasagne.layers.get_output(self.surface_pool1_l))
        self.surface_words_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.surface_pool1_l, ([0], -1))
        )
        self.all_params += lasagne.layers.get_all_params(self.surface_pool1_l)
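        # With the convolution above commented out, the surface branch reduces to
        # a bag-of-words average: the embeddings of the (at most
        # sentence_length_short) surface words are average-pooled into a single
        # fixed-size vector per link.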
        ###################################################
        ## dealing with the target side
        # matched_surface_reshaped = self.x_matches_surface.reshape(
        #     (self.x_matches_surface.shape[0], 1, 1, 1)).astype(theano.config.floatX)
        self.target_input_l = lasagne.layers.InputLayer(
            (None,),
            input_var=self.x_target_input
        )

        # words from the title of the target
        # self.target_words_input_l = lasagne.layers.InputLayer(
        #     (None, self.sentence_length_short),
        #     input_var=self.x_target_words,
        # )
        # self.target_words_embedding_l = EmbeddingLayer(
        #     self.target_words_input_l,
        #     W=self.embedding_W,
        #     add_word_params=self.enable_train_wordvecs,
        # )
        # self.target_words_conv1_l = lasagne.layers.Conv2DLayer(
        #     self.target_words_embedding_l,
        #     name='target_wrds_conv1',
        #     filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        #     num_filters=self.dim_compared_vec,
        #     nonlinearity=simpleConvNonLin,
        # )
        # self.all_conv_names.append('target_title_conv')
        # self.all_conv_results.append(lasagne.layers.get_output(self.target_words_conv1_l))
        # self.target_words_pool1_l = lasagne.layers.Pool2DLayer(
        #     self.target_words_conv1_l,
        #     name='target_wrds_pool1',
        #     pool_size=(self.sentence_length_short - self.num_words_to_use_conv, 1),
        #     mode='sum',
        # )
        # self.all_conv_pool_results.append(lasagne.layers.get_output(self.target_words_pool1_l))
        # self.target_title_out = lasagne.layers.get_output(
        #     lasagne.layers.reshape(self.target_words_pool1_l, ([0], -1))
        # )
        # self.all_params += lasagne.layers.get_all_params(self.target_words_pool1_l)

        # words from the body of the target
        self.target_body_words_input_l = lasagne.layers.InputLayer(
            (None, self.sentence_length),
            input_var=self.x_target_document_words,
        )
        self.target_body_words_embedding_l = EmbeddingLayer(
            self.target_body_words_input_l,
            W=self.embedding_W,
            add_word_params=self.enable_train_wordvecs,
        )
        # self.target_body_simple_conv1_l = lasagne.layers.Conv2DLayer(
        #     self.target_body_words_embedding_l,
        #     name='target_body_simple_conv',
        #     filter_size=(self.num_words_to_use_conv, self.wordvecs.vector_size),
        #     num_filters=self.dim_compared_vec,
        #     nonlinearity=simpleConvNonLin,
        # )
        # self.all_conv_names.append('target_body_conv')
        # self.all_conv_results.append(lasagne.layers.get_output(self.target_body_simple_conv1_l))
        self.target_body_simple_sum_l = lasagne.layers.Pool2DLayer(
            lasagne.layers.reshape(self.target_body_words_embedding_l, ([0], [3], [2], [1])),
            # self.target_body_simple_conv1_l,
            name='target_body_simple_sum',
            pool_size=(self.sentence_length, 1),
            mode='average_exc_pad',  # 'sum'
        )
        self.all_conv_pool_results.append(lasagne.layers.get_output(self.target_body_simple_sum_l))
        self.target_out = lasagne.layers.get_output(
            lasagne.layers.reshape(self.target_body_simple_sum_l, ([0], -1)))
        self.all_params += lasagne.layers.get_all_params(self.target_body_simple_sum_l)

        #########################################################
        ## compute the cosine distance between the two layers
        # there are going to be multiple entity links per document, so we have the
        # `_id` ivectors that represent how we need to reshuffle the inputs;
        # this saves on computation
        # source body
        # self.source_aligned_l = self.document_output[self.x_document_id, :][self.x_link_id, :]
        # source context
        # self.source_context_aligned_l = self.surface_output[self.x_link_id, :]
        # source surface words
        self.source_surface_words_aligned_l = self.surface_words_output[self.x_link_id, :]

        def augNorm(v):
            return T.basic.pow(T.basic.pow(T.basic.abs_(v), 2).sum(axis=1) + .001, .5)

        def cosinsim(a, b):
            dotted = T.batched_dot(a, b)
            return dotted / (augNorm(a) * augNorm(b))

        def comparedVLayers(a, b):
            dv = cosinsim(a, b)
            return lasagne.layers.InputLayer(
                (None, 1),
                input_var=dv.reshape((dv.shape[0], 1))
            )
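        # cosinsim is the usual cosine similarity a.b / (|a| |b|); augNorm adds a
        # small constant (.001) under the square root so the norm (and its
        # gradient) stays finite for all-zero vectors. comparedVLayers wraps the
        # resulting similarity column as an InputLayer so that it can be fed
        # through ordinary lasagne layers below.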
        self.cosine_conv_layers = []
        for i, l in enumerate([
            # comparedVLayers(self.target_out, self.source_aligned_l),
            # comparedVLayers(self.target_out, self.source_context_aligned_l),
            comparedVLayers(self.target_out, self.source_surface_words_aligned_l),
            # comparedVLayers(self.target_title_out, self.source_aligned_l),
            # comparedVLayers(self.target_title_out, self.source_context_aligned_l),
            # comparedVLayers(self.target_title_out, self.source_surface_words_aligned_l),
        ]):
            if i not in disable_convs:
                self.cosine_conv_layers.append(l)
        self.cosine_combined = lasagne.layers.concat(
            self.cosine_conv_layers,
            axis=1
        )
        self.cosine_weighted = lasagne.layers.DenseLayer(
            self.cosine_combined,
            name='cosine_dens1',
            num_units=1,
            b=None,
            nonlinearity=lasagne.nonlinearities.linear,
        )
        self.cosine_weighted.W.get_value(borrow=True)[:] += 1
        self.cosine_output = lasagne.layers.get_output(
            lasagne.layers.reshape(self.cosine_weighted, (-1,)))
        self.all_params += lasagne.layers.get_all_params(self.cosine_weighted)

        # ######################################################
        # ## indicator feature input
        # self.query_feat_l = lasagne.layers.InputLayer(
        #     (None, self.num_indicator_features),
        #     input_var=self.x_query_featurs,
        # )
        # # rank_feats = [f[0] for f in enumerate(featuresNames) if f[1].startswith('Rank=')]
        # self.denotation_join_feat_l = lasagne.layers.InputLayer(
        #     (None, self.num_indicator_features),
        #     input_var=self.x_denotaiton_features,  # [:, rank_feats],
        # )
        # ## the query and denotation features are now combined when input into the same denotation vector
        # # self.query_layer_l = lasagne.layers.DenseLayer(
        # #     self.query_feat_l,
        # #     name='query_lin',
        # #     num_units=1,
        # #     nonlinearity=lasagne.nonlinearities.linear,
        # # )
        # # self.query_output = lasagne.layers.get_output(
        # #     lasagne.layers.reshape(self.query_layer_l, (-1,))
        # # )
        # # self.all_params += lasagne.layers.get_all_params(self.query_layer_l)
        # # self.aligned_queries = self.query_output[self.x_query_link_id]
        self.aligned_cosine = self.cosine_output[self.x_target_link_id]
        # self.denotation_layer_l = lasagne.layers.DenseLayer(
        #     self.denotation_join_feat_l,
        #     name='denotation_lin',
        #     num_units=1,
        #     nonlinearity=lasagne.nonlinearities.linear,
        #     # W=self.query_layer_l.W,
        # )
        # self.denotation_output = lasagne.layers.get_output(
        #     lasagne.layers.reshape(self.denotation_layer_l, (-1,)))
        # self.all_params += lasagne.layers.get_all_params(self.denotation_layer_l)

        ###########################
        ## multiply the two parts of the join scores
        self.unmerged_scores = (
            # ((self.aligned_queries) +
            #  (self.denotation_output))
            # +
            self.aligned_cosine
        )

        #############################################
        ## normalizing the scores and recombining
        ## the output if there were multiple entries
        ## for the same target document
        #############################################

        def sloppyMathLogSum(vals):
            m = vals.max()
            return T.log(T.exp(vals - m).sum()) + m
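        # sloppyMathLogSum is the standard numerically stable log-sum-exp:
        # shifting by the max keeps T.exp from overflowing, e.g. for
        # [1000., 1000.] it returns 1000 + log(2) ~= 1000.693 where the naive
        # log(sum(exp(vals))) would overflow to inf.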
        def mergingSum(indx, unmerged):
            return sloppyMathLogSum(unmerged[T.arange(indx[0], indx[1])])
        self.merged_scores, _ = theano.scan(
            mergingSum,
            sequences=[self.x_denotation_ranges],
            non_sequences=[self.unmerged_scores]
        )

        ########################################
        ## true output values
        ########################################
        self.unscaled_output = self.merged_scores

        def scaleRes(indx, outputs, res):
            ran = T.arange(indx[0], indx[1])
            s = sloppyMathLogSum(res[ran])
            return T.set_subtensor(outputs[ran], res[ran] - s)
        self.scaled_scores, _ = theano.scan(
            scaleRes,
            sequences=[self.y_grouping],
            non_sequences=[self.unscaled_output],
            outputs_info=T.zeros((self.unscaled_output.shape[0],))
        )
        self.true_output = self.scaled_scores[-1]
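        # each scan step log-normalizes one [start, end) group in place, so the
        # last accumulator state (scaled_scores[-1]) holds per-group log-softmax
        # scores: within a group the exponentiated entries sum to 1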
        ############################
        ## compute the loss
        ############################

        def lossSum(indx, res):
            return sloppyMathLogSum(res[T.arange(indx[0], indx[1])])
        self.groupped_res, _ = theano.scan(
            lossSum,
            sequences=[self.y_grouping],
            non_sequences=[self.true_output],
        )

        def selectGolds(indx, res, golds):
            r = T.arange(indx[0], indx[1])
            # works around a theano gradient issue:
            # the gold values simply come from the input, so there should be
            # no good reason to have to disconnect the gradient here
            gs = theano.gradient.disconnected_grad(golds[r])
            vals = gs * res[r] + (1 - gs) * -1000000  # -1000000 acts as log(0) for non-gold entries
            return sloppyMathLogSum(vals)
        self.gold_res, _ = theano.scan(
            selectGolds,
            sequences=[self.y_grouping],
            non_sequences=[self.true_output, self.y_isgold],
        )
        self.loss_vec = self.groupped_res - self.gold_res
        self.loss_scalar = self.loss_vec.sum()
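        # per group the loss is logsumexp(all scores) - logsumexp(gold scores),
        # i.e. the negative log-probability that a gold entity is chosen; it is
        # zero exactly when all of the probability mass sits on gold items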
        self.updates = lasagne.updates.adadelta(
            self.loss_scalar / self.loss_vec.shape[0],
            self.all_params)

        self.func_inputs = [
            self.x_document_input,
            self.x_surface_text_input, self.x_surface_context_input, self.x_document_id,
            self.x_target_input, self.x_matches_surface, self.x_matches_counts, self.x_link_id,
            self.x_target_words, self.x_target_document_words,
            self.x_denotaiton_features, self.x_query_featurs, self.x_query_link_id, self.x_denotation_ranges,
            self.x_target_link_id,
            self.y_grouping,
            self.y_isgold,
        ]
        self.func_outputs = [
            self.true_output,
            self.loss_vec.sum(),
            self.loss_scalar,
            self.loss_vec,
            # self.res_l,
        ]
        self.train_func = theano.function(
            self.func_inputs,
            self.func_outputs,
            updates=self.updates,
            on_unused_input='ignore',
        )
        self.test_func = theano.function(
            self.func_inputs,
            self.func_outputs,
            on_unused_input='ignore',
        )
        self.find_conv_active_func = theano.function(
            self.func_inputs,
            self.all_conv_results,
            on_unused_input='ignore',
        )
    def reset_accums(self):
        self.current_documents = []
        self.current_surface_context = []
        self.current_surface_link = []
        self.current_link_id = []
        self.current_target_input = []
        self.current_target_words = []
        self.current_target_body_words = []
        self.current_target_matches_surface = []
        self.current_target_id = []
        self.current_target_is_gold = []
        # self.current_target_goal = []
        # self.current_feat_indicators = []
        self.current_learning_groups = []
        self.learning_targets = []
        self.current_surface_target_counts = []
        # self.current_boosted_groups = []
        self.current_queries = []
        self.current_denotations_feats_indicators = []
        self.current_denotations_related_query = []
        self.current_denotations_range = []
        self.current_denotation_targets_linked = []
        self.failed_match = []
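        # these accumulators are filled by compute_batch below and flushed to the
        # train/test function whenever more than batch_size targets are queued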
    def compute_batch(self, isTraining=True, useTrainingFunc=True, batch_run_func=None):
        if isTraining and useTrainingFunc:
            func = self.train_func
        else:
            func = self.test_func
        if batch_run_func is None:
            batch_run_func = self.run_batch
        self.reset_accums()
        self.total_links = 0
        self.total_loss = 0.0
        get_words = re.compile(r'[^a-zA-Z0-9 ]')
        get_link = re.compile(r'.*?\[(.*?)\].*?')
        empty_sentence = np.zeros(self.sentence_length, dtype='int32')
        for doc, queries in self.queries.iteritems():
            # skip the testing documents while training and vice versa
            if queries.values()[0]['training'] != isTraining:
                continue
            docid = len(self.current_documents)
            self.current_documents.append(self.wordvecs.tokenize(doc, length=self.document_length))
            for surtxt, targets in queries.iteritems():
                self.current_link_id.append(docid)
                surid = len(self.current_surface_link)
                self.current_surface_context.append(self.wordvecs.tokenize(get_words.sub(' ', surtxt)))
                surlink = get_link.match(surtxt).group(1)
                self.current_surface_link.append(self.wordvecs.tokenize(surlink, length=self.sentence_length_short))
                surmatch = surlink.lower()
                surcounts = self.surface_counts.get(surmatch)
                if not surcounts:
                    self.failed_match.append(surmatch)
                    surcounts = {}
                target_body_words_input = []  # words from the target document
                target_words_input = []  # the words from the target title
                target_matches_surface = []
                target_inputs = []  # the target vector
                target_learings = []
                target_match_counts = []
                target_gold_loc = -1
                target_group_start = len(self.current_target_input)
                # target_feat_indicators = []
                denotations_joint_indicators = []
                denotations_linked_query = []
                denotations_range = []
                denotation_target_linked = []
                target_isgold = []
                queries_feats_indicators = []
                for ind in targets['query_vals']:
                    query_feats = np.zeros((self.num_indicator_features,), dtype='int8')
                    # query_feats[ind] = 1
                    queries_feats_indicators.append(query_feats)
                queries_len = len(targets['query_vals'])
                for target in set(targets['vals'].keys() +
                                  random.sample(self.documentvecs.reverse_word_location,
                                                self.num_negative_target_samples)) - {None}:
                    isGold = target in targets['gold']
                    wiki_title = WikiRegexes.convertToTitle(target)
                    cnt_wrds = self.page_content.get(wiki_title)  # page_content is already tokenized
                    cnt = self.documentvecs.get_location(wiki_title)
                    if wiki_title == 'nil':
                        cnt = 0  # this is the stop symbol location
                    if cnt is None:
                        # we were not able to find this wikipedia document,
                        # and trying to train on it would cause issues
                        if cnt_wrds is None:
                            # we know nothing about this target, so just ignore it
                            continue
                        else:
                            # we must not have had enough links to this document,
                            # but we still have the target text
                            cnt = 0
                    if isGold:
                        target_gold_loc = len(target_inputs)
                        target_isgold.append(1)
                    else:
                        target_isgold.append(0)
                    target_body_words_input.append(cnt_wrds if cnt_wrds is not None else empty_sentence)
                    target_words_input.append(self.wordvecs.tokenize(get_words.sub(' ', target), length=self.sentence_length_short))
                    target_inputs.append(cnt)
                    target_matches_surface.append(int(surmatch == target.lower()))
                    target_learings.append((targets, target))
                    target_match_counts.append(surcounts.get(wiki_title, 0))
                    joint_indicators = []
                    query_idx = []
                    indicators_place = targets['vals'].get(target)
                    if indicators_place:
                        # [queries][indicator id]
                        for indx in xrange(len(indicators_place[1])):
                            local_feats = np.zeros((self.num_indicator_features,), dtype='int8')
                            local_feats[indicators_place[1][indx]] = 1
                            local_feats[targets['query_vals'][indx]] = 1  # features from the joint
                            # if isGold:  #################################### hack
                            #     local_feats[-1] = 1
                            joint_indicators.append(local_feats)
                            query_idx.append(len(self.current_queries) + indx)
                    else:
                        raise NotImplementedError()
                        # dead code behind the raise: earlier handling that marked
                        # these denotations with the 'Impossible' query feature
                        for indx in xrange(queries_len):
                            local_feats = np.zeros((self.num_indicator_features,), dtype='int8')
                            local_feats[self.impossible_query] = 1
                            joint_indicators.append(local_feats)
                            query_idx.append(len(self.current_queries) + indx)
                    start_range = len(denotations_joint_indicators) + len(self.current_denotations_feats_indicators)
                    denotations_joint_indicators += joint_indicators
                    denotations_linked_query += query_idx
                    denotations_range.append([start_range, start_range + len(joint_indicators)])
                    denotation_target_linked += [len(self.current_target_words) + len(target_words_input) - 1] * len(query_idx)
                    # indicators = np.zeros((self.num_indicator_features,), dtype='int8')
                    # if indicators_place:
                    #     indicators[indicators_place[1]] = 1
                    # target_feat_indicators.append(indicators)
                    # if wiki_title not in surcounts:
                    #     print surcounts, wiki_title
                if target_gold_loc >= 0 or not isTraining:  # skip this group while training if the gold item was never found
                    # gold_loc would hold the index of the gold item within this group
                    # gold_loc = (len(self.current_target_goal) + target_gold_loc)
                    sorted_match_counts = [-4, -3, -2, -1] + sorted(set(target_match_counts))
                    # print sorted_match_counts
                    target_match_counts_indicators = [
                        [
                            int(s == sorted_match_counts[-1]),
                            int(s == sorted_match_counts[-2]),
                            int(s == sorted_match_counts[-3]),
                            int(0 < s <= sorted_match_counts[-4]),
                            int(s == 0),
                        ]
                        for s in target_match_counts
                    ]
                    # self.current_target_goal += [gold_loc] * len(target_inputs)
                    self.current_target_input += target_inputs
                    self.current_target_id += [surid] * len(target_inputs)
                    self.current_target_words += target_words_input
                    self.current_target_matches_surface += target_matches_surface
                    self.current_surface_target_counts += target_match_counts_indicators
                    self.current_target_body_words += target_body_words_input
                    # self.current_feat_indicators += target_feat_indicators
                    self.current_target_is_gold += target_isgold
                    target_group_end = len(self.current_target_input)
                    self.current_learning_groups.append(
                        [target_group_start, target_group_end,
                         -1  # gold_loc
                         ])
                    # self.current_boosted_groups.append(targets['boosted'])
                    self.current_queries += queries_feats_indicators
                    self.current_denotations_feats_indicators += denotations_joint_indicators
                    self.current_denotations_related_query += denotations_linked_query
                    self.current_denotations_range += denotations_range
                    self.current_denotation_targets_linked += denotation_target_linked
                    # self.current_target_goal.append(isGold)
                    self.learning_targets += target_learings
                if len(self.current_target_id) > self.batch_size:
                    batch_run_func(func)
                    sys.stderr.write('%i\r' % self.total_links)
                    if self.total_links > self.num_training_items:
                        return self.total_loss / self.total_links,  # self.total_boosted_loss / self.total_links
        if len(self.current_target_id) > 0:
            batch_run_func(func)  # self.run_batch(func)
        return self.total_loss / self.total_links,  # self.total_boosted_loss / self.total_links
    def run_batch(self, func):
        res_vec, loss_sum, _, loss_vec = func(
            self.current_documents,
            self.current_surface_link, self.current_surface_context, self.current_link_id,
            self.current_target_input, self.current_target_matches_surface, self.current_surface_target_counts, self.current_target_id,
            self.current_target_words, self.current_target_body_words,  # self.current_feat_indicators,
            self.current_denotations_feats_indicators, self.current_queries, self.current_denotations_related_query, self.current_denotations_range,
            self.current_denotation_targets_linked,
            # self.current_target_goal,
            self.current_learning_groups,  # self.current_boosted_groups,
            self.current_target_is_gold,
        )
        self.check_params()
        self.total_links += len(self.current_target_id)
        self.total_loss += loss_sum
        # self.total_boosted_loss += loss_boosted
        learned_groups = []  # a list, since the (dict) groups are not hashable
        for i in xrange(len(res_vec)):
            # save the results from this pass
            l = self.learning_targets[i]
            if l[1] in l[0]['vals']:
                l[0]['vals'][l[1]][0] = float(res_vec[i]), 0  # float(nn_outs[i])
                if l[0] not in learned_groups:
                    learned_groups.append(l[0])
        self.reset_accums()
    def run_batch_max_activate(self, _):
        res = self.find_conv_active_func(
            self.current_documents,
            self.current_surface_link, self.current_surface_context, self.current_link_id,
            self.current_target_input, self.current_target_matches_surface, self.current_surface_target_counts, self.current_target_id,
            self.current_target_words, self.current_target_body_words,
            self.current_denotations_feats_indicators, self.current_queries, self.current_denotations_related_query, self.current_denotations_range,
            self.current_denotation_targets_linked,
            self.current_learning_groups,
            self.current_target_is_gold,
        )
        self.check_params()
        self.total_links += len(self.current_target_id)
        self.total_loss += 0
        # needs to match with the conv names/results
        conv_inputs = [
            # shape: (document index, number of words)
            np.array(self.current_documents),
            np.array(self.current_surface_context),
            np.array(self.current_surface_link),
            np.array(self.current_target_words),
            np.array(self.current_target_body_words),
        ]
        # res shape: (document index, num filters, output rows, output columns [word vectors, should be 1])
        for i in xrange(len(res)):
            conv_len = conv_inputs[i].shape[1] - res[i].shape[2] + 1
            for dim in xrange(self.dim_compared_vec):
                current_min = self.conv_max[i][dim][0][0]
                higher_p = res[i][:, dim, :, 0] > current_min
                if higher_p.any():
                    higher_where = np.where(higher_p)
                    higher_vals = res[i][higher_where[0], dim, higher_where[1], 0]
                    current_words = set(w[1] for w in self.conv_max[i][dim])
                    higher_words = [
                        # np won't do this selection in one shot
                        conv_inputs[i][higher_where[0][w], higher_where[1][w]:(higher_where[1][w] + conv_len)]
                        for w in xrange(len(higher_where[0]))
                    ]
                    # higher_words = conv_inputs[i][higher_where[0], higher_where[1]:(higher_where[1] + conv_len)]
                    itm_arr = self.conv_max[i][dim]
                    for x in xrange(len(higher_vals)):
                        hv = higher_vals[x]
                        if hv > itm_arr[0][0]:
                            # this is higher than the current min value
                            words = tuple(higher_words[x])
                            if words not in current_words:
                                # replace the min element and re-sort
                                itm_arr[0] = (hv, words)
                                itm_arr.sort()
                                current_words = set(w[1] for w in itm_arr)
        self.reset_accums()
    def find_max_convs(self, num_per_activation=10):
        # disabled, since the convolution layers above are commented out
        return
        assert len(self.all_conv_names) == 5
        assert len(self.all_conv_results) == 5
        # per_act = [(0, ())] * num_per_activation
        # per_conv = [per_act] * self.dim_compared_vec
        # self.conv_max = [per_conv] * len(self.all_conv_names)
        self.conv_max = [
            [
                [
                    (0, ())
                    for c in xrange(num_per_activation)
                ]
                for b in xrange(self.dim_compared_vec)
            ]
            for a in xrange(len(self.all_conv_names))
        ]
        self.compute_batch(True, batch_run_func=self.run_batch_max_activate)
        self.compute_batch(False, batch_run_func=self.run_batch_max_activate)
        # convert the second part of the arrays to strings of words
        for ci in xrange(len(self.all_conv_names)):
            for di in xrange(self.dim_compared_vec):
                for ai in xrange(num_per_activation):
                    a = self.conv_max[ci][di][ai]
                    self.conv_max[ci][di][ai] = (a[0], ' '.join([str(self.wordvecs.get_word(w)) for w in a[1]]))
        return self.conv_max
    def check_params(self):
        if any([np.isnan(v.get_value(borrow=True)).any() for v in self.all_params]):
            raise RuntimeError('nan in some of the parameters')


queries_exp = EntityVectorLinkExp()
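
# A minimal driver sketch (hypothetical): this module imports baseModel,
# featureNames, and disable_convs from __main__, so it expects to be imported
# (or exec'd) by a script that defines them first, with baseModel supplying
# wordvecs, documentvecs, queries, surface_counts, and page_content.
# compute_batch returns a 1-tuple, hence the unpacking commas:
#
#     for epoch in xrange(10):
#         train_loss, = queries_exp.compute_batch(isTraining=True)
#         test_loss, = queries_exp.compute_batch(isTraining=False)
#         print 'epoch %i: train loss %f, test loss %f' % (epoch, train_loss, test_loss)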