representations.py
import numpy as np
import string
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_repr(sentences, stopwords, ngram_range):
    """
    Args:
        sentences: List of sentences, each sentence as a string.
        stopwords: List of stopwords passed to the vectorizer.
        ngram_range: (min_n, max_n) tuple of n-gram lengths to extract.
    Returns:
        original_indices: Indices of the represented sentences (here: all of them).
        tfidf_matrix: TF*IDF representation of every sentence (np-array, #sentences x #terms).
        idf_weight_dict: Mapping from term to its IDF weight.
    """
    tfidf = TfidfVectorizer(stop_words=stopwords, ngram_range=ngram_range)
    tfidf_matrix = tfidf.fit_transform(sentences).toarray()
    # get_feature_names() was removed in scikit-learn 1.2; prefer the newer API when present
    feature_names = (tfidf.get_feature_names_out() if hasattr(tfidf, 'get_feature_names_out')
                     else tfidf.get_feature_names())
    idf_weight_dict = dict(zip(feature_names, tfidf.idf_))
    # Produce dummy original indices (in this case, all sentences are represented)
    original_indices = list(range(len(sentences)))
    return original_indices, tfidf_matrix, idf_weight_dict
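# The idf_weight_dict returned above is the weighting expected by w2v_sentence_sums_tfidf()
# further down; an illustrative chain (variable names assumed, unigrams so that terms match
# the word-vector vocabulary) would be:
#   _, _, idf_weights = tfidf_repr(sentences, stopwords, (1, 1))
#   indices, vectors = w2v_sentence_sums_tfidf(sentences, w2v_model, idf_weights)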


def bigram_repr(stemmed, stopwords, svd, bigram_list):
    """
    Args:
        stemmed: List of sentences, each a list of (word, stem) tuples.
        stopwords: Not used in this function.
        svd: Fitted dimensionality reducer (e.g. TruncatedSVD) applied to the bigram vectors.
        bigram_list: List of (bigram, weight) pairs covering all bigrams in the data set.
    Returns: Original sentence indices and the reduced bigram representation of every
             sentence that contains at least one known bigram.
    """
    multi_hot_bigram_vectors = []
    for sentence in stemmed:
        # Use the stemmed form of every (word, stem) pair
        split_sentence = [stem for word, stem in sentence]
        # Turn the sentence into a list of bigrams
        bigrams = list(nltk.bigrams(split_sentence))
        # Make a weighted multi-hot vector over all bigrams in the data set
        multi_hot = np.array([v if k in bigrams else 0 for k, v in bigram_list])
        multi_hot_bigram_vectors.append(multi_hot)
    # Filter out all-zero vectors, and save the original indices
    repr_list_with_orig_indices = [(i, s) for i, s in enumerate(multi_hot_bigram_vectors)
                                   if np.sum(s) != 0]
    original_indices, sentence_representations = zip(*repr_list_with_orig_indices)
    sentence_array = np.array(sentence_representations)
    sentence_repr = svd.transform(sentence_array)
    return original_indices, sentence_repr
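# A minimal sketch of how the fitted `svd` and `bigram_list` arguments could be produced
# upstream (the uniform weight of 1 per bigram and the component count are illustrative
# assumptions; `all_stemmed` has the same (word, stem) structure as `stemmed` above):
#
#   from sklearn.decomposition import TruncatedSVD
#   all_bigrams = sorted({bg for sent in all_stemmed
#                         for bg in nltk.bigrams([stem for _, stem in sent])})
#   bigram_list = [(bg, 1) for bg in all_bigrams]
#   counts = np.array([[1 if bg in list(nltk.bigrams([stem for _, stem in sent])) else 0
#                       for bg, _ in bigram_list] for sent in all_stemmed])
#   svd = TruncatedSVD(n_components=100).fit(counts)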


def w2v_sentence_sums(tokenized, model, postagged, tagfilter):
    # Minimum number of words in a sentence that must also be in the word model
    min_word_vectors = 1
    # Lowercase & split the tokenized sentences
    preprocessed = [sentence.lower().split(' ')
                    for sentence in tokenized]
    # Punctuation removal (the postagged/tagfilter arguments are accepted but not applied here)
    preprocessed = [[word.translate(str.maketrans('', '', string.punctuation))
                     for word in sentence]
                    for sentence in preprocessed]
    # Drop out-of-vocabulary words and look up the word vectors
    vectorized = [[model.model[word] for word in sentence
                   if word in model.model.vocab]
                  for sentence in preprocessed]
    sentence_sums_with_indices = [(index, np.sum(s, axis=0))
                                  for index, s in enumerate(vectorized)
                                  if len(s) >= min_word_vectors]
    # With this, we can obtain original sentences via sentences[original_indices[index_of_vector]]
    original_indices, sentence_sums = zip(*sentence_sums_with_indices)
    return original_indices, np.array(sentence_sums)
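# Note on the `model` argument here and in the functions below: the `.model[word]` / `.vocab`
# access pattern matches the gensim KeyedVectors API prior to gensim 4.0 (which replaced
# .vocab with .key_to_index). The wrapper class itself is assumed to be defined elsewhere
# in this project.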


def w2v_sentence_means(tokenized, model):
    # Minimum number of words in a sentence that must also be in the word model
    min_word_vectors = 1
    # Lowercase & split the tokenized sentences
    preprocessed = [sentence.lower().split(' ')
                    for sentence in tokenized]
    # Punctuation removal
    preprocessed = [[word.translate(str.maketrans('', '', string.punctuation))
                     for word in sentence]
                    for sentence in preprocessed]
    # Drop out-of-vocabulary words and look up the word vectors
    vectorized = [[model.model[word] for word in sentence
                   if word in model.model.vocab]
                  for sentence in preprocessed]
    sentence_means_with_indices = [(index, np.mean(s, axis=0))
                                   for index, s in enumerate(vectorized)
                                   if len(s) >= min_word_vectors]
    # With this, we can obtain original sentences via sentences[original_indices[index_of_vector]]
    original_indices, sentence_means = zip(*sentence_means_with_indices)
    return original_indices, np.array(sentence_means)


def w2v_sentence_sums_tfidf(tokenized, model, idf_weight_dict):
    # Minimum number of words in a sentence that must also be in the word model
    min_word_vectors = 1
    # Lowercase & split the tokenized sentences
    preprocessed = [sentence.lower().split(' ')
                    for sentence in tokenized]
    # Punctuation removal
    preprocessed = [[word.translate(str.maketrans('', '', string.punctuation))
                     for word in sentence]
                    for sentence in preprocessed]
    # Drop OOV words and words without an IDF weight; scale each word vector by its IDF weight
    vectorized = [[model.model[word] * idf_weight_dict[word] for word in sentence
                   if word in model.model.vocab
                   and word in idf_weight_dict]
                  for sentence in preprocessed]
    sentence_sums_with_indices = [(index, np.sum(s, axis=0))
                                  for index, s in enumerate(vectorized)
                                  if len(s) >= min_word_vectors]
    # With this, we can obtain original sentences via sentences[original_indices[index_of_vector]]
    original_indices, sentence_sums = zip(*sentence_sums_with_indices)
    return original_indices, np.array(sentence_sums)


def sif_embeddings(tokenized, model):
    """
    Smooth Inverse Frequency (SIF) sentence embeddings. `model` is expected to expose
    .model (word vectors), .freq_dict (word emission probabilities) and .pc (the first
    principal component of the sentence embeddings).
    """
    # SIF weighting parameter a, kept at its common default value
    a = 1e-3
    # Lowercase & split the tokenized sentences
    preprocessed = [sentence.lower().split(' ')
                    for sentence in tokenized]
    # Punctuation removal
    preprocessed = [[word.translate(str.maketrans('', '', string.punctuation))
                     for word in sentence]
                    for sentence in preprocessed]
    # Drop OOV words and weight each word vector by its smooth inverse emission probability
    vectorized = [[model.model[word] * (a / (a + model.freq_dict[word])) for word in sentence
                   if word in model.model.vocab]
                  for sentence in preprocessed]
    sentence_means_with_indices = [(index, np.mean(s, axis=0))
                                   for index, s in enumerate(vectorized)
                                   if len(s) > 0]
    # With this, we can obtain original sentences via sentences[original_indices[index_of_vector]]
    original_indices, sentence_means = zip(*sentence_means_with_indices)
    # Remove the projection onto the first principal component
    sentence_means = np.array(sentence_means)
    sentence_means = sentence_means - sentence_means.dot(model.pc.transpose()) * model.pc
    return original_indices, sentence_means
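

if __name__ == "__main__":
    # Minimal smoke test with illustrative data; TF*IDF is the only representation here
    # that does not require a pre-trained word embedding model.
    demo_sentences = [
        "the cat sat on the mat",
        "the dog barked at the cat",
        "a completely unrelated sentence about python code",
    ]
    indices, matrix, idf_weights = tfidf_repr(demo_sentences, stopwords=None, ngram_range=(1, 1))
    print("sentences represented:", len(indices))
    print("TF*IDF matrix shape:", matrix.shape)
    print("number of IDF weights:", len(idf_weights))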