word2vec_负采样_优化.py
import pandas as pd
import jieba
import numpy as np
import re
from tqdm import tqdm
import pickle


def get_data(file="new_data.txt"):
    with open(file, encoding="utf-8") as f:
        datas = f.read().split("\n")

    word_2_index = {}
    index_2_word = []
    s_table = []
    for sentence in datas:
        words = sentence.split(" ")
        for w in words:
            if w not in word_2_index:
                word_2_index[w] = len(word_2_index)
                index_2_word.append(w)
            s_table.append(word_2_index[w])
    return datas, word_2_index, index_2_word, s_table
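

# Note on the structures built by get_data: word_2_index maps each token to a unique
# id, index_2_word is the reverse lookup, and s_table holds one id per token
# *occurrence*, so drawing a uniform random position from s_table samples words in
# proportion to their corpus frequency -- a frequency-proportional negative-sampling table.
# For example, if new_data.txt contained the two lines "我 爱 你" and "我 想 你":
#   word_2_index -> {"我": 0, "爱": 1, "你": 2, "想": 3}
#   s_table      -> [0, 1, 2, 0, 3, 2]   # "我" and "你" are twice as likely to be drawn as "爱" or "想"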


def softmax(x):
    # Not used by the negative-sampling training loop below.
    max_x = np.max(x, axis=1, keepdims=True)  # subtract the row max for numerical stability
    ex = np.exp(x - max_x)
    result = ex / np.sum(ex, axis=1, keepdims=True)
    return np.clip(result, 1e-20, 1)


def sigmoid(x):
    x = np.clip(x, -50, 50)  # avoid overflow in np.exp
    return 1 / (1 + np.exp(-x))
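

# For reference, each (center, other, label) pair produced below is trained with the
# binary cross-entropy objective used in negative sampling:
#   z    = w1[center] @ w2[:, other]
#   loss = -(label * log(sigmoid(z)) + (1 - label) * log(1 - sigmoid(z)))
# whose gradient with respect to z is sigmoid(z) - label, which is why the training
# loop below uses G2 = pro - label.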


def make_samples(sentence, index):
    global word_2_index, corpus_len, neg_rate, s_table
    now_word_index = word_2_index[sentence[index]]
    # context words within an n_gram window on each side of the center word
    other_words = sentence[max(0, index - n_gram): index] + sentence[index + 1: index + n_gram + 1]
    other_words_index = [word_2_index[i] for i in other_words]

    # draw neg_rate negatives per positive from the frequency table s_table,
    # dropping any draw that is actually the center word or one of its context words
    t = np.random.randint(0, len(s_table), size=neg_rate * len(other_words_index))
    neg_words_index = [s_table[i] for i in t if s_table[i] not in other_words_index + [now_word_index]]

    samples = []
    for i in other_words_index:
        samples.append((now_word_index, i, 1))  # positive pair
    for i in neg_words_index:
        samples.append((now_word_index, i, 0))  # negative pair
    return pro_samples(samples)
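

# Example of the resulting sample layout (hypothetical ids): with neg_rate = 5, a
# center word c with context words a and b yields the positives (c, a, 1), (c, b, 1)
# plus up to 10 negatives (c, r, 0) with r drawn from s_table; pro_samples then
# repacks these as (c, [a, b, r1, ...], labels of shape (1, n)).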


def pro_samples(samples):
    # unpack [(center, other, label), ...] into the shapes used by the training loop
    other_word_index = []
    label = []
    for sample in samples:
        other_word_index.append(sample[1])
        label.append(sample[2])
    return samples[0][0], other_word_index, np.array(label).reshape(1, -1)


if __name__ == "__main__":
    all_datas, word_2_index, index_2_word, s_table = get_data()
    corpus_len = len(word_2_index)

    embedding_num = 50  # embedding dimension
    epoch = 4
    lr = 0.01
    n_gram = 3          # context window size on each side
    neg_rate = 5        # negatives drawn per positive pair

    w1 = np.random.normal(0, 1, size=(corpus_len, embedding_num))
    w2 = np.random.normal(0, 1, size=w1.T.shape)
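    # w1[i] is the input (center-word) embedding of word i, shape (corpus_len, embedding_num);
    # w2[:, j] is the output (context-word) vector of word j, shape (embedding_num, corpus_len).
    # The score for a (center, other) pair is w1[center] @ w2[:, other].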

    # skip-gram with negative sampling
    for e in range(epoch):
        for sentence in tqdm(all_datas):
            sentence = sentence.split(" ")
            if len(sentence) < 2:
                continue  # a one-word (or empty) line has no context pairs to train on
            for now_idx_sent, now_word in enumerate(sentence):
                now_word_index, other_word_index, label = make_samples(sentence, now_idx_sent)

                # forward pass: score every (center, other) pair of this center word at once
                hidden = 1 * w1[None, now_word_index]    # (1, embedding_num)
                pre = hidden @ w2[:, other_word_index]   # (1, n_samples)
                pro = sigmoid(pre)
                # loss = -np.sum(label * np.log(pro) + (1 - label) * np.log(1 - pro))

                # backward pass: d(loss)/d(pre) = pro - label
                G2 = pro - label
                delta_w2 = hidden.T @ G2
                G1 = G2 @ w2[:, other_word_index].T
                delta_w1 = G1

                w1[None, now_word_index] -= lr * delta_w1
                w2[:, other_word_index] -= lr * delta_w2 / len(label)
with open("vec.pkl2","wb") as f:
pickle.dump([w1,w2,word_2_index,index_2_word],f)
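
    # A rough usage sketch: after training, the rows of w1 serve as the word vectors.
    # Uncommenting the lines below looks up nearest neighbours by cosine similarity;
    # the query index_2_word[1] is only an illustration.
    # def most_similar(word, topn=5):
    #     v = w1[word_2_index[word]]
    #     sims = (w1 @ v) / (np.linalg.norm(w1, axis=1) * np.linalg.norm(v) + 1e-8)
    #     best = np.argsort(-sims)[1:topn + 1]  # skip the word itself
    #     return [(index_2_word[i], float(sims[i])) for i in best]
    # print(most_similar(index_2_word[1]))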