# my_word2vec.py (forked from shouxieai/word_2_vec)
import numpy as np
import pandas as pd
import pickle
import jieba
import os
from tqdm import tqdm
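
# A from-scratch skip-gram word2vec: each center word predicts its surrounding
# context words through a two-layer linear network trained with full-softmax
# cross-entropy and plain SGD.
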
def load_stop_words(file="stopwords.txt"):
    # One stop word per line; returned as a list of strings.
    with open(file, "r", encoding="utf-8") as f:
        return f.read().split("\n")

def cut_words(file="数学原始数据.csv"):
    # Segment each line of the GBK-encoded corpus with jieba and drop stop
    # words; a set makes the membership test O(1).
    stop_words = set(load_stop_words())
    result = []
    all_data = pd.read_csv(file, encoding="gbk", names=["data"])["data"]
    for words in all_data:
        c_words = jieba.lcut(words)
        result.append([word for word in c_words if word not in stop_words])
    return result

def get_dict(data):
    # Build the vocabulary in first-occurrence order plus a one-hot row per
    # word; the `seen` set avoids an O(n) list scan per token.
    index_2_word = []
    seen = set()
    for words in data:
        for word in words:
            if word not in seen:
                seen.add(word)
                index_2_word.append(word)
    word_2_index = {word: index for index, word in enumerate(index_2_word)}
    word_size = len(word_2_index)
    word_2_onehot = {}
    for word, index in word_2_index.items():
        one_hot = np.zeros((1, word_size))
        one_hot[0, index] = 1
        word_2_onehot[word] = one_hot
    return word_2_index, index_2_word, word_2_onehot

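# Example: with a 4-word vocabulary, word_2_onehot maps the word at index 2
# to the (1, 4) row vector [[0., 0., 1., 0.]].
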
def softmax(x):
    # Row-wise softmax; subtracting the per-row max before exp avoids overflow.
    ex = np.exp(x - np.max(x, axis=1, keepdims=True))
    return ex / np.sum(ex, axis=1, keepdims=True)

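# Sanity check (values rounded): softmax(np.array([[1., 2., 3.]]))
# -> [[0.090, 0.245, 0.665]]; each row sums to 1.
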

if __name__ == "__main__":
    data = cut_words()
    word_2_index, index_2_word, word_2_onehot = get_dict(data)
    word_size = len(word_2_index)

    embedding_num = 107  # embedding dimension
    lr = 0.01            # SGD learning rate
    epoch = 10
    n_gram = 3           # context window: up to n_gram words on each side

    # Zero-mean, small-scale init keeps the initial softmax near uniform.
    w1 = np.random.normal(0, 0.1, size=(word_size, embedding_num))
    w2 = np.random.normal(0, 0.1, size=(embedding_num, word_size))

    for e in range(epoch):
        for words in tqdm(data):
            for n_index, now_word in enumerate(words):
                # Skip-gram: the current (center) word predicts each context word.
                now_word_onehot = word_2_onehot[now_word]
                other_words = words[max(n_index - n_gram, 0):n_index] + words[n_index + 1:n_index + 1 + n_gram]
                for other_word in other_words:
                    other_word_onehot = word_2_onehot[other_word]

                    # Forward pass: the one-hot row selects a row of w1 (the
                    # word's embedding); w2 maps it back to vocabulary logits.
                    hidden = now_word_onehot @ w1
                    p = hidden @ w2
                    pre = softmax(p)

                    # loss = -np.sum(other_word_onehot * np.log(pre))
                    # Backprop rule used below: if C = A @ B and dL/dC = G,
                    # then dL/dA = G @ B.T and dL/dB = A.T @ G.
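                    # For softmax followed by cross-entropy, dL/dp collapses to
                    # (predicted distribution - target one-hot), so no explicit
                    # log or division appears in the backward pass.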
                    G2 = pre - other_word_onehot       # dL/dp
                    delta_w2 = hidden.T @ G2           # dL/dw2
                    G1 = G2 @ w2.T                     # dL/dhidden
                    delta_w1 = now_word_onehot.T @ G1  # dL/dw1
                    w1 -= lr * delta_w1
                    w2 -= lr * delta_w2

with open("word2vec.pkl","wb") as f:
pickle.dump([w1,word_2_index,index_2_word],f) # word2vec 负采样
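

# Minimal lookup sketch: rank vocabulary words by cosine similarity between
# rows of the saved embedding matrix. Assumes `query` appears in word_2_index.
def nearest_words(query, w1, word_2_index, index_2_word, topk=5):
    v = w1[word_2_index[query]]
    # Cosine similarity of every embedding row against the query row.
    scores = (w1 @ v) / (np.linalg.norm(w1, axis=1) * np.linalg.norm(v) + 1e-8)
    order = np.argsort(-scores)
    return [index_2_word[i] for i in order if index_2_word[i] != query][:topk]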