-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgenerate_w2v.py
34 lines (29 loc) · 1.04 KB
/
generate_w2v.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
# -*- coding: utf-8 -*-
import os
import sys
import time
import numpy as np
import gensim
def load_data(data_path):
    """Read the whitespace-tokenised middle column of a comma-separated file.

    The first line of the file is skipped (presumably a header row). Each
    remaining line is stripped and split on ``','``; lines that do not
    yield exactly three fields are ignored. The second field of every
    accepted line is split on whitespace into a list of tokens.

    Note that this is a plain ``str.split(',')``, not a CSV parser: a row
    whose text contains a comma produces more than three fields and is
    therefore dropped.

    Args:
        data_path: Path of the UTF-8 encoded input file.

    Returns:
        A list containing one token list per accepted line, in file order.
    """
    train_data = []
    with open(data_path, 'r', encoding='utf-8') as f:
        # Skip the first line; the default keeps an empty file from raising
        # StopIteration.
        next(f, None)
        # Iterate the file object directly instead of calling readlines(),
        # so the raw lines are streamed rather than all held in memory
        # next to the parsed result.
        for line in f:
            fields = line.strip().split(',')
            if len(fields) != 3:
                continue
            # str.split() with no argument already ignores leading and
            # trailing whitespace, so no separate strip() is needed.
            train_data.append(fields[1].split())
    return train_data
def train_w2v(train_data, model_path):
    """Train a Word2Vec model and export its vectors in text format.

    Trains ``gensim.models.Word2Vec`` on ``train_data`` with 200-dimensional
    vectors, a context window of 5, ``min_count=0``, 3 worker threads and
    10 training epochs. Prints the elapsed training time in minutes and the
    vocabulary size, then writes the word vectors to ``model_path`` in the
    non-binary word2vec format.

    Works with both gensim 3.x and gensim >= 4.0, which renamed several of
    the names used here.

    Args:
        train_data: Sentences handed to Word2Vec; the caller in this file
            passes a list of token lists.
        model_path: Destination file for the exported vectors. Its parent
            directory is created if it does not exist.
    """
    # Create the output directory up front so a missing folder is found
    # before the training run rather than at save time.
    out_dir = os.path.dirname(model_path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    # gensim 4.0 renamed the `size` argument to `vector_size` and `iter`
    # to `epochs`; pick the spelling the installed version accepts.
    is_gensim4 = int(gensim.__version__.split('.')[0]) >= 4
    if is_gensim4:
        version_kwargs = {'vector_size': 200, 'epochs': 10}
    else:
        version_kwargs = {'size': 200, 'iter': 10}

    start_time = time.time()
    model = gensim.models.Word2Vec(
        train_data, window=5, min_count=0, workers=3, **version_kwargs
    )
    print('train done, time used {:.4f} min.'.format((time.time() - start_time) / 60))

    # `KeyedVectors.vocab` was removed in gensim 4.0; `key_to_index` is its
    # replacement for counting vocabulary entries.
    vocab = model.wv.key_to_index if is_gensim4 else model.wv.vocab
    print(len(vocab))

    model.wv.save_word2vec_format(model_path, binary=False)
if __name__ == "__main__":
    # Input file and output vector file; both are relative paths, so they
    # resolve against the current working directory.
    csv_path = '../text_data/raw_data/train.csv'
    vectors_path = '../text_data//w2v_model/text_w2v_model.txt'

    sentences = load_data(csv_path)

    # Sanity check: number of loaded sentences and the first three of them.
    print(len(sentences))
    print(sentences[:3])

    train_w2v(sentences, vectors_path)