-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathdata.py
39 lines (35 loc) · 1.18 KB
/
data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
from os.path import join
from codecs import open
def build_corpus(split, make_vocab=True, data_dir="./dataset/old"):
    """Load one split of a token-per-line tagging corpus.

    Reads ``<data_dir>/<split>.char.txt``. Every non-blank line must hold a
    token and its tag separated by a single space (any further
    space-separated fields are ignored). A blank line closes the current
    sentence; a final sentence that is not followed by a blank line is
    still returned.

    Args:
        split: Which file to read; one of ``'train'``, ``'dev'``, ``'test'``.
        make_vocab: When true, also build token->id and tag->id mappings
            with ``build_map``.
        data_dir: Directory that contains the ``*.char.txt`` files.

    Returns:
        ``(word_lists, tag_lists)`` when ``make_vocab`` is false, otherwise
        ``(word_lists, tag_lists, word2id, tag2id)``. ``word_lists`` and
        ``tag_lists`` are parallel lists holding one list per sentence.

    Raises:
        AssertionError: If ``split`` is not one of the accepted names.
        IndexError: If a non-blank line has no space-separated tag field.
    """
    assert split in ['train', 'dev', 'test']

    path = join(data_dir, split + ".char.txt")
    word_lists = []
    tag_lists = []
    word_list = []
    tag_list = []
    with open(path, 'r', encoding='utf-8') as f:
        for line_no, line in enumerate(f, 1):
            # NOTE(review): as written this replaces a space with a space,
            # i.e. it is a no-op. The first argument may originally have been
            # a tab or a full-width space -- confirm against the upstream file.
            line = line.replace(" ", " ")

            if line.strip() == '':
                # Blank line: sentence boundary.
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list = []
                tag_list = []
                continue

            # Strip '\r' as well as '\n': the file is not opened with newline
            # translation, so CRLF files would otherwise leave '\r' on the tag.
            fields = line.rstrip('\r\n').split(" ")
            if len(fields) < 2:
                raise IndexError(
                    "%s:%d: expected '<token> <tag>', got %r"
                    % (path, line_no, line)
                )
            word_list.append(fields[0])
            tag_list.append(fields[1])

    # Keep the last sentence when the file does not end with a blank line.
    if word_list:
        word_lists.append(word_list)
        tag_lists.append(tag_list)

    # When make_vocab is true, word2id and tag2id are returned as well.
    if make_vocab:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        return word_lists, tag_lists, word2id, tag2id
    else:
        return word_lists, tag_lists
def build_map(lists):
    """Map every distinct element of a list of lists to an integer id.

    Ids are handed out in order of first appearance, starting at 0, so the
    result has exactly one entry per distinct element.

    Args:
        lists: An iterable of iterables of hashable elements.

    Returns:
        A dict from element to its id.
    """
    index_of = {}
    for sequence in lists:
        for item in sequence:
            # setdefault leaves an already-seen item alone; a new item gets
            # the next free id, which equals the current size of the dict.
            index_of.setdefault(item, len(index_of))
    return index_of