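# loadModule.py
# Fine-tunes the pretrained ERNIE module with PaddleHub's legacy (1.x) Finetune
# API for 3-class Weibo sentiment classification, then predicts on the test set
# and writes a submission file to data/result.csv.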
import numpy as np
import paddlehub as hub
import pandas as pd
from paddlehub.dataset.base_nlp_dataset import BaseNLPDataset


class MyDataset(BaseNLPDataset):
    """Custom Weibo sentiment dataset with the label set {-1, 0, 1}."""

    def __init__(self):
        # Directory holding the dataset files
        self.dataset_dir = "data"
        super(MyDataset, self).__init__(
            base_path=self.dataset_dir,
            train_file="train2.txt",
            dev_file="valid2.txt",
            test_file="valid2.txt",  # no separate test split: reuse the dev file
            train_file_with_header=False,
            dev_file_with_header=False,
            test_file_with_header=False,
            # All labels that appear in the dataset
            label_list=["-1", "0", "1"])


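# Load the pretrained ERNIE module; its vocab and tokenizer feed the reader below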
module = hub.Module(name="ernie")
dataset = MyDataset()
# Build the data reader: tokenizes text and pads/truncates each example to max_seq_len
reader = hub.reader.ClassifyReader(
    dataset=dataset,
    vocab_path=module.get_vocab_path(),
    sp_model_path=module.get_spm_path(),
    word_dict_path=module.get_word_dict_path(),
    max_seq_len=128)
# Fine-tuning strategy: Adam with weight decay and linear learning-rate warmup
strategy = hub.AdamWeightDecayStrategy(
    weight_decay=0.01,
    warmup_proportion=0.1,
    learning_rate=5e-5)
# Run configuration
config = hub.RunConfig(
    use_cuda=True,
    use_data_parallel=False,
    num_epoch=1,
    checkpoint_dir="model",
    batch_size=5,
    eval_interval=100,
    enable_memory_optim=True,
    strategy=strategy)
# Get the module's program context: input placeholders, outputs, and the pretrained program
inputs, outputs, program = module.context(
    trainable=True, max_seq_len=128)
# Use "pooled_output" for classification tasks on an entire sentence.
pooled_output = outputs["pooled_output"]
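# The feed order must match what ClassifyReader yields for ERNIE-style models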
feed_list = [
    inputs["input_ids"].name,
    inputs["position_ids"].name,
    inputs["segment_ids"].name,
    inputs["input_mask"].name,
]
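# Assemble the classification task: a classifier head on top of pooled_output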
cls_task = hub.TextClassifierTask(
    data_reader=reader,
    feature=pooled_output,
    feed_list=feed_list,
    num_classes=dataset.num_labels,
    config=config,
    metrics_choices=["f1"])
# Fine-tune on the train split, evaluating on the dev split every eval_interval steps
run_states = cls_task.finetune_and_eval()
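# Invert the reader's label map so predicted class indices map back to label strings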
inv_label_map = {val: key for key, val in reader.label_map.items()}
# Data to be predicted: the Weibo post text column ('微博中文内容')
test = pd.read_csv('data/nCov_10k_test_u.csv', engine='python', encoding='utf-8')
data = test[['微博中文内容']].fillna(' ').values.tolist()
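# Run inference; each run_state's run_results[0] holds the per-class scores for one batch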
run_states = cls_task.predict(data=data)
results = [run_state.run_results for run_state in run_states]
# Generate predictions: stack the per-batch scores and take the argmax per example
proba = np.vstack([r[0] for r in results])
prediction = list(np.argmax(proba, axis=1))
prediction = [inv_label_map[p] for p in prediction]
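# Build the submission: one row per Weibo id with its predicted label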
submission = pd.DataFrame()
submission['id'] = test['微博id'].values
submission['id'] = submission['id'].astype(str) + ' '  # ids as strings; the trailing space presumably guards against spreadsheet reformatting
submission['y'] = prediction
np.save('data/proba.npy', proba)  # also keep the raw class scores
submission.to_csv('data/result.csv', index=False)
print(submission.head())