-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmake_merged_data_ready_for_evaluation.py
53 lines (41 loc) · 1.64 KB
/
make_merged_data_ready_for_evaluation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
from utils import read_file
from utils import write_file
# Merged-data input files, listed train-before-test and, within each split,
# subtask1 before subtask2. The order matters: this list is zipped
# position-by-position with `out_filepaths` further down.
in_filepaths = [
    '../dataset/merged_data_{}_{}.tsv'.format(subtask, split)
    for split in ('train', 'test')
    for subtask in ('subtask1', 'subtask2')
]
# Output files, one per input and in the same order as `in_filepaths`
# (train-before-test; subtask1 before subtask2 within each split).
out_filepaths = [
    'merged_data_{}_{}_ready_for_evaluation.tsv'.format(subtask, split)
    for split in ('train', 'test')
    for subtask in ('subtask1', 'subtask2')
]
# Rewrite each merged-data TSV into the five-column layout
# (doc_id, start_span, end_span, entity_type, code) and write it to the
# matching output path.
for in_fp, out_fp in zip(in_filepaths, out_filepaths):
    # The sub-task is a property of the file, not of a single row, so the
    # decision is taken once per file rather than once per line.
    is_subtask1 = 'subtask1' in in_fp
    if not is_subtask1 and 'subtask2' not in in_fp:
        # Explicit raise instead of `assert`: asserts are removed under
        # `python -O`, which would let an unknown file be parsed as subtask2.
        raise ValueError(
            "Cannot tell the sub-task from the file path "
            "(expected 'subtask1' or 'subtask2' in it): {!r}".format(in_fp)
        )

    # The first line of the input is skipped (presumably a header row).
    tsv_lines = read_file(in_fp).splitlines()[1:]

    # Rows are collected in a list and joined once at the end, instead of
    # growing one string with `+=` on every row.
    out_rows = ['{}\t{}\t{}\t{}\t{}\n'.format(
        'doc_id', 'start_span', 'end_span', 'entity_type', 'code')]

    for line in tsv_lines:
        fields = line.strip().split('\t')
        # Both layouts must have exactly six tab-separated fields; any other
        # count raises ValueError from the unpacking below.
        if is_subtask1:
            filename, _ann_id, label, start_span, end_span, _text = fields
            # Subtask 1 rows carry no code column, so a placeholder is used.
            code = 'NO_CODE'
        else:
            filename, label, start_span, end_span, _text, code = fields
        out_rows.append('{}\t{}\t{}\t{}\t{}\n'.format(
            filename, start_span, end_span, label, code))

    write_file(out_fp, ''.join(out_rows))
# Report the files that were written. The names are taken from
# `out_filepaths` rather than repeated by hand, so the message cannot drift
# out of sync with what the loop above actually produced.
summary_lines = (
    ['', 'Success!', 'The following files were created:']
    + list(out_filepaths)
    + ['These gold standard files can be used in the evaluation.py script.', '']
)
print('\n'.join(summary_lines))