-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathread_CD_file.py
86 lines (79 loc) · 3.27 KB
/
read_CD_file.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
import numpy as np
def _finalize_sentence(sentence, cues, mw_cues, scopes, events):
    """Attach head POS tags and negation annotations to a finished sentence.

    ``sentence`` must contain only token entries (integer keys) when this is
    called; the annotation keys are added here.  Returns the same dict.
    """
    for token in sentence.values():
        head_index = int(token['head']) - 1
        # A head value below 1 has no token to point at, so the token's own
        # POS (column 5) is used instead of the head's.
        source = sentence[head_index] if head_index > -1 else token
        token['head-pos'] = source[5]

    # Make sure every cue position has a scope entry, even an empty one.
    for cue_index in range(len(cues)):
        scopes.setdefault(cue_index, [])

    sentence['cues'] = cues
    sentence['mw_cues'] = mw_cues
    sentence['scopes'] = scopes
    sentence['events'] = events
    sentence['neg'] = len(cues) > 0
    return sentence


def read_CD_file(filename, conll_filename):
    """Read a negation-annotated token file together with its parsed twin.

    Both files are read in lockstep, one whitespace-separated token per line,
    with an empty line closing each sentence.  From ``filename`` columns 0-5
    are stored per token (values equal to ``"_"`` are skipped), column 6 is
    ignored, and the columns from 7 onwards are read as repeating triples of
    (cue, scope, event); ``"_"`` and ``"***"`` mean "nothing here".  From
    ``conll_filename`` column 6 is stored as ``'head'`` and column 7 as
    ``'deprel'``.
    NOTE(review): the layout looks like the CD-SCO / CoNLL formats (column 3
    word form, column 5 POS) - confirm against the data.

    Returns a list with one dict per sentence.  Integer keys ``0..n-1`` map
    to token dicts (columns 0-5 plus ``'head'``, ``'deprel'``, ``'head-pos'``)
    and the string keys hold the annotations:

    * ``'cues'``: ``[cue_string, token_index, tag]`` lists; the tag is ``'s'``
      when the cue equals column 3 of its token, ``'a'`` when it differs and
      ``'m'`` when the same cue column is filled on several tokens.
    * ``'mw_cues'``: ``[cue_string, token_index]`` for every part of a
      multi-token cue, each part listed once.
    * ``'scopes'``: triple index -> list of ``[token_string, token_index]``.
    * ``'events'``: triple index -> event string (the last one seen wins).
    * ``'neg'``: ``True`` when the sentence has at least one cue.

    Triple indices are always ``int``.  An empty line produces an instance
    even when no token preceded it; a sentence that is still open at end of
    file (no trailing empty line) is returned as well.

    Raises ``IndexError`` when a line of ``conll_filename`` that accompanies
    a token has fewer than eight columns (this includes the file being
    shorter than ``filename``).
    """
    instances = []
    with open(filename, 'r') as infile1, open(conll_filename) as infile2:
        sentence = {}
        cues = []
        mw_cues = []
        scopes = {}
        events = {}
        counter = 0  # index of the current token inside its sentence
        prev_cue_column = -1  # column of the most recently added cue

        for line in infile1:
            # Always consume the parallel line so both files stay aligned.
            conll_tokens = infile2.readline().split()
            tokens = line.split()

            # An empty line closes the current sentence.
            if not tokens:
                instances.append(
                    _finalize_sentence(sentence, cues, mw_cues, scopes, events)
                )
                sentence = {}
                cues = []
                mw_cues = []
                scopes = {}
                events = {}
                counter = 0
                prev_cue_column = -1
                continue

            token_dict = {}
            for i, value in enumerate(tokens):
                if value == "_":
                    continue
                if i < 6:
                    token_dict[i] = value
                    continue
                if i == 6 or value == "***":
                    continue

                # Columns 7, 8, 9 form triple 0; 10, 11, 12 triple 1; etc.
                # divmod keeps the triple index an int on Python 2 and 3.
                group, role = divmod(i - 7, 3)
                if role == 0:
                    if i == prev_cue_column:
                        # Same column as the latest cue: this token is a
                        # further part of that cue.  Record the first part
                        # only once, however many parts follow.
                        first_part = cues[-1]
                        if first_part[2] != 'm':
                            first_part[2] = 'm'
                            mw_cues.append([first_part[0], first_part[1]])
                        mw_cues.append([value, counter])
                    else:
                        tag = 's' if value == tokens[3] else 'a'
                        cues.append([value, counter, tag])
                        prev_cue_column = i
                elif role == 1:
                    scopes.setdefault(group, []).append([value, counter])
                else:
                    events[group] = value

            token_dict['head'] = conll_tokens[6]
            token_dict['deprel'] = conll_tokens[7]
            sentence[counter] = token_dict
            counter += 1

        # Do not lose the last sentence when the file lacks a final
        # empty line.
        if sentence:
            instances.append(
                _finalize_sentence(sentence, cues, mw_cues, scopes, events)
            )
    return instances