aceconv2.py
from lxml import etree
import xml.etree.ElementTree as ET
import sys


## parses an ACE .apf.xml annotation file together with its raw .sgm source
## returns the document text plus parallel lists over the NAM entity mentions:
## their surface strings, their character start indexes, and their entity types
def xmltointernam(xmlfile, rawfile, entypes):
    tree = ET.parse(xmlfile)
    treeraw = etree.parse(rawfile)
    rootraw = treeraw.getroot()
    root = tree.getroot()
    names = []
    inds = []
    typel = []
    for doc in root.findall('document'):
        for entity in doc.findall('entity'):
            type1 = entity.get('TYPE')
            if type1 in entypes:
                mentions = entity.findall('entity_mention')
                for ment in mentions:
                    mentype1 = ment.get("TYPE")
                    if mentype1 == "NAM":  ## keep only proper-name mentions
                        extent1 = ment.find("head")
                        charseq1 = extent1.find("charseq")
                        names.append(charseq1.text)
                        inds.append(int(charseq1.get('START')) - 1)  ## shift by one to match the stripped leading char
                        typel.append(type1)
    ## collect the raw document text from the .sgm file
    tex = [x for x in rootraw.itertext()]
    tex[0] = tex[0][1:]  ## drop the leading character so the offsets line up
    text1 = ""
    for t in tex:
        text1 = text1 + t
    text1 = text1.replace("\n", " ")
    return text1, names, inds, typel
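## Illustrative shape of the return values (hypothetical document, not from the ACE data):
##   text1 -> the .sgm text with newlines replaced by spaces
##   names -> ["John Smith", "Ankara"]; typel -> ["PER", "GPE"]
##   inds  -> the character offset of each name inside text1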
## given character start indexes into text1
## returns the corresponding whitespace-token index for each one
def chartotokinds(text1, inds):
    tokids = []
    for i in range(len(inds)):
        str1 = text1[:inds[i] - 1]  ## the text preceding the mention
        str1s = str1.split()
        tokids.append(len(str1s))  ## tokens before the mention = its token index
    return tokids
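## Illustrative mapping (hypothetical sentence, not from the ACE data):
##   chartotokinds("Today John Smith arrived", [6]) -> [1]
## the characters before index 6 split into one token, so "John" is token 1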
## for multi-word entities
## add the indexes of the non-starting tokens
## so that they are tagged as well
def expandtokinds(tokinds, toks, typel):
    exptokinds = []
    exptypel = []
    for i in range(len(tokinds)):
        len1 = len(toks[i].split())  ## number of tokens in this entity
        for l in range(len1):
            exptokinds.append(tokinds[i] + l)
            exptypel.append(typel[i])
    return exptokinds, exptypel
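## Illustrative call (hypothetical values):
##   expandtokinds([1], ["John Smith"], ["PER"]) -> ([1, 2], ["PER", "PER"])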
## given the .apf.xml and .sgm file paths as input
## appends the document to outfile in wapiti (token-per-line) format
## considering only the entity types in the 'types' list
## if the outtyp parameter is 0, every entity token is tagged '1' and the rest 'O'
## otherwise every entity token is tagged with its own entity type
def xmltowapiti(xmlfile, rawfile, types, outfile, outtyp):
    text1, ents, inds, typel = xmltointernam(xmlfile, rawfile, types)
    tokinds = chartotokinds(text1, inds)
    exptokinds, exptypel = expandtokinds(tokinds, ents, typel)
    with open(outfile, "a") as out:  ## append mode: the corpus accumulates across documents
        ts = text1.split()
        for i in range(len(ts)):
            out.write(ts[i] + " ")
            ind1 = findind(exptokinds, i)
            if ind1 != -1:
                if outtyp == 0:
                    out.write("1\n")
                else:
                    out.write(exptypel[ind1] + "\n")
            else:
                out.write("O \n")
## for this function to work
## the path of the file.tbl file must be given
## returns the base names of the files in the adj folders for English
def filenamelist(base, filelistfile):
    filenames = open(filelistfile).readlines()
    englishnames = []
    for name in filenames:
        if "data/English" in name and ".apf.xml" in name and ".score" not in name and "adj" in name:
            englishnames.append(base + name[:name.find(".apf.xml")])
    return englishnames
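## Illustrative file.tbl entry (directory names are only indicative; check
## the file.tbl shipped with your ACE release):
##   data/English/bnews/adj/ABC19980108.1830.0711.apf.xml ...
## -> englishnames entry: base + "data/English/bnews/adj/ABC19980108.1830.0711"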
##converts & in documents to &
## this step is required for the .sgm files
## output files have the suffix .sgm1
def andsignupdate(filenames):
for name in filenames:
name1 = name + ".sgm"
read1 = open(name1).read()
write1 = open(name1 + "1", "w")
for x in read1:
if x == "&":
write1.write("&")
else:
write1.write(x)
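## Illustrative effect: a raw line "AT&T said..." in file.sgm becomes
## "AT&amp;T said..." in file.sgm1, so etree.parse() no longer fails on it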
## for all English documents
## create the corpus in wapiti format
def tokenperlineCorpus(base, filenames, outfilename, tagver, tagtypes):
    count = 0
    for doc in filenames:
        count += 1
        xmltowapiti(doc + ".apf.xml", doc + ".sgm1", tagtypes, outfilename, tagver)
    print("%d files are converted" % count)
## returns the position of a given value in a list, or -1 if it is absent
def findind(l1, ind1):
    for i in range(len(l1)):
        if ind1 == l1[i]:
            return i
    return -1
def ACEtoTokenPerLine(base, filelistfile, outfilename, tagtypes, tagver):
    filenames = filenamelist(base, filelistfile)
    andsignupdate(filenames)
    tokenperlineCorpus(base, filenames, outfilename, tagver, tagtypes)
## an alternative set of default parameters; unused by the main script below
def defaultaddr():
    base1 = "ACE/aceCorp/"
    filelistfile1 = "ACE/aceCorp/docs/file.tbl"
    outfilename1 = "acetowapCorpus2"
    tagtypes1 = ["ORG", "LOC", "GPE", "PER"]
    tagver1 = 0
    return base1, filelistfile1, outfilename1, tagtypes1, tagver1
## given the filenames and a token-per-line corpus
## finds the line indexes of each file's metadata, since they add noise to the data
## for now only DOCID, DOCTYPE, DATETIME and BODY/HEADLINE are considered
## other metadata such as speaker names are not taken into account
def metadatainds(filenames, corpusname):
    filenames = filenamelist("ACE/aceCorp/", filenames)
    corpus = open(corpusname).readlines()
    count = 0
    indslist = []  ## stores all the line indexes to be deleted
    for name in filenames:
        tree = etree.parse(name + ".sgm1")
        root = tree.getroot()
        docid = root.find("DOCID").text
        doctype = root.find("DOCTYPE").text
        doctime = root.find("DATETIME").text
        body = root.find("BODY")
        headl = body.find("HEADLINE")
        metadata = [docid, doctype, doctime]
        if headl is not None:
            count += 1
            metadata.append(headl.text)
        metadatas = []  ## the metadata tokens in one flat list
        for md in metadata:
            for mds in md.split():
                metadatas.append(mds)
        f = findfirstocc(metadatas, corpus)
        if f != -1:  ## skip files whose metadata is not found in the corpus
            inds = [f + x for x in range(len(metadatas))]
            for i in inds:
                indslist.append(i)
    return indslist
## copies the corpusname file to outname
## skipping the lines whose indexes are in inds
## used for deleting metadata but can serve other purposes
def deletedata(corpusname, inds, outname):
    corp = open(corpusname).readlines()
    inds = set(inds)  ## set membership keeps the copy loop linear
    with open(outname, "w") as outfile:
        for i in range(len(corp)):
            if i not in inds:
                outfile.write(corp[i])
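## Illustrative call (hypothetical file names):
##   deletedata("corpus.txt", [0, 1, 2], "clean.txt")
## copies corpus.txt to clean.txt without its first three lines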
## deletes the metadata using the corpus and the files in the filenamelist
def deletemetadata(corpusname, filenamelist, outputfilename):
    metainds = metadatainds(filenamelist, corpusname)
    deletedata(corpusname, metainds, outputfilename)
## given a list of tokens
## finds the start of their first consecutive occurrence in the corpus
## returns -1 when the sequence does not occur; used when deleting metadata
def findfirstocc(metadata, corpus):
    for i in range(len(corpus) - len(metadata) + 1):  ## avoid running past the end
        c = 0
        for j in range(len(metadata)):
            lines = corpus[i + j].split()
            if lines[0] != metadata[j]:
                break
            else:
                c += 1
        if c == len(metadata):
            return i
    return -1
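## Illustrative call (hypothetical corpus lines of "token tag" pairs):
##   findfirstocc(["CNN", "2000-10-03"], ["foo O\n", "CNN O\n", "2000-10-03 O\n"]) -> 1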
## parses the command-line arguments into the conversion parameters
def assignvars(args):
    base1 = args[1]
    filelistfile1 = args[2]
    outfilename1 = args[3]
    tagtypes1 = list(args[4:-1])
    tagver1 = int(args[-1])  ## compare against 0 as a number, not a string
    if base1[-1] != "/":
        base1 += "/"
    return base1, filelistfile1, outfilename1, tagtypes1, tagver1
base1 = "ACE/aceCorp/"
filelistfile1 = "ACE/aceCorp/docs/file.tbl"
outfilename1 = "acetotokenCorpus"
tagtypes1 = ["ORG", "LOC", "GPE"]
tagver1 = 0
args = sys.argv
if len(args) > 1:
if args[1] == "-h":
print("python3 aceconv.py base filelistfile outfilename tagtypes tagver | for custom mode")
print("python3 aceconv.py def | for default mode")
elif args[1] == "def":
print("running on default mode without metadata")
print("base: ACE/aceCorp/")
print("filelist address: ACE/aceCorp/docs/file.tbl")
print("output file name: wometaCorpus")
print("Tag types: ORG LOC GPE")
print("Tagging version: binary")
ACEtoTokenPerLine(base1, filelistfile1, outfilename1, tagtypes1, tagver1)
deletemetadata(outfilename1, filelistfile1, "wometaCorpus")
else:
tagtypes1 = []
base1 = args[1]
filelistfile1 = args[2]
outfilename1 = args[3]
for x in args[4:-1]:
tagtypes1.append(x)
tagver1 = args[-1]
if base1[-1] != "/":
base1 += "/"
ACEtoTokenPerLine(base1, filelistfile1, outfilename1, tagtypes1, tagver1)
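## Example invocations (the paths and output name are illustrative):
##   python3 aceconv2.py def
##   python3 aceconv2.py ACE/aceCorp/ ACE/aceCorp/docs/file.tbl myCorpus ORG GPE PER 1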