#!/usr/bin/python
#### Original Perl Script
# conlleval: evaluate result of processing CoNLL-2000 shared task
# usage: conlleval [-l] [-r] [-d delimiterTag] [-o oTag] < file
# README: http://cnts.uia.ac.be/conll2000/chunking/output.html
# options: l: generate LaTeX output for tables like in
# http://cnts.uia.ac.be/conll2003/ner/example.tex
# r: accept raw result tags (without B- and I- prefix;
# assumes one word per chunk)
# d: alternative delimiter tag (default is single space)
# o: alternative outside tag (default is O)
# note: the file should contain lines with items separated
# by $delimiter characters (default space). The final
# two items should contain the correct tag and the
# guessed tag in that order. Sentences should be
# separated from each other by empty lines or lines
# with $boundary fields (default -X-).
# url: http://lcg-www.uia.ac.be/conll2000/chunking/
# started: 1998-09-25
# version: 2004-01-26
# author: Erik Tjong Kim Sang <[email protected]>
#### Now in Python
# author: sighsmile.github.io
# version: 2017-05-18
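#
# Illustrative usage (file name and token/tag values below are made-up examples,
# not part of the original script): each input line carries a token, any number
# of intermediate columns, and finally the correct tag and the guessed tag, e.g.
#
#   United   B-LOC  B-LOC
#   Nations  I-LOC  I-LOC
#   official O      O
#
# and the script is typically run on such output piped to stdin:
#
#   python conlleval.py < predictions.txt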
from __future__ import division, print_function, unicode_literals
import argparse
import sys
import re
from collections import defaultdict
# command-line argument parsing
def parse_args():
argparser = argparse.ArgumentParser()
argparser.add_argument(
"-l", "--latex",
default=False, action="store_true",
help="generate LaTeX output"
)
argparser.add_argument(
"-r", "--raw",
default=False, action="store_true",
help="accept raw result tags"
)
argparser.add_argument(
"-d", "--delimiter",
default=None,
help="alternative delimiter tag (default: single space)"
)
argparser.add_argument(
"-o", "--oTag",
default="O",
help="alternative delimiter tag (default: O)"
)
args = argparser.parse_args()
return args
"""
• IOB1: I is a token inside a chunk, O is a token outside a chunk and B is the
beginning of chunk immediately following another chunk of the same Named Entity.
• IOB2: It is same as IOB1, except that a B tag is given for every token, which exists at
the beginning of the chunk.
• IOE1: An E tag used to mark the last token of a chunk immediately preceding another
chunk of the same named entity.
• IOE2: It is same as IOE1, except that an E tag is given for every token, which exists at
the end of the chunk.
• START/END: This consists of the tags B, E, I, S or O where S is used to represent a
chunk containing a single token. Chunks of length greater than or equal to two always
start with the B tag and end with the E tag.
• IO: Here, only the I and O labels are used. This therefore cannot distinguish between
adjacent chunks of the same named entity.
"""
# endOfChunk: checks if a chunk ended between the previous and current word
# arguments: previous and current chunk tags, previous and current types
# note: this code is capable of handling other chunk representations
# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
def endOfChunk(prevTag, tag, prevType, type):
"""
checks if a chunk ended between the previous and current word;
arguments: previous and current chunk tags, previous and current types
"""
    return ((prevTag == "B" and tag == "B") or
            (prevTag == "B" and tag == "O") or
            (prevTag == "I" and tag == "B") or
            (prevTag == "I" and tag == "O") or
            (prevTag == "E" and tag == "E") or
            (prevTag == "E" and tag == "I") or
            (prevTag == "E" and tag == "O") or
            (prevTag != "O" and prevTag != "." and prevType != type) or
            (prevTag == "]" or prevTag == "["))
# corrected 1998-12-22: these chunks are assumed to have length 1
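# A few hand-checked examples of endOfChunk (added for illustration only):
#   endOfChunk("I", "B", "NP", "NP") -> True   (a new B tag closes the previous chunk)
#   endOfChunk("I", "I", "NP", "NP") -> False  (the chunk continues)
#   endOfChunk("I", "I", "NP", "VP") -> True   (a type change closes the previous chunk)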
# startOfChunk: checks if a chunk started between the previous and current word
# arguments: previous and current chunk tags, previous and current types
# note: this code is capable of handling other chunk representations
# than the default CoNLL-2000 ones, see EACL'99 paper of Tjong
# Kim Sang and Veenstra http://xxx.lanl.gov/abs/cs.CL/9907006
def startOfChunk(prevTag, tag, prevType, type):
"""
checks if a chunk started between the previous and current word;
arguments: previous and current chunk tags, previous and current types
"""
    chunkStart = ((prevTag == "B" and tag == "B") or
                  (prevTag == "I" and tag == "B") or
                  (prevTag == "O" and tag == "B") or
                  (prevTag == "O" and tag == "I") or
                  (prevTag == "E" and tag == "E") or
                  (prevTag == "E" and tag == "I") or
                  (prevTag == "O" and tag == "E") or
                  (tag != "O" and tag != "." and prevType != type) or
                  (tag == "]" or tag == "["))
# corrected 1998-12-22: these chunks are assumed to have length 1
#print("startOfChunk?", prevTag, tag, prevType, type)
#print(chunkStart)
return chunkStart
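# A few hand-checked examples of startOfChunk (added for illustration only):
#   startOfChunk("O", "B", None, "NP") -> True   (B after O opens a chunk)
#   startOfChunk("B", "I", "NP", "NP") -> False  (continuation of the same chunk)
#   startOfChunk("O", "I", None, "NP") -> True   (IOB1-style chunk opening)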
def calcMetrics(TP, P, T, percent=True):
"""
compute overall precision, recall and FB1 (default values are 0.0)
if percent is True, return 100 * original decimal value
"""
precision = TP / P if P else 0
recall = TP / T if T else 0
FB1 = 2 * precision * recall / (precision + recall) if precision + recall else 0
if percent:
return 100 * precision, 100 * recall, 100 * FB1
else:
return precision, recall, FB1
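# Worked example (illustrative numbers, not real results): with TP=8 correct
# chunks, P=10 guessed chunks and T=12 gold chunks,
#   precision = 8/10 = 80.00%, recall = 8/12 = 66.67%,
#   FB1 = 2 * 0.8 * 0.6667 / (0.8 + 0.6667) = 72.73%.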
def splitTag(chunkTag, oTag="O", raw=False):
"""
Split chunk tag into IOB tag and chunk type;
return (iob_tag, chunk_type)
"""
if chunkTag == "O" or chunkTag == oTag:
tag, type = "O", None
elif raw:
tag, type = "B", chunkTag
else:
try:
# split on first hyphen, allowing hyphen in type
tag, type = chunkTag.split('-', 1)
except ValueError:
tag, type = chunkTag, None
return tag, type
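# Examples (illustration only; the tag strings are invented):
#   splitTag("B-PER")         -> ("B", "PER")
#   splitTag("I-WORK-OF-ART") -> ("I", "WORK-OF-ART")   # only the first hyphen splits
#   splitTag("O")             -> ("O", None)
#   splitTag("PER", raw=True) -> ("B", "PER")           # raw tags count as one-token chunks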
def countChunks(fileIterator, args):
"""
Process input in given format and count chunks using the last two columns;
return correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter
"""
boundary = "-X-" # sentence boundary
delimiter = args.delimiter
raw = args.raw
oTag = args.oTag
correctChunk = defaultdict(int) # number of correctly identified chunks
foundCorrect = defaultdict(int) # number of chunks in corpus per type
foundGuessed = defaultdict(int) # number of identified chunks per type
tokenCounter = 0 # token counter (ignores sentence breaks)
correctTags = 0 # number of correct chunk tags
lastType = None # temporary storage for detecting duplicates
inCorrect = False # currently processed chunk is correct until now
lastCorrect, lastCorrectType = "O", None # previous chunk tag in corpus
lastGuessed, lastGuessedType = "O", None # previously identified chunk tag
for line in fileIterator:
# each non-empty line must contain >= 3 columns
features = line.strip().split(delimiter)
        # treat blank lines as sentence breaks even when an explicit delimiter is given
        if not features or features == [""] or features[0] == boundary:
features = [boundary, "O", "O"]
elif len(features) < 3:
raise IOError("conlleval: unexpected number of features in line %s\n" % line)
# extract tags from last 2 columns
guessed, guessedType = splitTag(features[-1], oTag=oTag, raw=raw)
correct, correctType = splitTag(features[-2], oTag=oTag, raw=raw)
# 1999-06-26 sentence breaks should always be counted as out of chunk
firstItem = features[0]
if firstItem == boundary:
guessed, guessedType = "O", None
# decide whether current chunk is correct until now
if inCorrect:
            endOfCorrect = endOfChunk(lastCorrect, correct, lastCorrectType, correctType)
            endOfGuessed = endOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
if (endOfGuessed and endOfCorrect and lastGuessedType == lastCorrectType):
inCorrect = False
correctChunk[lastCorrectType] += 1
elif ( endOfGuessed != endOfCorrect or guessedType != correctType):
inCorrect = False
startOfGuessed = startOfChunk(lastGuessed, guessed, lastGuessedType, guessedType)
startOfCorrect = startOfChunk(lastCorrect, correct, lastCorrectType, correctType)
if (startOfCorrect and startOfGuessed and guessedType == correctType):
inCorrect = True
if startOfCorrect:
foundCorrect[correctType] += 1
if startOfGuessed:
foundGuessed[guessedType] += 1
if firstItem != boundary:
if correct == guessed and guessedType == correctType:
correctTags += 1
tokenCounter += 1
lastGuessed, lastGuessedType = guessed, guessedType
lastCorrect, lastCorrectType = correct, correctType
if inCorrect:
correctChunk[lastCorrectType] += 1
return correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter
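# Minimal sketch of calling countChunks directly on in-memory lines instead of a
# file (the sample tokens and tags are invented for illustration):
#
#   from argparse import Namespace
#   sample = ["United B-LOC B-LOC", "Nations I-LOC I-LOC", "official O O", ""]
#   counts = countChunks(iter(sample), Namespace(delimiter=None, raw=False, oTag="O"))
#   correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter = counts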
def evaluate(correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter, latex=False):
# sum counts
correctChunkSum = sum(correctChunk.values())
foundGuessedSum = sum(foundGuessed.values())
foundCorrectSum = sum(foundCorrect.values())
# sort chunk type names
sortedTypes = list(foundCorrect) + list(foundGuessed)
sortedTypes = list(set(sortedTypes))
sortedTypes.sort()
# print overall performance, and performance per chunk type
if not latex:
# compute overall precision, recall and FB1 (default values are 0.0)
precision, recall, FB1 = calcMetrics(correctChunkSum, foundGuessedSum, foundCorrectSum)
# print overall performance
print("processed %i tokens with %i phrases; " % (tokenCounter, foundCorrectSum), end='')
print("found: %i phrases; correct: %i.\n" % (foundGuessedSum, correctChunkSum), end='')
if tokenCounter:
print("accuracy: %6.2f%%; " % (100*correctTags/tokenCounter), end='')
print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
(precision, recall, FB1))
for i in sortedTypes:
precision, recall, FB1 = calcMetrics(correctChunk[i], foundGuessed[i], foundCorrect[i])
print("%17s: " %i , end='')
print("precision: %6.2f%%; recall: %6.2f%%; FB1: %6.2f" %
(precision, recall, FB1), end='')
print(" %d" % foundGuessed[i])
# generate LaTeX output for tables like in
# http://cnts.uia.ac.be/conll2003/ner/example.tex
else:
print(" & Precision & Recall & F\$_{\\beta=1} \\\\\\hline", end='')
for i in sortedTypes:
precision, recall, FB1 = calcMetrics(correctChunk[i], foundGuessed[i], foundCorrect[i])
print("\n%-7s & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\" %
(i,precision,recall,FB1), end='')
print("\\hline")
precision, recall, FB1 = calcMetrics(correctChunkSum, foundGuessedSum, foundCorrectSum)
print("Overall & %6.2f\\%% & %6.2f\\%% & %6.2f \\\\\\hline" %
(precision,recall,FB1))
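# With the default (non-LaTeX) output, evaluate prints lines shaped like the
# following (the numbers are placeholders chosen only to be internally consistent,
# not real results):
#   processed 100 tokens with 20 phrases; found: 18 phrases; correct: 15.
#   accuracy:  90.00%; precision:  83.33%; recall:  75.00%; FB1:  78.95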
if __name__ == "__main__":
args = parse_args()
# process input and count chunks
correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter = countChunks(sys.stdin, args)
#file_object = open('neuroner_folia_out.txt')
#lines = file_object.readlines()
#correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter = countChunks(lines, args)
# compute metrics and print
    evaluate(correctChunk, foundGuessed, foundCorrect, correctTags, tokenCounter, latex=args.latex)
sys.exit(0)