#!/usr/bin/env python3

import json
import re
import sys
from argparse import ArgumentParser, FileType
from collections import defaultdict
from itertools import chain

from lexcatter import supersenses_for_lexcat, ALL_LEXCATS
from mwerender import render, render_sent
from streuseval import parse_mwe_links, form_groups
from supersenses import ancestors
from tagging import sent_tags

desc = \
"""
Defines a function to read a .UDlextag file sentence-by-sentence into a data structure,
unpacking the lextags into structured lexical annotations.
If the script is called directly, outputs the data as JSON.

Adapted from conllulex2json.py.
See conllulex2UDlextag.py for an explanation of the .UDlextag format.

@author: Nathan Schneider (@nschneid)
@since: 2019-06-20
"""
def load_sents(inF, morph_syn=True, misc=True, ss_mapper=None, validate_pos=True, validate_type=True):
    """Given a .UDlextag file (or iterable over lines), return an iterator over sentences.

    @param morph_syn: Whether to include CoNLL-U morphological features
    and syntactic dependency relations, if available.
    POS tags and lemmas are always included.
    @param misc: Whether to include the CoNLL-U miscellaneous column, if available.
    @param ss_mapper: A function to apply to supersense labels to replace them
    in the returned data structure. Applies to all supersense labels (nouns,
    verbs, prepositions). Not applied if the supersense slot is empty.
    @param validate_pos: Validate consistency of lextag with UPOS
    @param validate_type: Validate SWE-specific or SMWE-specific tags only apply to the corresponding MWE type
    """
    lc_tbd = 0

    def _unpack_lextags(sent):
        """At this point the sentence will be a collection of tokens, which will have
        lextags but no other STREUSLE info. The parts of the lextag have been parsed
        into tok['_lextag']."""

        # Infer MWE groupings from lextags
        mweflags = [tok['_lextag']['mweflag'] for tok in sent['toks'] if '_lextag' in tok]
        mweflags = ['O'] + mweflags    # so token offsets in groups are 1-based
        links = parse_mwe_links(mweflags)
        sgroups = form_groups([(a,b) for a,b,s in links if s=='_'])
        maxgroups = form_groups([(a,b) for a,b,s in links])    # maximal groups: treat weak links like strong links
        wgroups = [g for g in maxgroups if g not in sgroups]
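
        # Sketch on a hypothetical tag sequence: for MWE flags B I_ I~ O,
        # parse_mwe_links() should yield a strong link between tokens 1 and 2 ('_')
        # and a weak link between tokens 2 and 3 ('~'); the strong groups are then
        # {1,2}, the maximal groups {1,2,3}, and {1,2,3} survives as a weak group
        # because it is not itself a strong group.
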
        # Register strong, then weak MWEs in data structure
        # Ordering MWEs by first token offset (tiebreaker to strong MWE):
        xgroups = [(min(sg),'s',sg) for sg in sgroups] + [(min(wg),'w',wg) for wg in wgroups]
        mwe_group = 1
        for tok1Num,x,g in sorted(xgroups):
            sent[x+'mwes'][mwe_group]['lexlemma'] = ''
            for mwe_position,tokNum in enumerate(sorted(g), start=1):
                sent['toks'][tokNum-1][x+'mwe'] = mwe_group, mwe_position
                sent[x+'mwes'][mwe_group]['toknums'].append(tokNum)
                if sent['toks'][tokNum-1]['lemma']!='_':    # goeswith is the exception
                    sent[x+'mwes'][mwe_group]['lexlemma'] += ' ' + sent['toks'][tokNum-1]['lemma']
            sent[x+'mwes'][mwe_group]['lexlemma'] = sent[x+'mwes'][mwe_group]['lexlemma'][1:]    # delete leading space
            #assert ' ' in sent[x+'mwes'][mwe_group]['lexlemma']    # TODO: may need to be commented out to allow for goeswith MWEs
            mwe_group += 1
        del mwe_group

        # Deal with single-word expressions, and move lexcat/supersenses
        # from the token to the lexical expression
        for tok in sent['toks']:
            assert '_lextag' in tok

            if not tok['smwe']:    # token not part of a strong MWE
                tokNum = tok['#']
                sent['swes'][tokNum]['lexlemma'] = tok['lemma']
                assert ' ' not in sent['swes'][tokNum]['lexlemma']
                sent['swes'][tokNum]['toknums'].append(tokNum)

            if tok['wmwe'] and tok['wmwe'][1]==1:    # first token in weak MWE
                #assert tok['wcat'] and tok['wcat']!='_'    # eventually it would be good to have a category for every weak expression
                sent['wmwes'][tok['wmwe'][0]]['lexcat'] = tok['wcat'] if tok['_lextag'].get('wcat') else None

            if tok['_lextag']['lexcat']:    # first token in a strong expression (SW or MW)
                einfo = tok['_lextag']
                assert einfo['lexcat']!='_',einfo
                # place to unpack lexcat/supersense info to (lexlemma is already unpacked)
                dest = sent['smwes'][tok['smwe'][0]] if tok['smwe'] else sent['swes'][tok['#']]
                dest['lexcat'] = einfo['lexcat']
                dest['ss'] = ss_mapper(einfo['ss']) if einfo['ss']!='_' else None
                dest['ss2'] = ss_mapper(einfo['ss2']) if einfo['ss2']!='_' else None
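                # For adpositions/possessives (SNACS), 'ss' is the scene role and
                # 'ss2' the function of the construal; for nouns and verbs only a
                # single supersense is used, so 'ss2' is None.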

        for swe in sent['swes'].values():
            assert len(swe['toknums'])==1,swe
        for smwe in sent['smwes'].values():
            assert smwe['toknums']
        for wmwe in sent['wmwes'].values():
            assert wmwe['toknums']

        for tok in sent['toks']:
            del tok['_lextag']
            if not tok['smwe']:
                assert sent['swes'][tok['#']]['lexcat'],sent['swes']
            else:
                assert sent['smwes'][tok['smwe'][0]]['lexcat'],sent['smwes']
    def _postproc_sent(sent):
        nonlocal lc_tbd

        assert 'sent_id' in sent,sent

        # check that tokens are numbered from 1, in order
        for i,tok in enumerate(sent['toks'], 1):
            assert tok['#']==i

        # check that MWEs are numbered from 1 based on first token offset
        xmwes =  [(e["toknums"][0], 's', mwenum) for mwenum,e in sent['smwes'].items()]
        xmwes += [(e["toknums"][0], 'w', mwenum) for mwenum,e in sent['wmwes'].items()]
        xmwes.sort()
        for k,mwe in chain(sent['smwes'].items(), sent['wmwes'].items()):
            assert xmwes[int(k)-1][2]==k,f"In {sent['sent_id']}, MWEs are not numbered in the correct order: use normalize_mwe_numbering.py to fix"

        # check that lexical & weak MWE lemmas are correct
        lexes_to_validate = chain(sent['swes'].values(), sent['smwes'].values()) if validate_type else []
        for lexe in lexes_to_validate:
            sent['toks'][lexe['toknums'][0]-1]    # no-op lookup: fails fast with an IndexError if the first token offset is out of range
            assert lexe['lexlemma']==' '.join(lem for i in lexe['toknums'] for lem in [sent['toks'][i-1]['lemma']] if lem!='_'),f"In {sent['sent_id']}, MWE lemma is incorrect: {lexe} vs. {sent['toks'][lexe['toknums'][0]-1]}"
            lc = lexe['lexcat']
            if lc.endswith('!@'):
                lc_tbd += 1
            valid_ss = supersenses_for_lexcat(lc)
            if lc=='V':
                assert len(lexe['toknums'])==1,f'In {sent["sent_id"]}, Verbal MWE "{lexe["lexlemma"]}" lexcat must be subtyped (V.VID, etc., not V)'
            ss, ss2 = lexe['ss'], lexe['ss2']
            if valid_ss:
                if ss=='??':
                    assert ss2 is None
                elif ss not in valid_ss or (lc in ('N','V') or lc.startswith('V.'))!=(ss2 is None) or (ss2 is not None and ss2 not in valid_ss):
                    assert False,f"In {sent['sent_id']}, invalid supersense(s) in lexical entry: {lexe}"
                elif ss.startswith('p.'):
                    assert ss2.startswith('p.')
                    assert ss2 not in {'p.Experiencer', 'p.Stimulus', 'p.Originator', 'p.Recipient', 'p.SocialRel', 'p.Org', 'p.OrgMember', 'p.Ensemble', 'p.QuantityValue'},(f'{ss2} should never be function',lexe)
                    if ss!=ss2:
                        ssA, ss2A = ancestors(ss), ancestors(ss2)
                        # there are just a few permissible combinations where one is the ancestor of the other
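                        # (e.g. p.Circumstance ~> p.Locus is whitelisted below even
                        # though one label dominates the other in the hierarchy)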
                        if (ss,ss2) not in {('p.Circumstance','p.Locus'), ('p.Circumstance','p.Path'),
                                            ('p.Locus','p.Goal'), ('p.Locus','p.Source'),
                                            ('p.Characteristic','p.Stuff'),
                                            ('p.Whole','p.Gestalt'), ('p.Org', 'p.Gestalt'),
                                            ('p.QuantityItem','p.Gestalt'), ('p.Goal','p.Locus')}:
                            assert ss not in ss2A,f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
                            assert ss2 not in ssA,f"In {sent['sent_id']}, unexpected construal: {ss} ~> {ss2}"
            else:
                assert ss is None and ss2 is None and lc not in ('N', 'V', 'P', 'INF.P', 'PP', 'POSS', 'PRON.POSS'),lexe
        # check lexcat on single-word expressions
        for swe in sent['swes'].values():
            tok = sent['toks'][swe['toknums'][0]-1]
            upos, xpos = tok['upos'], tok['xpos']
            lc = swe['lexcat']
            if lc.endswith('!@'):
                continue
            if lc not in ALL_LEXCATS:
                assert not validate_type,f"In {sent['sent_id']}, invalid lexcat {lc} for single-word expression '{tok['word']}'"
                continue
            if validate_pos and upos!=lc and lc!='PP' and (upos,lc) not in {('NOUN','N'),('PROPN','N'),('VERB','V'),
                                                                            ('ADP','P'),('ADV','P'),('SCONJ','P'),
                                                                            ('ADP','DISC'),('ADV','DISC'),('SCONJ','DISC'),
                                                                            ('PART','POSS')}:
                # most often, the single-word lexcat should match its upos;
                # check a list of exceptions
                mismatchOK = False
                if xpos=='TO' and lc.startswith('INF'):
                    mismatchOK = True
                elif (xpos=='TO')!=lc.startswith('INF'):
                    assert upos=='SCONJ' and swe['lexlemma']=='for',(sent['sent_id'],swe,tok)
                    mismatchOK = True

                if (upos in ('NOUN', 'PROPN'))!=(lc=='N'):
                    #try:
                    assert upos in ('SYM','X') or (lc in ('PRON','DISC')),(sent['sent_id'],swe,tok)
                    #except AssertionError:
                    #    print('Suspicious lexcat/POS combination:', sent['sent_id'], swe, tok, file=sys.stderr)
                    mismatchOK = True

                message = f"In {sent['sent_id']}, single-word expression '{tok['word']}' has lexcat {lc}, which is incompatible with its upos {upos}"
                if (upos=='AUX')!=(lc=='AUX'):
                    assert tok['lemma']=='be' and lc=='V',message    # copula has upos=AUX
                    mismatchOK = True
                if (upos=='VERB')!=(lc=='V'):
                    if lc=='ADJ':
                        print('Word treated as VERB in UD, ADJ for supersenses:', sent['sent_id'], tok['word'], file=sys.stderr)
                    else:
                        assert tok['lemma']=='be' and lc=='V',message    # copula has upos=AUX
                    mismatchOK = True
                if upos=='PRON':
                    assert lc=='PRON' or lc=='PRON.POSS',message
                    mismatchOK = True
                if lc=='ADV':
                    assert upos=='ADV' or upos=='PART',message    # PART is for negations
                    mismatchOK = True
                if upos=='ADP' and lc=='CCONJ':
                    assert tok['lemma']=='versus'
                    mismatchOK = True
                assert mismatchOK,message
            if validate_type:
                assert lc!='PP',f"In {sent['sent_id']}, PP should only apply to strong MWEs, but occurs for single-word expression '{tok['word']}'"

        for smwe in sent['smwes'].values():
            assert len(smwe['toknums'])>1,smwe
        for wmwe in sent['wmwes'].values():
            assert len(wmwe['toknums'])>1,f"In {sent['sent_id']}, weak MWE has only one token according to group indices: {wmwe}"
            assert wmwe['lexlemma']==' '.join(sent['toks'][i-1]['lemma'] for i in wmwe['toknums']),(sent["sent_id"],wmwe,sent['toks'][wmwe['toknums'][0]-1])
        # we already checked that noninitial tokens in an MWE have _ as their lemma

        # check lextags
        smweGroups = [smwe['toknums'] for smwe in sent['smwes'].values()]
        wmweGroups = [wmwe['toknums'] for wmwe in sent['wmwes'].values()]
        if 'mwe' not in sent:
            sent['mwe'] = render_sent(sent, False, False)
        tagging = sent_tags(len(sent['toks']), sent['mwe'], smweGroups, wmweGroups)
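
        # 'tagging' holds the positional MWE tags (O/B/I_/I~ and their lowercase
        # variants for tokens inside a gap, in the STREUSLE scheme); the lexcat and
        # supersenses are appended below to reconstruct each token's full lextag.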
        for tok,tag in zip(sent['toks'],tagging):
            fulllextag = tag
            if tok['smwe']:
                smweNum, position = tok['smwe']
                lexe = sent['smwes'][smweNum]
            else:
                position = None
                lexe = sent['swes'][tok['#']]

            if position is None or position==1:
                lexcat = lexe['lexcat']
                fulllextag += '-'+lexcat
                ss1, ss2 = lexe['ss'], lexe['ss2']
                if ss1 is not None:
                    assert ss1
                    fulllextag += '-'+ss1
                    if ss2 is not None and ss2!=ss1:
                        assert ss2
                        fulllextag += '|'+ss2
                if tok['wmwe']:
                    wmweNum, position = tok['wmwe']
                    wmwe = sent['wmwes'][wmweNum]
                    wcat = wmwe['lexcat']
                    if wcat and position==1:
                        fulllextag += '+'+wcat

            assert tok['lextag']==fulllextag,f"In {sent['sent_id']}, the full tag at the end of the line is inconsistent with the rest of the line ({fulllextag} expected): {tok}"
        # check rendered MWE string
        s = render([tok['word'] for tok in sent['toks']],
                   smweGroups, wmweGroups)
        if sent['mwe']!=s:
            caveat = ' (may be due to simplification)' if '$1' in sent['mwe'] else ''
            print(f'MWE string mismatch{caveat}:', s, sent['mwe'], sent['sent_id'], file=sys.stderr)

    if ss_mapper is None:
        ss_mapper = lambda ss: ss
    sent = {}
    for ln in chain(inF, [""]):    # Add empty line at the end to avoid skipping the last sent
        ln = ln.strip()
        if not ln:
            if sent:
                _unpack_lextags(sent)
                _postproc_sent(sent)
                yield sent
                sent = {}
            continue

        if ln.startswith('#'):
            if ln.startswith('# newdoc ') or ln.startswith('# newpar ') or ln.startswith('# TODO'): continue
            m = re.match(r'^# (\w+) = (.*)$', ln)
            assert m,ln
            k, v = m.group(1), m.group(2)
            assert k not in ('toks', 'swes', 'smwes', 'wmwes')
            sent[k] = v
        else:
            if 'toks' not in sent:
                sent['toks'] = []    # excludes ellipsis and multiword tokens, so they don't interfere with indexing
                sent['etoks'] = []    # ellipsis tokens and multiword tokens only (not to be confused with MWEs)
                sent['swes'] = defaultdict(lambda: {'lexlemma': None, 'lexcat': None, 'ss': None, 'ss2': None, 'toknums': []})
                sent['smwes'] = defaultdict(lambda: {'lexlemma': None, 'lexcat': None, 'ss': None, 'ss2': None, 'toknums': []})
                sent['wmwes'] = defaultdict(lambda: {'lexlemma': None, 'toknums': []})
            assert ln.count('\t')==18,ln
            cols = ln.split('\t')
            conllu_cols = cols[:10]
            lex_cols = cols[10:]
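            # 18 tabs = 19 columns: the 10 standard CoNLL-U columns (ID, FORM, LEMMA,
            # UPOS, XPOS, FEATS, HEAD, DEPREL, DEPS, MISC) followed by 9 STREUSLE
            # columns, of which only the last (the full lextag) is populated in the
            # .UDlextag format.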

            # Load CoNLL-U columns
            tok = {}
            tokNum = conllu_cols[0]

            # Special kinds of tokens: ellipsis nodes and multiword tokens.
            # These do not receive STREUSLE annotations.
            isEllipsis = isMWT = False
            if re.match(r'^\d+$', tokNum) is None:
                if '.' in tokNum:
                    isEllipsis = True    # ellipsis token (e.g. 24.1), part of the enhanced representation
                elif '-' in tokNum:
                    isMWT = True    # multiword token (e.g. 10-11), used for clitics
            if isEllipsis or isMWT:
                part1, part2 = tokNum.split('.' if isEllipsis else '-')
                part1 = int(part1)
                part2 = int(part2)
                tokNum = (part1, part2, tokNum)    # token offset is a tuple. include the string for convenience
            else:
                tokNum = int(tokNum)
            tok['#'] = tokNum

            tok['word'], tok['lemma'], tok['upos'], tok['xpos'] = conllu_cols[1:5]
            assert isMWT or (tok['upos']!='_' and (tok['lemma']!='_' or tok['upos']=='X' and conllu_cols[7]=='goeswith')),tok

            if morph_syn:
                tok['feats'], tok['head'], tok['deprel'], tok['edeps'] = conllu_cols[5:9]
                if tok['head']=='_':
                    assert isEllipsis or isMWT
                    tok['head'] = None
                else:
                    tok['head'] = int(tok['head'])
                if tok['deprel']=='_':
                    assert isEllipsis or isMWT
                    tok['deprel'] = None

            if misc:
                tok['misc'] = conllu_cols[9]

            for nullable_conllu_fld in ('xpos', 'feats', 'edeps', 'misc'):
                if nullable_conllu_fld in tok and tok[nullable_conllu_fld]=='_':
                    tok[nullable_conllu_fld] = None

            if not isEllipsis and not isMWT:
                # Load STREUSLE-specific columns
                #tok['smwe'], tok['lexcat'], tok['lexlemma'], tok['ss'], tok['ss2'], \
                #tok['wmwe'], tok['wcat'], tok['wlemma'], tok['lextag'] = lex_cols

                # initialize before setting lextag so JSON order will put lextag last
                tok['smwe'] = None
                tok['wmwe'] = None

                # .UDlextag: all lexical columns but the last should be empty
                assert lex_cols[:-1]==['']*8
                assert lex_cols[-1]
                lt = tok['lextag'] = lex_cols[-1]

                # map the supersenses in the lextag
                for m in re.finditer(r'\b[a-z]\.[A-Za-z/-]+', tok['lextag']):
                    lt = lt.replace(m.group(0), ss_mapper(m.group(0)))
                for m in re.finditer(r'\b([a-z]\.[A-Za-z/-]+)\|\1\b', lt):
                    # e.g. p.Locus|p.Locus due to abstraction of p.Goal|p.Locus
                    lt = lt.replace(m.group(0), m.group(1))    # simplify to p.Locus
                tok['lextag'] = lt

                parts = lt.split('-', 2)
                assert 1<=len(parts)<=3,parts
                mweflag = parts[0]
                if len(parts)==1:
                    lexcat = sspart = None
                else:
                    lexcat = parts[1]
                    if len(parts)==3:
                        sspart = parts[2]
                    else:
                        sspart = None

                if sspart:
                    if '|' in sspart:
                        ss, ss2 = sspart.split('|')
                    else:
                        ss = sspart
                        if ss.startswith('p.') or ss=='`$':    # copy the single supersense into both slots
                            ss2 = ss
                        else:
                            ss2 = None
                else:
                    ss = ss2 = None

                tok['_lextag'] = {'mweflag': mweflag, 'lexcat': lexcat, 'ss': ss, 'ss2': ss2}
                # these will be moved to the lexical expression level in _unpack_lextags()
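                # Illustrative parse (tag invented for the example): 'B-P-p.Locus|p.Goal'
                # yields mweflag='B', lexcat='P', ss='p.Locus', ss2='p.Goal';
                # a bare 'O' yields mweflag='O' with lexcat and supersenses None.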

            if isEllipsis or isMWT:
                sent['etoks'].append(tok)
            else:
                sent['toks'].append(tok)

    if lc_tbd>0:
        print('Tokens with lexcat TBD:', lc_tbd, file=sys.stderr)
        assert False,'PLACEHOLDER LEXCATS ARE DISALLOWED'

if __name__=='__main__':
    argparser = ArgumentParser(description=desc)
    argparser.add_argument("inF", type=FileType(encoding="utf-8"))
    argparser.add_argument("--no-morph-syn", action="store_false", dest="morph_syn")
    argparser.add_argument("--no-misc", action="store_false", dest="misc")
    argparser.add_argument("--no-validate-pos", action="store_false", dest="validate_pos")
    argparser.add_argument("--no-validate-type", action="store_false", dest="validate_type")
    args = argparser.parse_args()

    print('[')
    list_fields = ("toks", "etoks")
    dict_fields = ("swes", "smwes", "wmwes")
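
    # Serialize by hand rather than with a single json.dumps() call so that each
    # token and each lexical-expression entry stays on one line. Output shape
    # (values illustrative):
    #   [{
    #    "sent_id": "...",
    #    "toks": [
    #     {"#": 1, "word": "...", ...},
    #     ...],
    #    "swes": {...}, "smwes": {...}, "wmwes": {...}
    #   },
    #   ...]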
    first = True
    for sent in load_sents(**vars(args)):
        # specially format the output
        if first:
            first = False
        else:
            print(',')
        #print(json.dumps(sent))
        sent_copy = dict(sent)
        for fld in list_fields+dict_fields:
            del sent_copy[fld]
        print(json.dumps(sent_copy, indent=1)[:-2], end=',\n')
        for fld in list_fields:
            print(' ', json.dumps(fld)+':', '[', end='')
            if sent[fld]:
                print()
                print(',\n'.join(' ' + json.dumps(v) for v in sent[fld]))
                print(' ],')
            else:
                print('],')
        for fld in dict_fields:
            print(' ', json.dumps(fld)+':', '{', end='')
            if sent[fld]:
                print()
                print(',\n'.join(' ' + json.dumps(str(k))+': ' + json.dumps(v) for k,v in sent[fld].items()))
                print(' }', end='')
            else:
                print('}', end='')
            print(',' if fld!="wmwes" else '')
        print('}', end='')
    print(']')