-
Notifications
You must be signed in to change notification settings - Fork 10
/
Copy pathrestoredocids.py
executable file
·48 lines (44 loc) · 1.33 KB
/
restoredocids.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# Copyright (C) 2019 Serge Sharoff
# This program is free software under GPL 3, see http://www.gnu.org/licenses/
# The tool restores the document ids lost through udpipe processing
# # text = __id__LABEL Versija.
# 1 __id__LABEL __id__LABEL X__...
# 2 Versija versija NOUN Ncnsnn Case=Nom|Gender=Neut|Number=Sing
#
# to
# # newdoc id = __id__LABEL
# # text = Versija.
# 1 Versija versija NOUN Ncnsnn Case=Nom|Gender=Neut|Number=Sing
import sys, re
def restore_doc_ids(lines):
    """Yield the output lines with the document ids restored.

    *lines* is an iterable of input lines (each normally ending in a
    newline).  Three kinds of lines are recognised:

    * ``# text = __id__LABEL ...`` -- the sentence text is remembered but
      not emitted yet; it is written once the matching id token is seen.
    * any other ``# text = ...`` line -- emitted after an empty line and
      ends the renumbering started by a previous id token.
    * token lines (a number or range followed by a tab) -- a token whose
      second column starts with ``__id__`` is replaced by a
      ``# newdoc id = ...`` line followed by the remembered text with the
      id removed; the tokens that follow are renumbered from 1.

    Every other line (blank lines, other comments) is dropped, exactly as
    in the original script.
    """
    idptn = '__id__'
    textptn = '# text = '
    fullptn = textptn + idptn
    # Compiled once: it is used both to detect token lines and to rewrite
    # their first column.
    tokenptn = re.compile(r'^[0-9-]+\t')

    curtext = ''
    recount = False
    count = 0
    for l in lines:
        if tokenptn.match(l):
            w = l.split('\t')[1]
            if w.startswith(idptn):  # 1 __id__LABEL __id__LABEL X__...
                recount = True
                count = 0
                yield '\n# newdoc id = ' + w + '\n'
                p = curtext.find(w)
                # str.find returns -1 when the id is absent and 0 when the
                # text starts with it (the usual case), so test for >= 0.
                if p >= 0:
                    # Drop the id and the blank that separated it from
                    # the rest of the sentence; keep the line terminator.
                    curtext = curtext[p + len(w):].lstrip(' \t')
                yield textptn + curtext
            elif recount:  # 2 Versija versija NOUN
                # NOTE(review): only the first column is renumbered; any
                # later column that refers to token numbers is left as
                # is, and a range such as "1-2" becomes a single number.
                count += 1
                yield tokenptn.sub(str(count) + '\t', l, count=1)
            else:
                yield l
        elif l.startswith(fullptn):  # we'll save # text for the future
            curtext = l[len(textptn):]
        elif l.startswith(textptn):  # normal text description
            yield '\n' + l
            recount = False


def main():
    """Filter the file named by the first argument, or stdin, to stdout."""
    if len(sys.argv) > 1:
        # The context manager closes the input file when done.
        with open(sys.argv[1]) as f:
            sys.stdout.writelines(restore_doc_ids(f))
    else:
        sys.stdout.writelines(restore_doc_ids(sys.stdin))


if __name__ == '__main__':
    main()