-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathto-CG.py
executable file
·120 lines (112 loc) · 3.05 KB
/
to-CG.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
#!/usr/bin/env python3
import sys
import unicodedata
import utils
# Load the volume named by the first command-line argument.
# load_volume is handed this module's globals(); the rest of this file uses
# the names T, F, L and E without defining them, so presumably they are
# injected here (TODO confirm against utils.load_volume).
utils.load_volume(sys.argv[1], globals())
def norm(s):
    """Return ``s`` in Unicode NFC form with shin/sin sequences rewritten.

    After NFC normalisation, two literal replacements are applied, one for
    shin-with-dot and one for sin-with-dot. The search and replacement
    literals render identically, so presumably they differ only in how the
    letter and its dot are encoded (NOTE(review): confirm the exact code
    points before touching these literals).
    """
    composed = unicodedata.normalize('NFC', s)
    composed = composed.replace('שׁ', 'שׁ')
    return composed.replace('שׂ', 'שׂ')
def surf(w):
    """Split the surface form of word node ``w`` into a stem and a suffix.

    The surface string is ``T.text(w)`` with the characters of the word's
    ``trailer_utf8`` value stripped from both ends, then passed through
    ``norm``; an empty result is replaced by the placeholder ``'blah'``.

    When the ``prs`` feature is set (and is not ``'absent'`` or ``'n/a'``),
    its consonant codes are aligned right-to-left against the surface string
    to find the position where the suffix starts.

    Returns:
        A ``(stem, suffix)`` tuple of strings. ``suffix`` is ``''`` when the
        word has no ``prs`` value, or when the ``prs`` consonants cannot all
        be located in the surface string.
    """
    ws = norm(T.text(w).strip(F.trailer_utf8.v(w))) or 'blah'
    prs = F.prs.v(w)
    if not prs or prs in ('absent', 'n/a'):
        return ws, ''

    # Surface letters accepted for each consonant code that can occur in prs.
    cons = {
        'W': ['ו', 'ה'],
        'J': ['י'],
        'K': ['כ', 'ך'],
        'H': ['ה', 'נ', 'ת'],
        'M': ['מ', 'ם'],
        'N': ['נ', 'ן'],
        # ignore '='
    }
    i = len(ws) - 1    # cursor into the surface string, moving leftwards
    p = len(prs) - 1   # cursor into prs, moving leftwards
    while p >= 0:
        if prs[p] not in cons:
            # Codes without a consonant mapping consume no surface letter.
            p -= 1
            continue
        if i < 0:
            # The surface string is exhausted but a consonant is still
            # unmatched. Indexing further would wrap around to the end of
            # the string (negative index) and eventually raise IndexError,
            # so treat the word as having no separable suffix instead.
            return ws, ''
        if ws[i] in cons[prs[p]]:
            p -= 1
        # Matched or not, move one surface character to the left.
        i -= 1
    return ws[:i + 1], ws[i + 1:]
def get(w, f, p=''):
    """Format feature ``f`` of node ``w`` as an angle-bracket tag.

    Args:
        w: node whose feature value is looked up.
        f: name of the feature attribute on the global ``F`` object.
        p: optional prefix placed before the value inside the tag.

    Returns:
        ``'<{p}{value}>'``, or ``''`` when the value is falsy or is one of
        the placeholder values ``'NA'``, ``'unknown'``, ``'none'``, ``'>'``.
    """
    # getattr() is the idiomatic attribute lookup; calling the
    # __getattribute__ dunder directly is unnecessary here.
    v = getattr(F, f).v(w)
    if not v or v in ('NA', 'unknown', 'none', '>'):
        return ''
    return f'<{p}{v}>'
def clause_parent(c):
    """Return a ``<par:...>`` tag naming the single mother node of ``c``.

    The nodes yielded by ``E.mother.f(c)`` are collected; unless there is
    exactly one, the result is ``''``. Otherwise the tag is built from the
    mother's ``otype``: ``ph`` for a phrase, ``c`` for a clause, ``w`` for a
    word, each followed by the node number. Any other otype gives ``''``.
    """
    mothers = list(E.mother.f(c))
    if len(mothers) != 1:
        return ''
    mother = mothers[0]
    kind = F.otype.v(mother)
    if kind == 'phrase':
        return f'<par:ph{mother}>'
    if kind == 'clause':
        return f'<par:c{mother}>'
    if kind == 'word':
        return f'<par:w{mother}>'
    return ''
# Main conversion loop: writes one stream of ^surface/lemma<tags>$ units to
# stdout, one or more per word node, with boundary marker units in between.
# NOTE(review): the nesting below was reconstructed from a copy of the file
# that had lost its indentation — confirm against the repository version.

# Feature names looked up (via get) on every word, phrase and clause.
feats = ['sp', 'ls', 'vt', 'vs', 'typ', 'function', 'domain', 'gn', 'nametype', 'nu', 'ps', 'st', 'det']

# Phrase / clause / sentence / verse node of the previously processed word.
# They start at 0 so the first word differs from all of them (assumes real
# node numbers are never 0 — TODO confirm).
prev_p = 0
prev_c = 0
prev_s = 0
prev_v = 0
for w in F.otype.s('word'):
    # First node of each enclosing type returned by L.u for this word.
    p = L.u(w, otype="phrase")[0]
    c = L.u(w, otype="clause")[0]
    s = L.u(w, otype="sentence")[0]
    v = L.u(w, otype="verse")[0]
    # Boundary markers: svb when both sentence and verse changed, sb when only
    # the sentence changed; then cb / pb for clause and phrase changes.
    if s != prev_s and v != prev_v:
        print('^svb/svb<svb>$', end=' ')
    elif s != prev_s:
        print('^sb/sb<sb>$', end=' ')
    prev_s = s
    prev_v = v
    if c != prev_c:
        print('^cb/cb<cb>$', end=' ')
        prev_c = c
    if p != prev_p:
        print('^pb/pb<pb>$', end=' ')
        prev_p = p
    # Tags shared by every unit of this word: phrase, clause and sentence
    # node numbers, the book name, then phrase- and clause-level features.
    phr_ft = f'<ph{p}><c{c}><s{s}><{T.bookName(w)}>'
    for f in feats:
        phr_ft += get(p, f)
        phr_ft += get(c, f)
    phr_ft += get(c, 'rela', 'rela:')
    phr_ft += clause_parent(c)
    # Number of 'Q' characters in the clause's txt feature (0 if unset).
    q_depth = (F.txt.v(c) or '').count('Q')
    phr_ft += f'<txt:{q_depth}>'
    # srf is the surface form without the suffix; psrf is the suffix part.
    srf, psrf = surf(w)
    lem = norm(F.lex_utf8.v(w))
    # Word-level tags come first, then the shared tags, then the word node.
    tags = ''
    for f in feats:
        tags += get(w, f)
    tags += get(w, 'uvf', 'uvf:')
    tags += phr_ft
    tags += f'<w{w}>'
    lu = ''
    if ' ' in lem:
        # A lemma containing spaces yields one unit per part, numbered with
        # <wpN>; every unit repeats the full surface form.
        for i, l in enumerate(lem.split()):
            lu += f'^{srf}/{l}{tags}<wp{i+1}>$'
            # After the first part, emit a separate maqaf unit when the
            # surface form contains one.
            if '־' in srf and i == 0:
                lu += '^־/־<punct>$'
    else:
        lu = f'^{srf}/{lem}{tags}$'
    # Suffix features; when any is present the suffix gets its own unit with
    # lemma "prn", tagged <w{node}p> and the shared tags.
    prn = ''
    for f in ['prs_ps', 'prs_gn', 'prs_nu']:
        prn += get(w, f)
    if prn:
        lu += f'^{psrf}/prn<prn>{prn}<w{w}p>{phr_ft}$'
    # Trailer: spaces are copied through; every other character becomes its
    # own <punct> unit followed by a newline. This loop reuses the name `c`;
    # that is harmless because `c` is reassigned at the top of each iteration.
    for c in F.trailer_utf8.v(w):
        if c == ' ':
            lu += c
        else:
            lu += f'^{c}/{c}<punct>$\n'
    print(lu, end='')
# Finish the output with a newline.
print('')