-
Notifications
You must be signed in to change notification settings - Fork 5
/
Copy pathcuncits.py
56 lines (49 loc) · 1.68 KB
/
cuncits.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
#!/usr/bin/python
import sys
import re
# Captures the n="..." value that follows `textpart"`; that value becomes the
# current headword. Requires at least one character, so n="" never matches.
_TEXTPART_RE = re.compile(r'textpart" n="([^"]+)')

# Captures the n attribute of a <bibl> element when its value starts with
# H, I or O (e.g. "Hom. Il. 1.1"); the trailing `">` is not captured.
_BIBL_RE = re.compile(r'<bibl[ ]+n=.([HIO][^>]+)">')

# Literal rewrites applied in this order to turn a <bibl> n value into the
# key format stored in `cuncits`.
_CITE_REWRITES = (
    ("Hom. ", ""),
    ("Il. ", "urn:cts:greekLit:tlg0012.tlg001:"),
    ("Od. ", "urn:cts:greekLit:tlg0012.tlg002:"),
)


def _cite_to_urn(cite):
    """Return *cite* with the "Hom. "/"Il. "/"Od. " prefixes rewritten."""
    for old, new in _CITE_REWRITES:
        cite = cite.replace(old, new)
    return cite


def collect_cunliffe(lines):
    """Scan lexicon lines and collect headwords and headword citations.

    NOTE(review): the input is presumably the TEI XML of Cunliffe's Homeric
    lexicon (the names in this script suggest so) -- confirm against the
    caller's pipeline.

    Args:
        lines: iterable of text lines (each normally ending in a newline).

    Returns:
        A ``(lemmas, cits)`` pair of dicts whose values are all ``1``:
        ``lemmas`` is keyed by headword (with every "†" removed) and
        ``cits`` by ``"<rewritten citation>@<headword>"``.

    Side effects:
        Prints ``lemsrc <headword> <line>`` for every headword line found.
    """
    lemmas = {}
    cits = {}
    curlemma = ''
    for line in lines:
        # One guarded search: a line containing `textpart" n=""` has no
        # capturable value and must not be dereferenced.
        m = _TEXTPART_RE.search(line)
        if m:
            curlemma = m[1].replace("†", "")
            lemmas[curlemma] = 1
            print("lemsrc", curlemma, line)
        # Citations seen before any (non-empty) headword cannot be attributed.
        if not curlemma:
            continue
        for cit in _BIBL_RE.finditer(line):
            cits[_cite_to_urn(cit.group(1)) + "@" + curlemma] = 1
    return lemmas, cits


# Module-level lookup tables consulted by the rest of the script.
cuncits = {}
cunlemmas = {}
if __name__ == "__main__":
    # Iterate the stream directly instead of loading it all with readlines().
    cunlemmas, cuncits = collect_cunliffe(sys.stdin)
tbfile = "/Users/gcrane/github/gAGDT/data/xml/tlg0012.tlg001.perseus-grc1.tb.xml"

# Attribute patterns; each captures a non-empty double-quoted value.
_CITE_ATTR_RE = re.compile(r'cite="([^"]+)"')
_LEMMA_ATTR_RE = re.compile(r'lemma="([^"]+)"')
_FORM_ATTR_RE = re.compile(r'form="([^"]+)"')

# Lemmas that are punctuation marks and are skipped entirely.
_PUNCT_LEMMAS = frozenset((",", ".", ";", "·"))


def _first_group(pattern, line):
    """Return group 1 of the first match of *pattern* in *line*, else ''."""
    m = pattern.search(line)
    return m[1] if m else ''


def treebank_report(lines, lemmas, cits):
    """Yield report rows comparing treebank lines with the lexicon tables.

    Args:
        lines: iterable of text lines from the treebank file.
        lemmas: container of known headwords (only membership is tested).
        cits: container of ``"<cite>@<lemma>"`` keys (only membership is
            tested).

    Yields:
        ``(form, status, cite, lemma)`` tuples, where ``status`` is:
        ``"lemmafail"`` -- the line's lemma is not in *lemmas*;
        ``"hit"`` -- the line has a cite and lemma, and the key is in *cits*;
        ``"citefail"`` -- the line has a cite and lemma, but no such key.
        A line can produce both a ``lemmafail`` and a ``citefail`` row.
        Lines without a lemma, or with a punctuation lemma, yield nothing.
    """
    for line in lines:
        curcite = _first_group(_CITE_ATTR_RE, line)
        curlemma = _first_group(_LEMMA_ATTR_RE, line)
        # Reset per line like cite and lemma, so a line without form="..."
        # never reports the previous line's form (or hits an unbound name).
        curform = _first_group(_FORM_ATTR_RE, line)
        if not curlemma or curlemma in _PUNCT_LEMMAS:
            continue
        if curlemma not in lemmas:
            yield (curform, "lemmafail", curcite, curlemma)
        if curcite:
            status = "hit" if curcite + '@' + curlemma in cits else "citefail"
            yield (curform, status, curcite, curlemma)


if __name__ == "__main__":
    # Explicit encoding: the treebank holds Greek text and should not depend
    # on the platform's default locale. NOTE(review): assumes the file is
    # UTF-8 -- confirm.
    with open(tbfile, encoding="utf-8") as f:
        for row in treebank_report(f, cunlemmas, cuncits):
            print(*row)