-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathrun_profile.py
77 lines (65 loc) · 2.07 KB
/
run_profile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
# Import Python standard libraries
import argparse
import csv
from tabulate import tabulate
import unicodedata
# Import MPI-SHH libraries
from segments import Profile, Tokenizer
def my_tokenizer(form, tokenizer):
    """Tokenize a single form with the given tokenizer.

    The form is wrapped between the boundary markers ``^`` and ``$``,
    normalized to Unicode NFC, and handed to ``tokenizer`` with the
    ``IPA`` column requested.

    :param form: The string to tokenize.
    :param tokenizer: A callable accepting the string and a ``column`` keyword.
    :return: Whatever ``tokenizer`` returns for the prepared string.
    """
    # Boundary markers are added before normalization, as the callers expect.
    bounded = "".join(("^", form, "$"))
    normalized = unicodedata.normalize("NFC", bounded)
    return tokenizer(normalized, column="IPA")
def main(args):
    """Check an orthographic profile against the segments of a wordlist.

    Every row of the tab-separated wordlist is tokenized with the profile
    and the result is compared with the row's reference segmentation.
    Mismatches are printed as a table, followed by a summary line with the
    error count and percentage.

    :param args: Parsed command-line arguments providing ``profile`` (path
        of the profile file), ``wordlist`` (path of the tab-separated
        wordlist), ``form`` and ``segments`` (column names), and ``l``
        (optional maximum number of rows to check).
    """
    # Initiate tokenizer and profile
    profile = Profile.from_file(args.profile)
    tokenizer = Tokenizer(profile=profile)

    # A missing or non-positive `l` means "check every row".
    limit = args.l if args.l and args.l > 0 else None

    # Open file and check items. The encoding is explicit because the data
    # is not ASCII and the platform default may not be UTF-8; `newline=""`
    # is what the csv module asks for when reading.
    errors = []
    checked = 0
    with open(args.wordlist, encoding="utf-8", newline="") as handler:
        reader = csv.DictReader(handler, delimiter="\t")
        for row in reader:
            # Test the limit before processing so exactly `limit` rows
            # are checked, not `limit + 2`.
            if limit is not None and checked >= limit:
                break
            checked += 1

            segments = my_tokenizer(row[args.form], tokenizer)
            reference = row[args.segments]
            if segments != reference:
                errors.append([row["ID"], row[args.form], segments, reference])

    # Output
    print(tabulate(errors, headers=["ID", "Form", "Result", "Reference"]))
    # Guard against an empty wordlist, which would otherwise divide by zero.
    percentage = (len(errors) / checked) * 100 if checked else 0.0
    print("Errors: %i/%i (%.2f%%)" % (len(errors), checked, percentage))
# Script entry point: parse the command-line arguments and run the check.
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Check an orthographic profile against the reference "
        "segments of a wordlist."
    )
    parser.add_argument(
        "wordlist",
        type=str,
        help="Tab-separated wordlist file with the forms and reference segments.",
    )
    parser.add_argument(
        "-l",
        type=int,
        help="Instructs the script to consider only the top `l` lines (default: all)",
    )
    parser.add_argument(
        "--profile",
        type=str,
        default="orthography.tsv",
        help="Orthographic profile file (default: `orthography.tsv`).",
    )
    parser.add_argument(
        "--form",
        type=str,
        default="Form",
        help="Column for the form field (default: `Form`).",
    )
    parser.add_argument(
        "--segments",
        type=str,
        default="Segments",
        help="Column for the segments field (default: `Segments`).",
    )
    ARGS = parser.parse_args()
    main(ARGS)