-
Notifications
You must be signed in to change notification settings - Fork 3
/
Copy pathmake_lex.py
86 lines (75 loc) · 4.43 KB
/
make_lex.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
# -*- coding: utf-8 -*-
from __future__ import print_function
import io
import argparse
import re
from collections import defaultdict
# Headwords that contain any of these substrings are skipped entirely
# (wiki markup, namespace prefixes, multi-word titles, ...).
wordfilter = [
    '{', '}',
    '[', ']',
    'PAGENAME',
    '<', '>',
    '&', '|', '/',
    ' ', ':', '*',
]
def clean_word(word):
    """Return *word* with every ',', '?', '!', '.' and '®' removed."""
    cleaned = word
    for unwanted in (',', '?', '!', '.', u'®'):
        cleaned = cleaned.replace(unwanted, '')
    return cleaned
def remove_stress(phonemes):
    """Return *phonemes* with the stress marks 'ˈ' and 'ˌ' removed."""
    without_primary = phonemes.replace(u'ˈ', '')
    without_any_stress = without_primary.replace(u'ˌ', '')
    return without_any_stress
def process(wikifile, outfile, gen_testset, do_remove_stress, lang):
    """Extract a word -> IPA lexicon from a Wiktionary XML dump.

    The dump is scanned line by line.  A headword line (German:
    ``== Wort ({{Sprache|Deutsch}}) ==``, English: ``<title>Wort</title>``)
    starts an entry.  Every following line that carries a parsable IPA
    transcription is written to *outfile* as the cleaned word followed by
    the transcription's characters, all separated by single spaces.  An
    end-of-entry marker stops the collection until the next headword.

    Args:
        wikifile: Path of the UTF-8 encoded Wiktionary XML dump to read.
        outfile: Path of the UTF-8 lexicon file to write (overwritten).
        gen_testset: Accepted for interface compatibility; not used here.
        do_remove_stress: If true, strip the IPA stress marks from every
            transcription before it is written.
        lang: Language of the dump, either ``'de'`` or ``'en'``.

    Raises:
        ValueError: If *lang* is neither ``'de'`` nor ``'en'``.  Raised
            before any file is opened, so *outfile* is not truncated.
    """
    # Pick the language specific patterns once, outside the line loop.
    if lang == 'de':
        headword_re = re.compile(r'.*==(.*)\(\{\{Sprache\|Deutsch\}\}\) ==')
        ipa_re = re.compile(
            r'^\:\{\{IPA\}\}.{1,3}\{\{Lautschrift\|([^\}]+)\}\}.*')
    elif lang == 'en':
        headword_re = re.compile(r'<title>(.*)<\/title>')
        # Entries are variations of this line:
        #   * {{a|US}} {{IPA|/ə.bɹʌpt/|/aˈbɹʌpt/|lang=en}}
        # Only the text between the first pair of slashes is used.
        ipa_re = re.compile(r'[^\/]*\/([^\/]*)\/[^\/]*')
    else:
        raise ValueError(
            "unsupported lang %r (expected 'de' or 'en')" % (lang,))

    # Seeing one of these means we are already past the phoneme entry.
    end_markers = ('=See also=', '=Translations=', '</page>',
                   '{{Beispiele}}', '{{Referenzen}}', '{{Quellen}}')
    # Separators and bracketing that are not phonemes themselves.
    non_phoneme_chars = (' ', '.', '(', ')', '[', ']')

    written_out = 0
    found_word = False
    word_cleaned = None

    with io.open(wikifile, 'r', encoding='utf-8') as wiki_in, \
            io.open(outfile, 'w', encoding='utf-8') as wiki_out:
        for line in wiki_in:
            line = line.strip()

            # Start segment for the dictionary entry.
            headword = headword_re.match(line)
            if headword:
                # A new entry begins: never attribute its transcriptions
                # to the previous word, even if this headword is rejected.
                found_word = False
                word = headword.group(1).strip()
                if not any(elem in word for elem in wordfilter):
                    if len(word) > 20:
                        # Surface unusually long headwords on stdout.
                        print(word)
                    # Skip single characters and prefix/suffix entries
                    # such as "-heit" or "ver-".
                    if len(word) > 1 and word[0] != '-' and word[-1] != '-':
                        word_cleaned = clean_word(word)
                        found_word = True

            if found_word:
                if lang == 'en' and ('lang=en' not in line
                                     or 'IPA' not in line
                                     or 'RP' in line
                                     or 'UK' in line):
                    # English dump: only IPA lines tagged lang=en that do
                    # not mention RP or UK are considered.
                    ipa = None
                else:
                    ipa = ipa_re.match(line)

                if ipa:
                    phonemes = ipa.group(1)
                    # Transcriptions containing an ellipsis are incomplete.
                    if u'…' not in phonemes and '...' not in phonemes:
                        # Honour the caller's flag.  Testing the function
                        # object `remove_stress` here would always be true.
                        if do_remove_stress:
                            phonemes = remove_stress(phonemes)
                        for char in non_phoneme_chars:
                            phonemes = phonemes.replace(char, '')
                        # An empty transcription would produce a lexicon
                        # line without any phonemes; skip it.
                        if phonemes:
                            wiki_out.write(
                                word_cleaned + u' ' + u' '.join(phonemes)
                                + u'\n')
                            written_out += 1
                            if written_out % 1000 == 0:
                                print('written: ', written_out, 'entries.')

            # If we see this somewhere in our input, we are already past
            # the phoneme entry.
            if any(marker in line for marker in end_markers):
                found_word = False
if __name__ == '__main__':
    # Script entry point: read the command-line options, then build the
    # lexicon with process().
    cli = argparse.ArgumentParser(
        description='Process a wiktionary dictionary in xml format and make a text ipa lexicon. Currently for German and English wiktionary XMLs.'
    )
    cli.add_argument(
        '-f', '--file',
        dest='file',
        type=str,
        default='dewiktionary-latest-pages-articles-multistream.xml',
        help='process this xml wiktionary lexicon file',
    )
    cli.add_argument(
        '-o', '--outfile',
        dest='outfile',
        type=str,
        default='de_ipa_lexicon.txt',
        help='lexicon out file',
    )
    cli.add_argument(
        '-t', '--gen-testset',
        dest='gen_testset',
        action='store_true',
        default=False,
        help='generate a testset',
    )
    cli.add_argument(
        '-r', '--remove-stress',
        dest='remove_stress',
        action='store_true',
        default=False,
        help='remove stress markers',
    )
    cli.add_argument(
        '-l', '--lang',
        dest='lang',
        default='de',
        help='language specific parsing',
    )
    options = cli.parse_args()
    process(
        options.file,
        options.outfile,
        options.gen_testset,
        options.remove_stress,
        options.lang,
    )