normalize_sentences.py
import normalisierung

# Characters treated as pure punctuation; matching tokens are dropped when filtering.
satzzeichen = ',.?!:;<>()/\\{}#"\'´`‚’‘_→[]-~«»'
# Characters intended for an additional exclusion filter; filter_exlude_zeichen and
# exlude_zeichen are currently not applied inside this function.
exlude_zeichen = '*/=→[]."'


def normalize(nlp, sentence_text, filter_satzzeichen=True, filter_exlude_zeichen=True,
              do_lower_case_first=True, resplit_whitespace=True):
    """Normalize a single sentence: clean up whitespace, run text normalization,
    tokenize with the given spaCy pipeline, optionally drop punctuation tokens and
    lower-case the first token if it is not a noun or proper noun."""
    if resplit_whitespace:
        # Collapse tabs, non-breaking spaces and runs of whitespace into single spaces.
        sentence_text = sentence_text.replace('\t', ' ')
        sentence_text = sentence_text.replace('\xa0', ' ')
        sentence_text = ' '.join(sentence_text.split())
    normalized_sentence = normalisierung.text_normalization(sentence_text, tries=8)
    # Only the tagger output is needed; parser, sentencizer and lemmatizer are disabled for speed.
    text_tokens = nlp(normalized_sentence, disable=["parser", "sentencizer", "lemmatizer"])
    # First-token tags that keep their original capitalization (STTS tag, UD POS):
    #   NE  PROPN  proper noun
    #   NNE PROPN  proper noun
    #   NN  NOUN   noun, singular or mass
    lower_case_first = False
    if len(text_tokens) == 0:
        return ''
    try:
        if text_tokens[0].tag_ not in ["NE", "NNE", "NN"]:
            lower_case_first = True
    except Exception:
        print("Warning: could not retrieve tag!")
    if filter_satzzeichen:
        # Drop punctuation-only tokens and strip leading/trailing hyphens from the rest.
        tokens = [token.text for token in text_tokens if token.text not in satzzeichen]
        tokens = [token[:-1] if token.endswith('-') else token for token in tokens]
        tokens = [token[1:] if token.startswith('-') else token for token in tokens]
    else:
        tokens = [token.text for token in text_tokens]
    rejoined_text = ' '.join(tokens).strip()
    # Collapse any double spaces left over after dropping or trimming tokens.
    while '  ' in rejoined_text:
        rejoined_text = rejoined_text.replace('  ', ' ')
    if do_lower_case_first and lower_case_first and rejoined_text:
        rejoined_text = rejoined_text[0].lower() + rejoined_text[1:]
    return rejoined_text
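

# --- Example usage (illustrative sketch, not part of the original script) ---
# This assumes a German spaCy pipeline such as "de_core_news_sm" is installed and that
# the local `normalisierung` module providing text_normalization() is importable;
# both the model name and the sample sentence below are placeholders.
if __name__ == "__main__":
    import spacy

    nlp = spacy.load("de_core_news_sm")
    sample = "Die  Sätze\twerden z.B. so normalisiert!"
    # The article "Die" is not tagged NE/NNE/NN, so the first token is lower-cased
    # and the trailing "!" is filtered out as punctuation.
    print(normalize(nlp, sample))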