-
Notifications
You must be signed in to change notification settings - Fork 17
/
Copy pathParser.py
67 lines (54 loc) · 1.93 KB
/
Parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
import os

import numpy as np
import spacy
from spacy.lang.en import English
def convert(fname, threshold=.9):
    """Write a copy of an email file with signature-block lines removed.

    The email at ``fname`` is read and split into lines; the lines that are
    not classified as signature block are written to a new file named after
    the original with ``_clean`` inserted before the extension
    (``mail.txt`` -> ``mail_clean.txt``).

    Parameters
    ----------
    fname : str
        Path of the email file to clean.
    threshold : float
        Signature-block probability cutoff, forwarded to ``_generate_text``.
        Lower thresholds will result in more false positives.
    """
    original_email = _read_email(fname)
    sentences = _corpus2sentences(original_email)  # convert to sentences
    # splitext only splits off the final extension, so dots elsewhere in the
    # path ("./dir/mail.txt", "a.b.txt") and extension-less names are safe.
    root, ext = os.path.splitext(fname)
    new_fname = root + "_clean" + ext
    # iterate through sentences, write to a new file if not signature block
    _generate_text(sentences, new_fname, threshold)
def _read_email(fname):
with open(fname, 'r') as email:
text = email.read()
return text
def _corpus2sentences(corpus):
"""split corpus into a list of sentences.
"""
return corpus.strip().split('\n')
def _generate_text(sentences, fname, threshold=0.9):
    """Write every line that is not part of a signature block to ``fname``.

    A line is treated as signature block when
    probability(signature block) >= threshold; all other lines are written
    to the output file, one per line.

    Parameters
    ----------
    sentences : list of str
        Lines of the email, without their trailing newlines.
    fname : str
        Represents fname of new corpus, excluding signature block.
    threshold : float
        Lower thresholds will result in more false positives.
    """
    tagger = spacy.load('en_core_web_sm')
    with open(fname, "w") as new_file:
        for sentence in sentences:
            # Blank lines have nothing to score (and an empty doc would make
            # the probability a division by zero); keep them so paragraph
            # breaks survive in the cleaned file.
            if not sentence.strip() or _prob_block(sentence, tagger) < threshold:
                # The input was split on newlines, so put the separator back;
                # otherwise the output collapses into a single line.
                new_file.write(sentence + "\n")
def _prob_block(sentence, pos_tagger):
"""Calculate probability that a sentence is an email block.
https://spacy.io/usage/linguistic-features
Parameters
----------
sentence : str
Line in email block.
Returns
-------
probability(signature block | line)
"""
doc = pos_tagger(sentence)
verb_count = np.sum([token.pos_ != "VERB" for token in doc])
return float(verb_count) / len(doc)