-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathanalysis.py
72 lines (58 loc) · 1.56 KB
/
analysis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
import re
import string
import Stemmer
STEMMER = Stemmer.Stemmer("english")
def tokenize(text):
    """Break raw text into a list of whitespace-separated tokens."""
    words = text.split()
    return words
def lowercase_filter(tokens):
    """Normalize every token to lowercase."""
    return list(map(str.lower, tokens))
# Deletion table built once at import time: maps every ASCII punctuation
# character to None.  Avoids recompiling a regex on every call.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def punctuation_filter(tokens):
    """Strip ASCII punctuation characters from each token.

    Tokens consisting entirely of punctuation become empty strings;
    they are removed later by the final filter in ``analyze``.
    ``str.translate`` does the removal in a single C-level pass,
    replacing the per-call regex compilation of the original.
    """
    return [token.translate(_PUNCTUATION_TABLE) for token in tokens]
# Common English words (plus "wikipedia") excluded from the index.
# Built once at import time; frozenset gives O(1) membership tests
# instead of rebuilding the set on every call.
_STOPWORDS = frozenset({
    "the", "be", "to", "of", "and", "a", "in", "that", "have", "i",
    "it", "for", "not", "on", "with", "he", "as", "you", "do", "at",
    "this", "but", "his", "by", "from", "wikipedia",
})


def stopword_filter(tokens):
    """Remove common English stopwords from *tokens*.

    Expects tokens to already be lowercased (as done earlier in the
    ``analyze`` pipeline).  The original list contained uppercase "I",
    which could never match a lowercased token; it is stored as "i"
    here so the filter actually applies.
    """
    return [token for token in tokens if token not in _STOPWORDS]
def stem_filter(tokens):
    """Apply stemming to tokens.

    Delegates to the module-level PyStemmer English stemmer
    (``STEMMER.stemWords``), which stems the whole list in one call.
    NOTE(review): presumably returns a new list of stemmed strings,
    per the PyStemmer API — confirm against the library docs.
    """
    return STEMMER.stemWords(tokens)
def analyze(text):
    """Run the full analysis pipeline over *text*.

    Tokenizes, then applies each filter stage in order (lowercasing,
    punctuation removal, stopword removal, stemming) and finally drops
    any tokens left empty by punctuation stripping.
    """
    stages = (
        lowercase_filter,
        punctuation_filter,
        stopword_filter,
        stem_filter,
    )
    tokens = tokenize(text)
    for stage in stages:
        tokens = stage(tokens)
    # Punctuation-only tokens become "" after filtering; discard them.
    return [token for token in tokens if token]