latent_semantic_analysis.py
import re
from time import time
from typing import List

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

from util import data_io
from wordcloud_methods import word_cloud_pdf


def print_top_words(model, feature_names, n_top_words):
    # print the n_top_words highest-weighted features of every component ("topic")
    for topic_idx, topic in enumerate(model.components_):
        message = "Topic #%d: " % topic_idx
        message += " ".join(
            [feature_names[i] for i in topic.argsort()[: -n_top_words - 1 : -1]]
        )
        print(message)


def get_ngrams(tokens: List[str], min_n=1, max_n=5):
    # build underscore-joined n-grams of orders min_n..max_n over the token sequence
    return [
        "_".join(tokens[k : k + n])
        for n in range(min_n, max_n + 1)
        for k in range(len(tokens) - n + 1)
    ]


def regex_tokenizer(text, pattern=r"(?u)\b\w\w+\b"):
    # default token pattern borrowed from scikit-learn: word characters, length >= 2
    return [m.group() for m in re.finditer(pattern, text)]


def text_to_bow(text):
    # tokenize and expand into uni-, bi- and trigrams
    return get_ngrams(regex_tokenizer(text), 1, 3)
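

# A quick sanity check of the n-gram bag-of-words representation (illustrative REPL
# session, assuming the functions above; the tokenizer keeps only tokens of length >= 2):
#
#   >>> text_to_bow("das ist ein Test")
#   ['das', 'ist', 'ein', 'Test',
#    'das_ist', 'ist_ein', 'ein_Test',
#    'das_ist_ein', 'ist_ein_Test']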


if __name__ == "__main__":
    file = "BverfG.jsonl.gz"
    # print(Counter(k for d in data_io.read_jsonl(file) for k in d.keys()))

    # collect the "Orientierungssatz" (headnote) field of every record that has one
    p = "Orientierungssatz"
    data = [d[p] for d in data_io.read_jsonl(file) if p in d]
    texts = [" ".join(l) for l in data]
    print("%d texts" % len(texts))
    vectorizer = TfidfVectorizer(
        min_df=3,
        tokenizer=lambda x: x,  # input is already tokenized into n-grams
        preprocessor=lambda x: x,
        lowercase=False,
        sublinear_tf=False,
        max_features=20000,
        max_df=0.75,
    )
    tf = vectorizer.fit_transform([text_to_bow(text) for text in texts])

    # latent semantic analysis: truncated SVD of the tf-idf matrix
    svd = TruncatedSVD(n_components=20, random_state=42)
    t0 = time()
    X = svd.fit_transform(tf)  # TruncatedSVD works on the sparse matrix directly
    print("SVD took: %0.2fs" % (time() - t0))

    feature_names = vectorizer.get_feature_names()  # get_feature_names_out() on scikit-learn >= 1.0
    print_top_words(svd, feature_names, 20)

    # for each topic, keep the n_top_words highest-weighted features for the word clouds
    n_top_words = 40
    l2f2w = {
        str(k): {feature_names[i]: c[i] for i in c.argsort()[: -n_top_words - 1 : -1]}
        for k, c in enumerate(svd.components_)
    }
    word_cloud_pdf(l2f2w)
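
    # If wordcloud_methods is not available, roughly the same output can be sketched with
    # the `wordcloud` package (an assumption, not part of this repo). SVD component weights
    # can be negative, so only positive weights are kept; file names are illustrative:
    #
    #   from wordcloud import WordCloud
    #   for topic, f2w in l2f2w.items():
    #       freqs = {f: w for f, w in f2w.items() if w > 0}
    #       if freqs:
    #           WordCloud(width=800, height=400).generate_from_frequencies(
    #               freqs
    #           ).to_file("topic_%s.png" % topic)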