-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtopicmodel.py
executable file
·67 lines (54 loc) · 1.65 KB
/
topicmodel.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
#!/usr/bin/env python
import os
import re
import json
import codecs
import string
import argparse
from stopwords import stopwords
from glob import iglob
from gensim import corpora, models
def articles(pattern="extractedArticles/**/*.txt", keyword="freedom"):
    """Yield the tokenized text of every article that mentions *keyword*.

    Each file matched by *pattern* is tokenized with ``words()``; a document
    is yielded only when *keyword* is one of its tokens.

    Args:
        pattern: Recursive glob pattern selecting the article files.  The
            default is the previously hard-coded ``extractedArticles`` tree.
        keyword: Token a document must contain to be yielded.  ``words()``
            lower-cases the text, so the keyword should be lower case too.

    Yields:
        The list of tokens of each matching article.
    """
    for filename in iglob(pattern, recursive=True):
        tokens = words(filename)
        # Membership in the token list is an exact-word match, not a
        # substring search: a document containing only "freedoms" is skipped.
        if keyword in tokens:
            yield tokens
def words(filename):
    """Return the lower-cased word tokens of a UTF-8 text file.

    Args:
        filename: Path of the file to read; its contents are decoded as UTF-8.

    Returns:
        A list with every maximal run of word characters found in the file,
        lower-cased and in order of appearance.  A file with no word
        characters gives an empty list.
    """
    # Read inside a context manager so the handle is closed immediately
    # instead of staying open until the garbage collector reclaims it.
    with codecs.open(filename, 'r', 'utf8') as handle:
        text = handle.read().lower()
    # Same tokens as splitting on non-word runs and dropping the empty strings
    # that such a split leaves at the edges of the text.
    return re.findall(r'\w+', text)
def remove_stopwords(sources):
    """Wrap a document source so that its documents come out filtered.

    Args:
        sources: Zero-argument callable returning an iterable of documents,
            each document being an iterable of word strings.

    Returns:
        A zero-argument callable.  Every call invokes ``sources()`` again and
        yields, per document, a list of the words that are longer than three
        characters and whose lower-cased form is not in ``stopwords``.
    """
    def filtered_documents():
        for document in sources():
            yield [
                token
                for token in document
                if len(token) > 3 and token.lower() not in stopwords
            ]

    return filtered_documents
def get_corpus(dictionary, sources=None, path="corpus.mm"):
    """Serialize the documents as a bag-of-words corpus and load it back.

    Args:
        dictionary: gensim ``Dictionary`` whose ``doc2bow`` converts each
            tokenized document into a bag-of-words vector.
        sources: Zero-argument callable returning an iterable of tokenized
            documents.  When omitted, ``articles`` is used, which matches the
            previous behaviour of this function.
        path: Destination of the Matrix Market file.  Defaults to
            ``corpus.mm`` in the current working directory, as before.

    Returns:
        A ``corpora.MmCorpus`` reading from the file written at *path*.
    """
    if sources is None:
        sources = articles

    def bags_of_words():
        for doc in sources():
            yield dictionary.doc2bow(doc)

    corpora.MmCorpus.serialize(path, bags_of_words())
    return corpora.MmCorpus(path)


def topics(sources=articles, num_words=5, num_topics=5, passes=10, iterations=50, ignore=False):
    """Train an LDA model on the documents and print its top topics.

    Args:
        sources: Zero-argument callable returning an iterable of tokenized
            documents.  It is called twice (once for the dictionary, once for
            the corpus), so it must be re-callable.
        num_words: Number of words printed per topic.
        num_topics: Number of topics the model is trained with.
        passes: Passed through to ``LdaModel`` as ``passes``.
        iterations: Passed through to ``LdaModel`` as ``iterations``.
        ignore: When true, documents are first filtered through
            ``remove_stopwords``.

    Returns:
        None.  One line per topic is printed as ``"<n>. word, word, ..."``,
        numbered from 1 in the order returned by ``top_topics``.
    """
    if ignore:
        sources = remove_stopwords(sources)
    dictionary = corpora.Dictionary(sources())
    # Build the corpus from the same documents as the dictionary.  Previously
    # the corpus was always re-read from articles(), so a caller-supplied
    # `sources` changed the vocabulary but not the training data.
    corpus = get_corpus(dictionary, sources)
    lda = models.ldamodel.LdaModel(
        corpus,
        id2word=dictionary,
        num_topics=num_topics,
        passes=passes,
        iterations=iterations,
    )
    ranked = lda.top_topics(corpus, topn=num_words)
    for number, topic in enumerate(ranked, 1):
        # topic[0] is the topic's word list; element [1] of each entry is the
        # word itself (gensim's top_topics pairs it with its probability).
        print("%s. %s" % (number, ', '.join([entry[1] for entry in topic[0]])))
if __name__ == "__main__":
    # topics() prints its own report and has no return value, so wrapping the
    # call in print() only appended a stray "None" line to the output.
    topics(articles, num_topics=20, ignore=True, passes=20)