forked from Hmknipp/Women-of-Coal-Revisited
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtopicmodels.R
66 lines (45 loc) · 1.47 KB
/
topicmodels.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
library(tidytext)
library(topicmodels)
library(tidyverse)
library(tm)
library(reshape2)
library(ggplot2)
library(dplyr)
library(tidyr)
# Directory containing the plain-text documents (one file per document).
text_dir <- "/txt"

# Create corpus directly from the files on disk.
# BUG FIX: the original wrapped this corpus again as
# Corpus(VectorSource(text)), which builds documents from the string
# representation of the corpus object instead of the document texts.
# The DirSource corpus is already what downstream tm_map() calls expect.
corpus_womenofcoal <- Corpus(DirSource(directory = text_dir, encoding = "UTF-8"))

# Custom stopword list, one word per line (path relative to working dir).
mystopwords <- readLines("eng.txt")
# Preprocessing: lower-case, then strip punctuation, numbers, custom
# stopwords, and collapse the whitespace left behind.
corpus_womenofcoal <- tm_map(corpus_womenofcoal, content_transformer(tolower))
corpus_womenofcoal <- tm_map(corpus_womenofcoal, removePunctuation)
corpus_womenofcoal <- tm_map(corpus_womenofcoal, removeNumbers)
corpus_womenofcoal <- tm_map(corpus_womenofcoal, removeWords, mystopwords)
corpus_womenofcoal <- tm_map(corpus_womenofcoal, stripWhitespace)

# Convert the corpus to a Document-Term Matrix (DTM)
dtm <- DocumentTermMatrix(corpus_womenofcoal)

# ROBUSTNESS: drop documents left empty by preprocessing -- LDA() errors on
# rows with no non-zero entries. slam is a hard dependency of tm, so
# row_sums() is available without loading a new package (sparse-safe,
# unlike rowSums(as.matrix(dtm))).
dtm <- dtm[slam::row_sums(dtm) > 0, ]
# Number of topics to fit.
num_topics <- 5

# Create the LDA model with a fixed seed for reproducibility.
# BUG FIX: the original additionally fit a second, unseeded model with
# k = 15 (inconsistent with num_topics = 5) and printed terms from that one;
# a single seeded model is fit and reused throughout.
womenofcoalLDA <- LDA(dtm, k = num_topics, control = list(seed = 1234))

# Print the 15 highest-probability terms for each topic.
# BUG FIX: terms() returns a character matrix with one COLUMN per topic
# (rows are term ranks), so topic i is top_terms[, i] -- the original's
# top_terms[i, ] printed the i-th ranked term of every topic instead.
top_terms <- terms(womenofcoalLDA, 15)
for (i in seq_len(num_topics)) {
  cat("Topic", i, ":", paste(top_terms[, i], collapse = ", "), "\n")
}
# Tidy the per-topic word probabilities (one row per topic/term, column
# `beta` = P(term | topic)).
woc_topics <- tidy(womenofcoalLDA, matrix = "beta")
woc_topics

# Top 10 terms per topic and a faceted bar chart.
# IDIOM FIX: top_n() is superseded by slice_max() (ties kept, matching the
# old behavior). Plain reorder(term, beta) mis-orders terms that appear in
# several topics; tidytext's reorder_within() + scale_x_reordered() order
# terms independently inside each facet.
woc_top_terms <- woc_topics %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  arrange(topic, beta)

woc_top_terms %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(term, beta, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  scale_x_reordered() +
  facet_wrap(~ topic, scales = "free") +
  coord_flip()