-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathplot_wordfrequencies.R
155 lines (116 loc) · 6.33 KB
/
plot_wordfrequencies.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
# Plots with word frequencies in corpus
# Create DTM, but remove terms which occur in less than 0,1% of all documents
DTM <- corpus_tokens %>%
dfm() %>%
dfm_trim(min_docfreq = 0.001, max_docfreq = 0.99, docfreq_type = "prop")
# have a look at the number of documents and terms in the matrix
dim(DTM)
# due to vocabulary pruning, we have empty rows in our DTM
# LDA does not like this. So we remove those docs from the
# DTM and the metadata
sel_idx <- rowSums(DTM) > 0
DTM <- DTM[sel_idx, ]
textdata_new <- textdata_new[sel_idx, ]
# delete selected terms from DTM
# terms, which contains "â" or"œ" or "@" or "."
terms_to_delete <- c()
for (i in 1:dim(DTM)[2]){
if (grepl("â",colnames(DTM)[i], fixed = TRUE) | grepl("œ",colnames(DTM)[i], fixed = TRUE) | grepl(".",colnames(DTM)[i], fixed = TRUE) | grepl("@",colnames(DTM)[i], fixed = TRUE)) {
terms_to_delete <- append(terms_to_delete, colnames(DTM)[i])
}
}
DTM <- DTM[,!(colnames(DTM) %in% terms_to_delete)]
# Show most used terms
topfeatures(DTM, 100, decreasing = TRUE)
#########################################################################
# Plot wordcloud with most frequent words in corpus
require(wordcloud2)
top20terms <- topfeatures(DTM, 100, decreasing = TRUE)[1:30]
words <- names(top20terms)
# visualize the terms as wordcloud
wordcloud2(data.frame(words, top20terms), shuffle = FALSE)
#########################################################################
# Plot frequency of selected words over time
# check if terms exists in DTM before processing
DTM[,"protest"]
# terms:
# high word frequencies (> 5000): government, lockdown, hospital, infection, test, protest
# Impfung, Maßnahmen, Soziales/Soziale Interaktion/wie Menschen damit umgegangen sind
# Nebenerscheinungen: Hamsterkäufe, Demos,
# Maßnahmen: mask, distance, test, vaccine/vaccination/vaccinate, wash
# Allgemeines: government, lockdown, demonstrate/demonstration/resistance, science, who
# Krankheit: infection, symptom, test, antigen, pcr, proof, rapid, certificate, quarantine,
# center, recover, result, hospital, ventilator, intensive, disease, mask, spread
# different vaccines: biontech, vaccine/vaccination/vaccinate, moderna, pfizer, oxford, astrazeneca, booster
# different virus names: Covid, Covid-19, Coronavirus, Corona, Virus, variant, pandemic
# variants of virus: alpha (uk), beta (africa), gamma (brazil), delta (india), omicron, variant
# politics: johnson, trump, biden, putin, jinping, ghebreyesus (who), merkel, who
# country: china, america, uk, usa, europe, germany, india, africa, australia, russia, italy
pos_colors <- c('black','red','blue','green', "magenta", "cyan", "orange","salmon4", "lightskyblue1", "gray")
terms_to_observe <- c("astrazeneca", "pfizer","moderna", "sputnik", "janssen", "novavax", "booster")
term_colors <- pos_colors[1:length(terms_to_observe)]
DTM_reduced <- as.matrix(DTM[,terms_to_observe])
# Monat des Artikels abspeichern für Aggregation
textdata_new$month <- substr(textdata_new$date,0,7)
counts_per_month <- aggregate(DTM_reduced, by = list(months = textdata_new$month),sum)
# give x and y values beautiful names
months <- counts_per_month$months
frequencies <- counts_per_month[, terms_to_observe]
rownames(frequencies) <- months
#plot multiple frequencies
matplot(frequencies, type = "l", col=term_colors, ylab = "word frequencies", font.lab=2, xaxt='n',cex.lab=1.5 , cex.axis=1.2)
axis(side=1,at=1:nrow(frequencies),tick=1:28,labels=months,las=2, cex.axis=1.2)
# add legend to the plot
l <- length(terms_to_observe)
legend("topleft",legend = terms_to_observe, col=term_colors, text.col=term_colors, lty = 1:l, cex = 1.2)
#dev.off()
#########################################################################
# Plot (relative) frequency of selected words in a piechart
library(RColorBrewer)
# virus names:
words <- c("coronavirus", "covid-19", "sars-cov-2", "covid", "pandemic", "virus")
# variants of virus:
#words <- c("alpha", "beta", "gamma", "delta", "omicron")
# countries:
#words <- c("china", "uk", "europe", "america", "australia", "india", "russia", "usa")
# politicians:
# words <- c("johnson", "biden", "trump", "putin", "jinping", "merkel", "macron")
# vaccines:
#words <- c("astrazeneca", "oxford", "biontech", "pfizer", "moderna", "janssen", "novavax", "sputnik")
word_cnt <- numeric(length(words))
for (i in 1:nrow(textdata_new)){
word_counts <- str_count(tolower(textdata_new$text[i]), words)
word_counts_title <- str_count(tolower(textdata_new$title[i]), words)
word_cnt <- word_cnt + (word_counts + word_counts_title)
}
# list with possible colors:
pos_colors <- c('red','gray','green', "magenta", "orange", "cyan", "blue", "salmon4", "lightskyblue1", "gray")
# select colors according to number of selected words
term_colors <- pos_colors[1:length(words)]
# alternative for color selection:
nb.cols <- length(words)
# extend any list of colors
# available color palettes: http://www.sthda.com/english/wiki/ggplot2-colors-how-to-change-colors-automatically-and-manually
mycolors <- colorRampPalette(brewer.pal(8, "Set2"))(nb.cols)
# calculate relative word frequencies
sum_covid_words <- sum(word_cnt)
word_cnt_percentage <- word_cnt/sum_covid_words
# save absolute and relative word frequencies in dataframe
covid_words <- data.frame(words, word_cnt, word_cnt_percentage)
# Processing of data for the creation of the pie chart
piedata <- covid_words %>%
dplyr::arrange(desc(words)) %>%
dplyr::mutate(Position = (cumsum(word_cnt_percentage)-0.5*word_cnt_percentage))
# Plot piechart
ggplot(piedata, aes("", word_cnt_percentage, fill = words)) +
geom_bar(stat = "identity", width = 1, color = "white") +
coord_polar("y", start = 0) +
scale_fill_manual(values=mycolors) +
theme_void() +
geom_text(aes(y=Position, label = round(word_cnt_percentage*100,2)), color = "black", size = 8) + labs(fill = "Covid words:") + theme(
legend.title = element_text(size = 16, face = "bold"),
legend.text = element_text(size = 14)
)
# alternative: plot word frequencies as barplot
#ggplot(covid_words, aes(words, word_cnt)) + geom_bar(stat="identity")+ theme(axis.text.x = element_text(angle = 0, hjust = 1, vjust = 0.2)) +
#scale_y_continuous(name = "Number of articles") + labs(title = "Frequency of Covid-words in Articles")