forked from alexstorer/twittersauce
-
Notifications
You must be signed in to change notification settings - Fork 0
/
tweetanalysis.R
64 lines (45 loc) · 1.96 KB
/
tweetanalysis.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
# This is an exploration of topic modeling
# Some graphing components
#source("http://bioconductor.org/biocLite.R")
#biocLite("Rgraphviz")
# First, let's load up this pile of things.
Sys.setenv(NOAWT=TRUE)
# (Setting NOAWT above is a workaround for Macs.)
library(tm)
library(RWeka)
library(Rgraphviz)
library(slam)
#' Build a plain-text document from one source element
#'
#' Reader function with the (elem, language, id) signature; this script hands
#' it to `Corpus()` through `readerControl`.
#'
#' @param elem List-like element; only its `content` component is read.
#' @param language Language tag, forwarded unchanged to `PlainTextDocument()`.
#' @param id Document identifier, forwarded unchanged to `PlainTextDocument()`.
#' @return Whatever `PlainTextDocument()` returns for this element.
readvanilla <- function(elem, language, id) {
  # ideally, add in metadata for each document as well
  # Returned directly (no throwaway local), so the value is visible to callers.
  PlainTextDocument(elem$content, id = id, language = language)
}
# Load in the tweets.
# stringsAsFactors = FALSE keeps tweets$text a character vector on R versions
# before 4.0, where read.csv would otherwise convert it to a factor.
tweets <- read.csv(
  "/Users/astorer/Work/presentations/twitter/finalcode/twittersauce/example_tweets.csv",
  stringsAsFactors = FALSE
)

# Read just the tweet text into the corpus, using readvanilla as the reader
corpus <- Corpus(
  VectorSource(tweets$text),
  readerControl = list(reader = readvanilla)
)

# This should turn off locale-specific sorting
Sys.setlocale("LC_COLLATE", "C")
# Make our document-term matrix: no stemming, RWeka's WordTokenizer, default
# stopword removal, numbers and punctuation stripped, words of 3+ characters.
# NOTE(review): `minWordLength` is the legacy spelling of the minimum-length
# option; current tm documents `wordLengths = c(min, max)` instead. Both are
# passed so the 3-character minimum is explicit either way -- confirm against
# the installed tm version and drop the one it does not use.
dtm <- DocumentTermMatrix(
  corpus,
  control = list(
    stemming = FALSE,
    tokenize = WordTokenizer,
    stopwords = TRUE,
    minWordLength = 3,
    wordLengths = c(3, Inf),
    removeNumbers = TRUE,
    removePunctuation = TRUE
  )
)

# How many documents and terms do we have?
dim(dtm)
# What are the frequent terms? (kept in `ft` for interactive inspection)
ft <- findFreqTerms(dtm, lowfreq = 150, highfreq = 5000)

# Column sums of the matrix give one total per term; sort most frequent first
sorted_terms <- sort(col_sums(dtm), decreasing = TRUE)

# We can plot them. head() is used instead of [1:20] so that a vocabulary with
# fewer than 20 terms does not produce NA bars.
barplot(head(sorted_terms, 20), las = 2)

# Here are the 100 most popular terms in alphabetical order:
sort(names(head(sorted_terms, 100)))

# We can also plot the terms with lines indicating if they correlate.
# Uses the terms ranked 3rd to 20th (the two most frequent are skipped).
plot(
  dtm,
  terms = names(head(sorted_terms, 20))[-(1:2)],
  corThreshold = 0.075
)

# Here are terms that correlate with debt.
# The term is passed positionally: the parameter is named `terms` in current
# tm, so `term =` only worked there through partial argument matching.
findAssocs(dtm, "debt", 0.3)

# Here is a plot of some debt terms and how they are related.
# NOTE(review): current tm documents findAssocs() as returning a list with one
# named vector per queried term, so names() of the raw result would be just
# "debt". Unwrap the list when present; a plain named vector is used as is.
debt_assocs <- findAssocs(dtm, "debt", 0.2)
if (is.list(debt_assocs)) {
  debt_assocs <- debt_assocs[["debt"]]
}
# Keep "debt" itself in the plot so its links to the associated terms show up
debt_terms <- union("debt", names(debt_assocs))
plot(dtm, terms = debt_terms, corThreshold = 0.30)