# R script to scrape Google News headlines, created on 07/01/2019
# Clean the workspace -----------------------------------------------------
rm(list = ls()); cat("\014") # remove all objects and clear the console
# Load the packages -------------------------------------------------------
library("rvest")
library("tm")
library("wordcloud")
url <- "https://news.google.com/search?q=Ageas%20Insurance%20UK&hl=en-GB&gl=GB&ceid=GB%3Aen"
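# the query string is URL-encoded: %20 stands for a space and %3A for ":" (so ceid=GB%3Aen means ceid=GB:en)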
# webpage <- read_html(url)
# node <- html_nodes(webpage, xpath = '//*[@class="ZulkBc qNiaOd"]') # other classes seen on the page: "mEaVNd", "r dO0Ag", "l lLrAF", ".r", ".g"
# # the second argument to html_nodes() identifies the nodes; a suitable selector can be found with SelectorGadget, an extension for Google Chrome
text <- url %>%
  read_html() %>%
  html_nodes(xpath = '//*[@class="ZulkBc qNiaOd"]') %>% # class of the headline nodes (also seen: "mEaVNd")
  html_text()
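# note: these class names are auto-generated by Google and change over time, so the
# XPath above may need updating; check the current markup with SelectorGadget or the browser inspector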
class(text) # text is a character vector; it must be converted into a corpus (a collection of documents) with VectorSource() and Corpus() from the "tm" package
## Load the data as a corpus
docs <- Corpus(VectorSource(text))
## inspect the content of the corpus (optional)
inspect(docs)
# Text transformation and cleaning -----------------------------------------------------
# Transformations are performed with the tm_map() function; here special characters are replaced with spaces.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "'")
docs <- tm_map(docs, toSpace, "/")
docs <- tm_map(docs, toSpace, "@")
docs <- tm_map(docs, toSpace, "\\|")
# tm_map() is also used to strip unnecessary white space, convert the text to lower case, and remove common stopwords such as "the" and "we".
# For stopwords(), supported languages are danish, dutch, english, finnish, german, hungarian, italian, norwegian, portuguese, russian, spanish, and swedish. Language names are case sensitive.
# Cleaning the text -------------------------------------------------------
# Convert the text to lower case
docs <- tm_map(docs, content_transformer(tolower))
# Remove numbers
docs <- tm_map(docs, removeNumbers)
# Remove common English stopwords
docs <- tm_map(docs, removeWords, stopwords("english"))
# Remove your own stopwords
# specify your stopwords as a character vector
docs <- tm_map(docs, removeWords,
               c("ageas", "insurance", "year", "latest", "insurers", "insurer", "today", "back",
                 "says", "take", "view", "use", "will", "now", "like", "eur", "i", "l", "can",
                 "table", "said", "even", "ltd", "way", "group", "company", "well", "according",
                 "firm", "plc", "know", "may", "told", "readers", "kong", "don", "number",
                 "–", "industry"))
# Remove punctuations
docs <- tm_map(docs, removePunctuation)
# Eliminate extra white spaces
docs <- tm_map(docs, stripWhitespace)
# Text stemming
# docs <- tm_map(docs, stemDocument)
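# note: stemDocument() relies on the "SnowballC" package, which must be installed before enabling this step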
### Generate the word cloud and the frequency plot
# Before generating the cloud, we build the term-document matrix: the row names are the terms and the column names are the documents. The TermDocumentMatrix() function from the "tm" package is used.
# Build a term-document matrix --------------------------------------------
dtm <- TermDocumentMatrix(docs)
m <- as.matrix(dtm)
v <- sort(rowSums(m), decreasing = TRUE) # row sums give the total frequency of each term
d <- data.frame(word = names(v), freq = v)
head(d, 10)
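# d is sorted by decreasing frequency, so d$word[1] is the most frequent term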
# require(berryFunctions)
# insert_loc=15
# insert_df=as.data.frame(t(c(word="[email protected]",freq=d[insert_loc,"freq"]+1)),row.names = "Yongchao")
# insertRows(d, seq(insert_loc,insert_loc+nrow(insert_df)-1), new=insert_df)
# Generate the Word cloud -------------------------------------------------
# The cloud is generated with the wordcloud() function from the "wordcloud" package.
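# fix the random seed so the word placement in the cloud is reproducible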
set.seed(1234)
X11() # open a new graphics window
par(mar = c(0, 0, 0, 0)) # remove the plot margins so the cloud fills the device
wordcloud(words = d$word, freq = d$freq, min.freq = 1,
          max.words = 200, random.order = FALSE, rot.per = 0.35,
          colors = brewer.pal(8, "Dark2"))
# ## The same cloud after dropping the top word, "insurance":
# X11()
# wordcloud(words = d$word[-1], freq = d$freq[-1], min.freq = 1,
#           max.words = 200, random.order = FALSE, rot.per = 0.35,
#           colors = brewer.pal(8, "Dark2"))
# Explore frequent terms and their associations ---------------------------
# find the terms that occur at least 10 times
findFreqTerms(dtm, lowfreq = 10)
# analyse the association between frequent terms (i.e. terms which correlate) with findAssocs();
# here we identify which words are associated with "insurance", keeping only correlations >= 0.3
findAssocs(dtm, terms = "insurance", corlimit = 0.3)
# Plot word frequencies ---------------------------------------------------
X11() # open a new graphics window
par(mar = c(6, 5, 2, 0))
barplot(d[1:10, ]$freq, las = 2, names.arg = d[1:10, ]$word,
        col = "lightsalmon", main = "Most frequent words",
        ylab = "Word frequencies")
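# Optional: a minimal sketch for saving the cloud to a PNG file instead of an
# interactive X11 device (the file name "ageas_wordcloud.png" is only illustrative):
# png("ageas_wordcloud.png", width = 800, height = 800)
# par(mar = c(0, 0, 0, 0))
# wordcloud(words = d$word, freq = d$freq, min.freq = 1,
#           max.words = 200, random.order = FALSE, rot.per = 0.35,
#           colors = brewer.pal(8, "Dark2"))
# dev.off()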