index.Rmd

---
title: "A Collexeme Analysis in a Mini-PTT Corpus"
output: html_document
---
```{r setup, include=FALSE}
# Default chunk option for the whole document: echo each chunk's source code.
knitr::opts_chunk$set(echo = TRUE)
```
<br>
<CENTER>古貿昌</CENTER>
<CENTER>2021/1/7</CENTER>
<br>

```{r message=FALSE, results='hold', warning=FALSE}

library(tidyverse)
library(rvest)
library(jiebaR)
library(future.apply)
library(tidytext)
library(quanteda)

```


## 爬取資料
```{r eval=FALSE}

#' Scrape articles from a PTT board.
#'
#' @param ptt_url Base URL of the board, e.g.
#'   'https://www.ptt.cc/bbs/Gossiping'.
#' @param num_index_page Number of index pages to crawl, counting backwards
#'   from the highest index number linked on the board's landing page.
#' @return A tibble with one row per article and the columns 看板 (board),
#'   作者 (author), 標題 (title) and 內容 (body text).
scrapePTT <- function(ptt_url, num_index_page) {
  stopifnot(
    is.character(ptt_url), length(ptt_url) == 1,
    is.numeric(num_index_page), length(num_index_page) == 1,
    num_index_page >= 1
  )

  # create session: load the landing page once and submit the form found on
  # it through its 'yes' button (presumably the age-confirmation form), so the
  # session and the form come from the same response
  landing <- html_session(ptt_url)
  session <- submit_form(
    session = landing,
    form = landing %>%
      html_node('form') %>%
      html_form(),
    submit = 'yes'
  )

  # find latest page: numbers of every numbered index link on the page
  index_numbers <- session %>%
    html_nodes('a') %>% # extract all <a> elements
    html_attr('href') %>% # extract the attributes `href`
    str_subset('index[0-9]{2,}\\.html') %>% # keep numbered index links
    str_extract('(?<=index)[0-9]+') %>% # digits right after 'index' only
    as.integer()
  if (length(index_numbers) == 0) {
    stop('No numbered index link found on ', ptt_url, call. = FALSE)
  }
  # several links may match; the largest number is the most recent page
  page.latest <- max(index_numbers)

  # collect the absolute URLs of all articles listed on one index page
  extract_art_links <- function(index_page) {
    session %>%
      jump_to(index_page) %>%
      html_nodes('a') %>%
      html_attr('href') %>%
      # `[A-Za-z]` rather than `[A-z]`: the latter also matches [ \ ] ^ _ `
      str_subset('[A-Za-z]\\.[0-9]+\\.[A-Za-z]\\.[A-Za-z0-9]+\\.html') %>%
      str_c('https://www.ptt.cc', .)
  }

  # download one article and return it as a one-row tibble
  extract_art_df <- function(link) {
    temp.html <- session %>%
      jump_to(link)

    # article header: meta values in the order author, board, title
    article.header <- temp.html %>%
      html_nodes('span.article-meta-value') %>%
      html_text()

    # article meta
    article.author <- str_extract(article.header[1], '^[A-Za-z0-9_]+')
    board.name <- article.header[2]
    article.title <- article.header[3]

    # article content: children of #main-content except <div> nodes and
    # <span class="f2"> nodes, concatenated into one string
    article.content <- temp.html %>%
      html_nodes(
        xpath = '//div[@id="main-content"]/node()[not(self::div|self::span[@class="f2"])]'
      ) %>%
      html_text(trim = TRUE) %>%
      str_c(collapse = '')

    tibble(
      看板 = board.name,
      作者 = article.author,
      標題 = article.title,
      內容 = article.content
    )
  }

  # merge altogether: build the index page URLs in one vectorised call
  # (never going below page 1), gather the article links, then the articles
  first_page <- max(1, page.latest - (num_index_page - 1))
  index_pages <- paste0(ptt_url, '/index', first_page:page.latest, '.html')

  links <- index_pages %>%
    map(extract_art_links) %>%
    unlist(use.names = FALSE)

  links %>%
    map(extract_art_df) %>%
    bind_rows()
}

ptt.url <- 'https://www.ptt.cc/bbs/Gossiping'

# `multiprocess` is deprecated in the future package, so the multisession
# backend is requested explicitly.
# NOTE(review): scrapePTT() iterates with purrr::map(), so this plan does not
# appear to parallelise the scrape itself — confirm whether it is still needed.
plan(multisession, workers = 8, gc = TRUE)

# Crawl 300 index pages, number the articles, and put the board and the
# running number in the first two columns.
PTT_df <- scrapePTT(ptt.url, 300) %>%
  mutate(編號 = row_number()) %>%
  select(看板, 編號, everything())

# save
write_csv(PTT_df, 'PTT_GSP.csv')

```


## 讀取檔案
```{r message=FALSE, results='hold', warning=FALSE}

# Drop every post whose title contains 'Re' and renumber the remaining rows.
# NOTE(review): 'Re' is matched anywhere in the title, not only as a leading
# reply marker — confirm that this is the intended filter.
drop_re_titles <- function(board_df) {
  board_df %>%
    filter(!str_detect(標題, 'Re')) %>%
    mutate(編號 = row_number())
}

HP_df <- drop_re_titles(read_csv('PTT_HP.csv'))
BG_df <- drop_re_titles(read_csv('PTT_BG.csv'))

# This file is kept whole; only its board label is overwritten.
GSP_df <- mutate(read_csv('PTT_GSP.csv'), 看板 = '八卦')

# All three boards stacked into one table.
PTT_df <- rbind(HP_df, BG_df, GSP_df)

head(HP_df)
head(BG_df)
head(GSP_df)

```

## 數據統計
```{r message=FALSE, results='hold', warning=FALSE}

boards <- c('政黑', '男女', '八卦')

# number of posts per board
HP_npost <- nrow(HP_df)
BG_npost <- nrow(BG_df)
GSP_npost <- nrow(GSP_df)

# number of characters per board; NA lengths are left out of the sum
HP_nchar <- sum(nchar(HP_df$內容), na.rm = TRUE)
BG_nchar <- sum(nchar(BG_df$內容), na.rm = TRUE)
GSP_nchar <- sum(nchar(GSP_df$內容), na.rm = TRUE)

# one row per word token of the post body
tidy_HP <- unnest_tokens(HP_df, word, 內容, token = 'words')
tidy_BG <- unnest_tokens(BG_df, word, 內容, token = 'words')
tidy_GSP <- unnest_tokens(GSP_df, word, 內容, token = 'words')

# tokens = number of rows, words = number of distinct tokens
HP_ntok <- nrow(tidy_HP)
BG_ntok <- nrow(tidy_BG)
GSP_ntok <- nrow(tidy_GSP)

HP_nword <- n_distinct(tidy_HP$word)
BG_nword <- n_distinct(tidy_BG$word)
GSP_nword <- n_distinct(tidy_GSP$word)

board_stats <- tibble(
  看板 = boards,
  篇數 = c(HP_npost, BG_npost, GSP_npost),
  字數 = c(HP_nchar, BG_nchar, GSP_nchar),
  tokens = c(HP_ntok, BG_ntok, GSP_ntok),
  words = c(HP_nword, BG_nword, GSP_nword)
)

# Totals row built as a tibble so the count columns stay numeric; appending
# c('合計', ...) with rbind() would turn every column into character.
# NOTE(review): the `words` total adds up per-board distinct counts, so a word
# used on several boards is counted once per board — confirm this is intended.
board_totals <- board_stats %>%
  summarise(across(c(篇數, 字數, tokens, words), sum)) %>%
  mutate(看板 = '合計')

bind_rows(board_stats, board_totals)

```


```{r message=FALSE, results='hold', warning=FALSE}

# Per-board frequency tables: one row per distinct word with its count `n`.
HP_freq <- count(tidy_HP, word)
BG_freq <- count(tidy_BG, word)
GSP_freq <- count(tidy_GSP, word)

#' Look up how often a word occurs in a frequency table.
#'
#' @param x A data frame with a `word` column and a count column `n`.
#' @param y The word to look up (a single string).
#' @return The count stored for `y`, or 0 when `y` is not in `x$word`.
count_freq <- function(x, y) {
  hits <- x$n[x$word %in% y]
  if (length(hits) == 0) {
    return(0)
  }
  # one row per word is expected; sum() keeps the result a single number
  # even if the table lists the word more than once
  sum(hits)
}
  
# The three verbs under comparison.
verbs <- c('思念', '想念', '懷念')

# Occurrences of each verb on each board, one column per board.
tok_com <- tibble(
  動詞 = verbs,
  政黑 = map_dbl(verbs, ~ count_freq(HP_freq, .x)),
  男女 = map_dbl(verbs, ~ count_freq(BG_freq, .x)),
  八卦 = map_dbl(verbs, ~ count_freq(GSP_freq, .x))
)

tok_com

# Reshape to one row per verb/board pair and draw dodged bars per verb.
tok_com %>%
  pivot_longer(-動詞, names_to = '看板', values_to = '出現次數') %>%
  ggplot(aes(動詞, 出現次數, fill = 看板)) +
  geom_col(position = 'dodge') +
  scale_fill_manual(values = c('#00b81f', '#f8766d', '#00a5ff'))

```


## 關鍵詞語境
```{r message=FALSE, warning=FALSE}

# Regex for 思念 / 想念 / 懷念 delimited by word boundaries.
pattern <- '\\b[思想懷]念\\b'

#' Show a random sample of keyword-in-context lines for one board.
#'
#' @param board_corpus A quanteda corpus holding the board's posts.
#' @param board_label Label written into the `board` column.
#' @param keyword_regex Regular expression matched against the tokens.
#' @param n_show Maximum number of concordance lines to show.
#' @return A knitr::kable table with the columns board, id, pre, keyword, post.
show_board_kwic <- function(board_corpus, board_label,
                            keyword_regex = pattern, n_show = 10) {
  board_corpus %>%
    # tokenise explicitly: newer quanteda releases deprecate passing a corpus
    # straight to kwic()
    tokens() %>%
    kwic(keyword_regex, valuetype = 'regex') %>%
    as.data.frame() %>%
    mutate(board = board_label, id = row_number()) %>%
    select(board, id, pre, keyword, post) %>%
    # unlike sample_n(), slice_sample() returns all rows instead of failing
    # when there are fewer than `n_show` matches
    slice_sample(n = n_show) %>%
    arrange(id) %>%
    knitr::kable(align = 'c')
}

corpus_HP <- corpus(HP_df,
                    docid_field = '編號',
                    text_field = '內容')

show_board_kwic(corpus_HP, '政黑')

corpus_BG <- corpus(BG_df,
                    docid_field = '編號',
                    text_field = '內容')

show_board_kwic(corpus_BG, '男女')

corpus_GSP <- corpus(GSP_df,
                     docid_field = '編號',
                     text_field = '內容')

show_board_kwic(corpus_GSP, '八卦')

```

```{r message=FALSE, results='hold', warning=FALSE}

corpus_PTT <- corpus(PTT_df, text_field = '內容')

# Tokenise the combined corpus once and reuse it for all three verbs.
tokens_PTT <- tokens(corpus_PTT)

#' Show a random sample of keyword-in-context lines for one verb.
#'
#' @param verb Keyword passed to kwic() with its default pattern matching.
#' @param n_show Maximum number of concordance lines to show.
#' @return A knitr::kable table with the columns id, pre, keyword, post.
show_verb_kwic <- function(verb, n_show = 10) {
  tokens_PTT %>%
    kwic(verb) %>%
    as.data.frame() %>%
    mutate(id = row_number()) %>%
    select(id, pre, keyword, post) %>%
    # returns all rows instead of failing when fewer than `n_show` match
    slice_sample(n = n_show) %>%
    arrange(id) %>%
    knitr::kable(align = 'c')
}

show_verb_kwic('思念')
show_verb_kwic('想念')
show_verb_kwic('懷念')

```


## 直接賓語分布
```{r message=FALSE, results='hold', warning=FALSE}

# initialize `jiebar`: one worker of type 'tag' and one default worker, both
# created with `symbol` and `bylines` switched off (spelled out as FALSE
# because the shorthand `F` is an ordinary variable that can be reassigned)
tagger <- worker(type = 'tag', symbol = FALSE, bylines = FALSE)
segmenter <- worker(symbol = FALSE, bylines = FALSE)

# Helper functions that wrap the jiebaR workers.

# Run `x` through `tagger` and return one string of space-separated
# "element/name" pairs, pairing every element of the worker's result with
# its name (presumably word/part-of-speech tag).
tag_text <- function(x) {
  tagged <- tagger[x]
  paste(tagged, names(tagged), sep = '/', collapse = ' ')
}

# Run `x` through `segmenter` and join the resulting pieces with single
# spaces into one string.
seg_text <- function(x) {
  pieces <- segmenter[x]
  paste(pieces, collapse = ' ')
}

# Stack the three boards and add a segmented and a tagged version of every
# post body.
PTT_df <- rbind(HP_df, BG_df, GSP_df)

PTT_df_tagged <- mutate(
  PTT_df,
  內容_seg = map_chr(內容, seg_text),
  內容_tag = map_chr(內容, tag_text)
)

# define regex: one of the three verbs with its tag, then lazily as few
# "word/tag" stretches as possible, ending at the first word whose tag starts
# with `n` or `r` (presumably noun/pronoun tags, i.e. the object candidate)
pattern_tag <- '\\b[思想懷]念/[a-z]+([^/]+/[a-z]+)*?[^/]+/[nr]'

# Tokeniser for unnest_tokens(): every match of `pattern_tag` is one token.
extract_constructions <- function(x) {
  str_extract_all(x, pattern = pattern_tag)
}

# Extract patterns: one row per match; `pattern` is the match with its tags
# stripped and `obj` is the last whitespace-separated word of `pattern`.
PTT_missing <- PTT_df_tagged %>%
  unnest_tokens(output = construction,
                input = 內容_tag,
                token = extract_constructions) %>%
  mutate(
    pattern = str_remove_all(construction, '/[a-z]+'),
    obj = str_replace(pattern, '.+\\s([^\\s]+)$', '\\1')
  ) %>%
  select(看板, 編號, pattern, obj)

# Preview ten randomly chosen rows, ordered by board and post number.
knitr::kable(
  arrange(sample_n(PTT_missing, 10), 看板, 編號),
  align = 'c'
)

# Annotated verb-object table read from disk.
missing_df_annot <- read_csv('missing_df_annot.csv')

# Print, for one board, four rows per keyword.
# NOTE(review): top_n(4, object) ranks on the `object` column itself, so
# which four rows are kept depends on how the object values sort — confirm
# that this is the intended selection.
show_board_objects <- function(board_name) {
  missing_df_annot %>%
    select(board, keyword, object, animacy, repeatability) %>%
    filter(board == board_name) %>%
    group_by(keyword) %>%
    arrange(keyword) %>%
    top_n(4, object) %>%
    knitr::kable(align = 'c')
}

show_board_objects('政黑')

show_board_objects('男女')

# Objects occurring more than once with the same board and keyword, sorted by
# board, keyword and descending count.
missing_df_obj_info <- missing_df_annot %>%
  count(object, board, keyword) %>%
  filter(n > 1) %>%
  arrange(board, keyword, desc(n))

knitr::kable(missing_df_obj_info, align = 'c')

```


```{r message=FALSE, results='hold', warning=FALSE}

# Horizontal dodged bars of object counts, filled by board, with one facet
# per keyword (facets ordered through reorder(keyword, n)).
ggplot(
  mutate(missing_df_obj_info, keyword = reorder(keyword, n)),
  aes(object, n, fill = board)
) +
  geom_col(position = 'dodge') +
  facet_wrap(~keyword, scales = 'free_x') +
  coord_flip() +
  labs(x = '直接賓語', y = '出現次數')

```


## Prepare for collexeme analysis
```{r message=FALSE, results='hold', warning=FALSE}

# word freq: split the segmented text on whitespace / ideographic spaces,
# drop empty strings and count every word
PTT_word <- PTT_df_tagged %>%
  unnest_tokens(word, 內容_seg,
                token = function(x) str_split(x, '\\s+|\u3000')) %>%
  filter(nzchar(word)) %>%
  count(word, sort = TRUE)

#' Joint frequency table for one verb.
#'
#' @param verb Verb whose extracted patterns are kept (matched with
#'   str_detect() against the `pattern` column of PTT_missing).
#' @return A tibble with one row per distinct pattern: `pattern`, its count
#'   `n`, the last word of the pattern `w1`, and `w1_freq`, the count of `w1`
#'   in PTT_word (NA when `w1` is not listed there).
build_joint_freq_table <- function(verb) {
  # NOTE(review): rows are counted per pattern, so the same `w1` can occur in
  # several rows when the words between verb and object differ — confirm
  # whether the collexeme input should be aggregated per `w1`.
  PTT_missing %>%
    filter(str_detect(pattern, verb)) %>%
    count(pattern, sort = TRUE) %>%
    mutate(
      w1 = str_replace(pattern, '.+\\s([^\\s]+)$', '\\1'),
      w1_freq = PTT_word$n[match(w1, PTT_word$word)]
    )
}

# Joint frequency table
PTT_sinian_table <- build_joint_freq_table('思念')
PTT_xiangnian_table <- build_joint_freq_table('想念')
PTT_huainian_table <- build_joint_freq_table('懷念')

# prepare for coll analysis: word, corpus frequency, frequency with the verb
write_tsv(select(PTT_sinian_table, w1, w1_freq, n), 'sinian.tsv')
write_tsv(select(PTT_xiangnian_table, w1, w1_freq, n), 'xiangnian.tsv')
write_tsv(select(PTT_huainian_table, w1, w1_freq, n), 'huainian.tsv')

```

## Run collexeme analysis
```{r eval=FALSE}

# save info in a text file. cat() writes straight to the file instead of
# redirecting console output with sink(), so no diversion is left open if one
# of the sums fails.
info_file <- 'missing_info.txt'
cat('Corpus Size: ', sum(PTT_word$n), '\n', file = info_file)
cat('Freq of sinian: ', sum(PTT_sinian_table$n), '\n',
    file = info_file, append = TRUE)
cat('Freq of xiangnian: ', sum(PTT_xiangnian_table$n), '\n',
    file = info_file, append = TRUE)
cat('Freq of huainian: ', sum(PTT_huainian_table$n), '\n',
    file = info_file, append = TRUE)

# Create new (empty) result files, one per verb
file.create('sinian_results.txt')
file.create('xiangnian_results.txt')
file.create('huainian_results.txt')

# Run the external collostructional-analysis script.
source('coll.analysis.r')

# Answers recorded for the script's prompts.
# NOTE(review): `<missing.tsv>` / `<missing_results.txt>` look like
# placeholders for the per-verb files (sinian / xiangnian / huainian), and the
# three construction frequencies presumably belong to one run each — confirm.
# analysis to perform: 1
# name of construction: missing
# corpus size: 5335525
# freq of constructions: 65; 95; 100
# index of association strength: 2 (=log-likelihood)
# sorting: 4 (=collostruction strength)
# decimals: 2
# text file with the raw data: <missing.tsv>
# Where to save output: 1 (= text file)
# output file: <missing_results.txt>

```


## 結果
```{r message=FALSE, results='hold', warning=FALSE}

#' Read the table part of a collexeme-analysis output file.
#'
#' @param path Path of the result text file.
#' @return The file's lines without the first 17 and the last 18 lines; an
#'   empty vector when the file is too short to contain anything else.
read_collo_lines <- function(path) {
  # 'UTF-8' must be spelled exactly like this for R to mark the strings as
  # UTF-8; head()/tail() with negative counts never fail on short files
  readLines(path, encoding = 'UTF-8') %>%
    tail(-17) %>%
    head(-18)
}

#' Parse lines of tab-separated text into a tibble.
#'
#' @param lines Character vector, one element per line (header first).
#' @return The parsed tibble.
parse_collo_table <- function(lines) {
  # A single string that contains a newline is read as literal data by readr;
  # a multi-element vector would be taken as file paths by newer versions.
  read_tsv(paste0(paste(lines, collapse = '\n'), '\n'))
}

# load the output txt
sinian_results <- read_collo_lines('sinian_results.txt')
xiangnian_results <- read_collo_lines('xiangnian_results.txt')
huainian_results <- read_collo_lines('huainian_results.txt')

# convert into a table
sinian_collo_table <- parse_collo_table(sinian_results)
xiangnian_collo_table <- parse_collo_table(xiangnian_results)
huainian_collo_table <- parse_collo_table(huainian_results)

```


#### Collocation of *思念* in PTT
```{r message=FALSE, results='hold', warning=FALSE}

# First ten rows of the 思念 table after sorting by descending coll.strength.
knitr::kable(
  head(arrange(sinian_collo_table, desc(coll.strength)), 10),
  align = 'c'
)

```

#### Collocation of *想念* in PTT
```{r message=FALSE, results='hold', warning=FALSE}

# First ten rows of the 想念 table after sorting by descending coll.strength.
knitr::kable(
  head(arrange(xiangnian_collo_table, desc(coll.strength)), 10),
  align = 'c'
)

```

#### Collocation of *懷念* in PTT
```{r message=FALSE, results='hold', warning=FALSE}

# First ten rows of the 懷念 table after sorting by descending coll.strength.
knitr::kable(
  head(arrange(huainian_collo_table, desc(coll.strength)), 10),
  align = 'c'
)

```


```{r message=FALSE, results='hold', warning=FALSE}

#' Bar chart of the strongest attracted collexemes of one verb.
#'
#' @param collo_table Table with the columns `words`, `obs.freq`,
#'   `coll.strength` and `relation`.
#' @param verb Verb shown in the plot title.
#' @param n_top Number of top rows to keep by `coll.strength` (ties kept).
#' @return A ggplot object.
plot_top_collexemes <- function(collo_table, verb, n_top = 10) {
  collo_table %>%
    filter(relation == 'attraction') %>%
    select(words, obs.freq, coll.strength) %>%
    slice_max(coll.strength, n = n_top) %>%
    ggplot(aes(reorder(words, coll.strength), coll.strength,
               fill = coll.strength)) +
    geom_col(show.legend = FALSE) +
    coord_flip() +
    labs(x = 'Keywords',
         y = 'Strength (G2)',
         title = paste0('Collocation of ', verb, ' in PTT')) +
    theme(text = element_text(family = 'Arial Unicode MS'))
}

plot_top_collexemes(sinian_collo_table, '思念')

plot_top_collexemes(xiangnian_collo_table, '想念')

plot_top_collexemes(huainian_collo_table, '懷念')

```

## 參考文獻

Brezina, Vaclav. 2018. *Statistics in Corpus Linguistics: A Practical Guide.* Cambridge: Cambridge University Press.   
Desagulier, Guillaume. 2018. *Corpus Linguistics and Statistics with R.* Cham, Switzerland: Springer.  
Gries, Stefan T. 2017. *Quantitative Corpus Linguistics with R: A Practical Introduction.* Second Edition. Routledge.  
洪漢唐 & 江琼玉. 2020. 小農手作：語料庫索引與建置. HOCOR 2020.