analysis/wos_cr_match.Rmd

---
title: "WoS Crossref Journal Match"
output: github_document
---


```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE, 
                      warning = FALSE, 
                      message = FALSE)
```

## Retrieve WoS-KB journal infos

Connect to database

```{r}
# deal with rJava memory allocation 
# https://stackoverflow.com/questions/34624002/r-error-java-lang-outofmemoryerror-java-heap-space
options(java.parameters = "-Xmx1024m")
require(tidyverse)
require(RJDBC)
require(rJava)
.jinit()
jdbcDriver <-
  JDBC(driverClass = "oracle.jdbc.OracleDriver", classPath = "../inst/jdbc_driver/ojdbc8.jar")
jdbcConnection <-
  dbConnect(
    jdbcDriver,
    "jdbc:oracle:thin:@//biblio-p-db01:1521/bibliodb01.fiz.karlsruhe",
    Sys.getenv("kb_user"),
    Sys.getenv("kb_pwd")
  ) 
```

Search criteria 

- Database: wos_b_2019
- Document Types: Articles and Reviews
- Database Collections: `WOS.SCI`, `WOS.SSCI`, `WOS.AHCI`
- Publication Period 2014 - 2018

#### Query 

```{sql connection=jdbcConnection, output.var="wos_jn"}
select
        wos_b_2019.items.pubyear,
        wos_b_2019.issues.issn,
        count(distinct(ut_eid)) as pubs                         
    from
        wos_b_2019.items                                            
    inner join
        wos_b_2019.databasecollection                                                                                          
            on wos_b_2019.databasecollection.fk_items =  wos_b_2019.items.pk_items                                  
    inner join
        wos_b_2019.issues                                                        
            on wos_b_2019.issues.pk_issues = wos_b_2019.items.fk_issues                             
    where
        wos_b_2019.databasecollection.edition_value in (
            'WOS.SCI', 'WOS.SSCI', 'WOS.AHCI'                                                   
        )                                                                              
        and wos_b_2019.items.doctype in (
            'Article', 'Review'                                                   
        )                                                                               
        and wos_b_2019.items.pubyear in (
            2014, 2015, 2016, 2017, 2018                                                   
        )                                                  
    group by
        wos_b_2019.items.pubyear,
        wos_b_2019.issues.issn
```

Backup 

```{r}
readr::write_csv(wos_jn, "../data/wos_jns.csv")
```

#### Populate with ISSN info including ISSN-L

```{r}
# load issn l list
wos_jn <- readr::read_csv("../data/wos_jns.csv")
issn_l <- readr::read_tsv("../data/20190818.ISSN-to-ISSN-L.txt")
issn_wos_match <- wos_jn %>%
  distinct(ISSN) %>%
  left_join(issn_l, by = c("ISSN")) 
issn_variants <-
  issn_wos_match %>% 
  inner_join(issn_l, by = c("ISSN-L")) %>%
  distinct(`ISSN-L`, ISSN = ISSN.y)
issn_variants
```

#### Crossref Match 

Per every ISSN-L, obtain all ISSN variants and query Crossref for most frequent publisher and journal title

Generate queries

```{r}
issns_list <-
  purrr::map(unique(issn_variants$`ISSN-L`), function(x) {
    issn_l <- x
    issns <- issn_variants %>%
      filter(`ISSN-L` %in% issn_l) %>%
      .$ISSN
    names(issns) <- rep("issn", length(issns))
    issns
  })
```

Define and call function

```{r}
#' search crossref
require(rcrossref)
jn_facets <- plyr::llply(issns_list, purrr::safely(function(x) {
  tt <- rcrossref::cr_works(
    filter = c(
      x,
      from_pub_date = "2014-01-01",
      until_pub_date = "2018-12-31",
      type = "journal-article"
    ),
    # being explicit about facets improves API performance!
    facet = "container-title:*,publisher-name:*",
    # less api traffic
    select = "DOI"
  )
  #' Parse the relevant information
  #' - `journal_title` - Crossref journal title (in case of journal name change, we use the most frequent name)
  #' - `publisher` - Crossref publisher (in case of publisher name change, we use the most frequent name)
  #'
  #' To Do: switch to current potential
  if (!is.null(tt)) {
    tibble::tibble(
      issn = list(x),
      journal_title = tt$facets$`container-title`$.id[1],
      publisher = tt$facets$publisher$.id[1]
    )
  } else {
    NULL
  }
}), .progress = "text")
```

```{r}
#' Dump:
jn_facets_df <- purrr::map_df(jn_facets, "result")
cr_jn <- jn_facets_df %>% 
  unnest(issn)
readr::write_csv(cr_jn, "../data/cr_wos.csv")
```

#### ADD ISSN-L

```{r}
cr_journals <- readr::read_csv("../data/cr_wos.csv") %>%
  # merge springer nature brands
  mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE), "Springer Nature", publisher))
rp_country <- readr::read_csv("../data/rp_data_14_18.csv")
issn_l <- readr::read_tsv("../data/20190818.ISSN-to-ISSN-L.txt") %>%
  # manual fix Journal - American Water Works 
  add_row(ISSN = "2164-4535", `ISSN-L` = "0003-150X")
rp_df <- rp_country %>%
  left_join(issn_l, by = "ISSN") %>%
  left_join(cr_journals, by = c(ISSN = "issn")) %>%
  distinct()
```

#### Add OA info

```{r}
u <- "https://pub.uni-bielefeld.de/download/2934907/2934908/ISSN_Gold-OA_3.0.csv"
bie_oa <- readr::read_csv(u) %>%
  # remove missing entries with missing ISSN_L
  filter(!is.na(ISSN_L))
# add info to rp_df
rp_df_oa <- rp_df %>%
  mutate(oa_journal = `ISSN-L` %in% bie_oa$`ISSN_L`)
```