-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathwos_cr_match.Rmd
182 lines (158 loc) · 5.24 KB
/
wos_cr_match.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
---
title: "WoS Crossref Journal Match"
output: github_document
---
```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE,
warning = FALSE,
message = FALSE)
```
## Retrieve WoS-KB journal infos
Connect to database
```{r}
# deal with rJava memory allocation
# https://stackoverflow.com/questions/34624002/r-error-java-lang-outofmemoryerror-java-heap-space
options(java.parameters = "-Xmx1024m")
require(tidyverse)
require(RJDBC)
require(rJava)
.jinit()
jdbcDriver <-
JDBC(driverClass = "oracle.jdbc.OracleDriver", classPath = "../inst/jdbc_driver/ojdbc8.jar")
jdbcConnection <-
dbConnect(
jdbcDriver,
"jdbc:oracle:thin:@//biblio-p-db01:1521/bibliodb01.fiz.karlsruhe",
Sys.getenv("kb_user"),
Sys.getenv("kb_pwd")
)
```
Search criteria
- Database: wos_b_2019
- Document Types: Articles and Reviews
- Database Collections: `WOS.SCI`, `WOS.SSCI`, `WOS.AHCI`
- Publication Period 2014 - 2018
#### Query
```{sql connection=jdbcConnection, output.var="wos_jn"}
select
wos_b_2019.items.pubyear,
wos_b_2019.issues.issn,
count(distinct(ut_eid)) as pubs
from
wos_b_2019.items
inner join
wos_b_2019.databasecollection
on wos_b_2019.databasecollection.fk_items = wos_b_2019.items.pk_items
inner join
wos_b_2019.issues
on wos_b_2019.issues.pk_issues = wos_b_2019.items.fk_issues
where
wos_b_2019.databasecollection.edition_value in (
'WOS.SCI', 'WOS.SSCI', 'WOS.AHCI'
)
and wos_b_2019.items.doctype in (
'Article', 'Review'
)
and wos_b_2019.items.pubyear in (
2014, 2015, 2016, 2017, 2018
)
group by
wos_b_2019.items.pubyear,
wos_b_2019.issues.issn
```
Backup
```{r}
readr::write_csv(wos_jn, "../data/wos_jns.csv")
```
#### Populate with ISSN info including ISSN-L
```{r}
# load issn l list
wos_jn <- readr::read_csv("../data/wos_jns.csv")
issn_l <- readr::read_tsv("../data/20190818.ISSN-to-ISSN-L.txt")
issn_wos_match <- wos_jn %>%
distinct(ISSN) %>%
left_join(issn_l, by = c("ISSN"))
issn_variants <-
issn_wos_match %>%
inner_join(issn_l, by = c("ISSN-L")) %>%
distinct(`ISSN-L`, ISSN = ISSN.y)
issn_variants
```
#### Crossref Match
Per every ISSN-L, obtain all ISSN variants and query Crossref for most frequent publisher and journal title
Generate queries
```{r}
issns_list <-
purrr::map(unique(issn_variants$`ISSN-L`), function(x) {
issn_l <- x
issns <- issn_variants %>%
filter(`ISSN-L` %in% issn_l) %>%
.$ISSN
names(issns) <- rep("issn", length(issns))
issns
})
```
Define and call function
```{r}
#' search crossref
require(rcrossref)
jn_facets <- plyr::llply(issns_list, purrr::safely(function(x) {
tt <- rcrossref::cr_works(
filter = c(
x,
from_pub_date = "2014-01-01",
until_pub_date = "2018-12-31",
type = "journal-article"
),
# being explicit about facets improves API performance!
facet = "container-title:*,publisher-name:*",
# less api traffic
select = "DOI"
)
#' Parse the relevant information
#' - `journal_title` - Crossref journal title (in case of journal name change, we use the most frequent name)
#' - `publisher` - Crossref publisher (in case of publisher name change, we use the most frequent name)
#'
#' To Do: switch to current potential
if (!is.null(tt)) {
tibble::tibble(
issn = list(x),
journal_title = tt$facets$`container-title`$.id[1],
publisher = tt$facets$publisher$.id[1]
)
} else {
NULL
}
}), .progress = "text")
```
```{r}
#' Dump:
jn_facets_df <- purrr::map_df(jn_facets, "result")
cr_jn <- jn_facets_df %>%
unnest(issn)
readr::write_csv(cr_jn, "../data/cr_wos.csv")
```
#### ADD ISSN-L
```{r}
cr_journals <- readr::read_csv("../data/cr_wos.csv") %>%
# merge springer nature brands
mutate(publisher = ifelse(grepl("Springer", publisher, fixed = FALSE), "Springer Nature", publisher))
rp_country <- readr::read_csv("../data/rp_data_14_18.csv")
issn_l <- readr::read_tsv("../data/20190818.ISSN-to-ISSN-L.txt") %>%
# manual fix Journal - American Water Works
add_row(ISSN = "2164-4535", `ISSN-L` = "0003-150X")
rp_df <- rp_country %>%
left_join(issn_l, by = "ISSN") %>%
left_join(cr_journals, by = c(ISSN = "issn")) %>%
distinct()
```
#### Add OA info
```{r}
u <- "https://pub.uni-bielefeld.de/download/2934907/2934908/ISSN_Gold-OA_3.0.csv"
bie_oa <- readr::read_csv(u) %>%
# remove missing entries with missing ISSN_L
filter(!is.na(ISSN_L))
# add info to rp_df
rp_df_oa <- rp_df %>%
mutate(oa_journal = `ISSN-L` %in% bie_oa$`ISSN_L`)
```