-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathmerge.Rmd
262 lines (218 loc) · 7.32 KB
/
merge.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
---
title: "Merge Indications"
output:
  html_document:
    theme: cosmo
    highlight: pygments
---
```{r, message=FALSE}
library(readr)
library(dplyr)
library(DT)
```
## Read input for compound mapping
```{r}
# DrugBank compounds: keep the identifier and expose the `name` column as
# `drugbank_name`; every other column is dropped by transmute().
drugbank_df <- dplyr::transmute(
  readr::read_tsv('https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank.tsv'),
  drugbank_id,
  drugbank_name = name
)

# Retrieve the DrugBank Slim compounds as a plain vector of identifiers.
drugbank_slim <-
  'https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/drugbank-slim.tsv' %>%
  readr::read_tsv() %>%
  .[['drugbank_id']]
# Build the RxNorm-to-DrugBank mapping (through FDA-SRS UNII): the local
# RxCUI/UNII table is inner-joined with the DrugBank FDA-SRS mapping.
# No `by` is supplied, so the join keys are all column names the two
# tables have in common.
rxnorm_df <- file.path('data', 'rxcui-unii-map.tsv') %>%
  readr::read_tsv() %>%
  dplyr::inner_join(
    readr::read_tsv('https://raw.githubusercontent.com/dhimmel/drugbank/3e87872db5fca5ac427ce27464ab945c0ceb4ec6/data/mapping/fdasrs.tsv')
  )
```
## Read input for disease mapping
```{r}
# Disease Ontology terms: keep only the rows whose `type` is 'name', then
# reduce them to two columns, `doid_code` (from `doid`) and `doid_name`
# (from `name`).
do_df <- dplyr::transmute(
  dplyr::filter(
    readr::read_tsv('https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/term-names.tsv'),
    type == 'name'
  ),
  doid_code = doid,
  doid_name = name
)
# Read the Disease Ontology slim term propagations. The table is downloaded
# once and reused below, rather than fetching the same URL a second time
# just to pull one column out of it.
doslim_map_df <-
  'https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/slim-terms-prop.tsv' %>%
  readr::read_tsv()
# Retrieve DO Slim diseases: the distinct `subsumed_id` values of the
# propagation table.
do_slim <- unique(doslim_map_df$subsumed_id)
# Disease Ontology cross-references; `resource` names the external
# vocabulary and `resource_id` holds the identifier in that vocabulary.
domap_df <- readr::read_tsv('https://raw.githubusercontent.com/dhimmel/disease-ontology/72614ade9f1cc5a5317b8f6836e1e464b31d5587/data/xrefs.tsv')

# DO to UMLS: keep the UMLS rows, drop the now-constant `resource` column
# and expose the identifier as `disease_cui`.
umls_df <- dplyr::filter(domap_df, resource == 'UMLS')
umls_df <- dplyr::select(umls_df, -resource)
umls_df <- dplyr::rename(umls_df, disease_cui = resource_id)

# DO to ICD9: same shape, identifier exposed as `disease_icd9`.
icd9_df <- dplyr::filter(domap_df, resource == 'ICD9')
icd9_df <- dplyr::select(icd9_df, -resource)
icd9_df <- dplyr::rename(icd9_df, disease_icd9 = resource_id)

# DO to OMIM: same shape, identifier exposed as `disease_omim` and coerced
# to integer.
# NOTE(review): presumably the integer cast matches the type of the OMIM
# column this table is later joined on - confirm.
omim_df <- dplyr::filter(domap_df, resource == 'OMIM')
omim_df <- dplyr::select(omim_df, -resource)
omim_df <- dplyr::rename(omim_df, disease_omim = resource_id)
omim_df <- dplyr::mutate(omim_df, disease_omim = as.integer(disease_omim))
```
## Read and map labeledin
```{r}
# Load the LabeledIn indications, forcing `rxnorm_id` to be read as text.
labin_df <- readr::read_tsv(
  file.path('labeledin', 'data', 'indications.tsv'),
  col_types = list(rxnorm_id = readr::col_character())
)

# Coerce the identifier to integer. Combo-drug identifiers do not convert,
# become NA, and are removed by the filter that follows.
labin_df <- dplyr::mutate(labin_df, rxnorm_id = as.integer(rxnorm_id))
labin_df <- dplyr::filter(labin_df, !is.na(rxnorm_id))

# Map UMLS diseases to DO, then RxNorm compounds to DrugBank. Both joins
# use whatever column names the two sides share.
labin_df <- dplyr::inner_join(labin_df, umls_df)
labin_df <- dplyr::inner_join(labin_df, rxnorm_df)
```
## Read and map MEDI
```{r}
# Load MEDI and attach the DrugBank mapping through `rxnorm_df`.
medi_df <- dplyr::inner_join(
  readr::read_tsv(file.path('medi', 'data', 'medi-umls.tsv')),
  rxnorm_df
)

# Map MEDI diseases to DO twice, once through the UMLS cross-references
# and once through the ICD9 cross-references, and stack the two results
# (UMLS matches first).
medi_df <- dplyr::bind_rows(
  dplyr::inner_join(umls_df, medi_df),
  dplyr::inner_join(icd9_df, medi_df)
)
```
## Read and map PREDICT
```{r}
# Load the PREDICT indications and rename its disease identifier columns
# to the names used by the DO cross-reference tables.
predict_df <- dplyr::rename(
  readr::read_tsv(file.path('msb-predict', 'data', 'indications-umls.tsv')),
  disease_cui = umls_cui,
  disease_omim = omim_id
)

# Map PREDICT diseases to DO through the UMLS and the OMIM
# cross-references and stack the two results (UMLS matches first).
predict_df <- dplyr::bind_rows(
  dplyr::inner_join(umls_df, predict_df),
  dplyr::inner_join(omim_df, predict_df)
)
```
## Read and map ehrlink
```{r}
# Load the ehrlink indications, expose `ingredient_rxcui` as `rxnorm_id`,
# and inner-join with `rxnorm_df` on the columns the two tables share.
ehrlink_df <- readr::read_tsv(file.path('ehrlink', 'data', 'indications.tsv'))
ehrlink_df <- dplyr::rename(ehrlink_df, rxnorm_id = ingredient_rxcui)
ehrlink_df <- dplyr::inner_join(ehrlink_df, rxnorm_df)

# Disabled code, kept for reference. It uses dotted object names
# (`ehrlink.df`, `doslim.df`, `rxnorm.df`) that are not assigned anywhere
# in the visible code.
# ehrlink.df <- ehrlink.df %>%
# dplyr::rename(subsumed_id = doid_code, subsumed_name = doid_name) %>%
# dplyr::inner_join(
# doslim.df %>% dplyr::transmute(subsumed_id, doid_code = slim_id, doid_name = slim_name)
# ) %>%
# dplyr::inner_join(rxnorm.df %>% dplyr::rename(ingredient_rxcui = rxnorm_id))
```
## Join resources
```{r}
# Stack the (disease, compound) pairs of all four resources into one table
# with columns `doid_code`, `drugbank_id` and `resource`, sorted by those
# three columns. local() keeps the helper and intermediates out of the
# global environment.
indication_df <- local({
  # Reduce a mapped resource to its distinct (doid_code, drugbank_id)
  # pairs and tag each pair with the resource label.
  tag_pairs <- function(df, label) {
    pairs <- dplyr::distinct(dplyr::select(df, doid_code, drugbank_id))
    dplyr::mutate(pairs, resource = label)
  }

  # MEDI: one row per pair, labelled 'medi_hps' when the largest `hps`
  # value among the pair's rows is non-zero and 'medi_lps' otherwise.
  medi_pairs <- dplyr::ungroup(
    dplyr::summarize(
      dplyr::group_by(medi_df, doid_code, drugbank_id),
      resource = ifelse(max(hps), 'medi_hps', 'medi_lps')
    )
  )

  stacked <- dplyr::bind_rows(
    # LabeledIn
    tag_pairs(labin_df, 'labeledin'),
    # MEDI
    medi_pairs,
    # PREDICT
    tag_pairs(predict_df, 'predict'),
    # ehrlink
    tag_pairs(ehrlink_df, 'ehrlink')
  )

  dplyr::arrange(stacked, doid_code, drugbank_id, resource)
})
# Add compound and disease names.
# The join keys are spelled out: `drugbank_df` shares only `drugbank_id`
# with `indication_df`, and `do_df` shares only `doid_code`.
# The inner join drops pairs whose compound is absent from `drugbank_df`;
# the left join keeps pairs whose disease has no row in `do_df`, leaving
# `doid_name` as NA for them.
indication_df <- indication_df %>%
  dplyr::inner_join(drugbank_df, by = 'drugbank_id') %>%
  dplyr::left_join(do_df, by = 'doid_code')
# Save the verbose (one row per pair and resource) table.
indication_df %>% readr::write_tsv(file.path('data', 'indications-verbose.tsv'))
# Render as an interactive table; FALSE is spelled out because `F` is an
# ordinary variable that can be reassigned.
indication_df %>% DT::datatable(rownames = FALSE)
```
```{r}
# Restrict the indications to DrugBank Slim compounds and to diseases that
# appear among the DO Slim subsumed terms.
indication_slim_df <- dplyr::filter(
  indication_df,
  drugbank_id %in% drugbank_slim,
  doid_code %in% do_slim
)

# The original disease becomes the "subsumed" term of the slim mapping.
indication_slim_df <- dplyr::rename(
  indication_slim_df,
  do_subsumed_id = doid_code,
  do_subsumed_name = doid_name
)

# Attach the slim term each subsumed term propagates to. The join uses the
# column names shared by both sides.
indication_slim_df <- dplyr::inner_join(
  indication_slim_df,
  dplyr::transmute(
    doslim_map_df,
    do_slim_id = slim_id,
    do_slim_name = slim_name,
    do_subsumed_id = subsumed_id
  )
)

# Fix the column order and sort the rows.
indication_slim_df <- dplyr::select(
  indication_slim_df,
  drugbank_id, drugbank_name, do_slim_id, do_slim_name,
  do_subsumed_id, do_subsumed_name, resource
)
indication_slim_df <- dplyr::arrange(
  indication_slim_df,
  drugbank_id, do_slim_id, do_subsumed_id, resource
)

# save sourced indications
readr::write_tsv(indication_slim_df, file.path('data', 'indications-slim-verbose.tsv'))
```
## Convert to a single row per compound-disease pair. Include only high confidence indications
<a name="indication-table"></a>
```{r}
# Resource labels treated as high confidence; 'medi_lps' is the only label
# produced upstream that is left out.
hc_sources <- c('medi_hps', 'ehrlink', 'labeledin', 'predict')
# Collapse to one row per compound / slim-disease pair.
#   distinct_resources: number of different resource labels backing the pair
#   total_resources:    number of rows backing the pair (a resource counts
#                       once per subsumed term it was reported for)
collapsed_df <- indication_slim_df %>%
  dplyr::filter(resource %in% hc_sources) %>%
  dplyr::rename(doid_id = do_slim_id, doid_name = do_slim_name) %>%
  dplyr::group_by(drugbank_id, drugbank_name, doid_id, doid_name) %>%
  dplyr::summarize(
    distinct_resources = n_distinct(resource),
    total_resources = n()
  ) %>%
  dplyr::ungroup()
# save
collapsed_df %>% readr::write_tsv(file.path('data', 'indications-slim-collapsed.tsv'))
# FALSE is spelled out because `F` is an ordinary, reassignable variable.
collapsed_df %>% DT::datatable(rownames = FALSE)
```
## Indications per disease
```{r}
# Count the collapsed indications of each disease and show the counts as
# an interactive table. The result is ungrouped before display so that no
# residual grouping is carried into the widget, and FALSE is spelled out
# because `F` is an ordinary, reassignable variable.
collapsed_df %>%
  dplyr::group_by(doid_id, doid_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  dplyr::ungroup() %>%
  DT::datatable(rownames = FALSE)
```
## Indications per compound
```{r}
# Count the collapsed indications of each compound and show the counts as
# an interactive table. The result is ungrouped before display so that no
# residual grouping is carried into the widget, and FALSE is spelled out
# because `F` is an ordinary, reassignable variable.
collapsed_df %>%
  dplyr::group_by(drugbank_id, drugbank_name) %>%
  dplyr::summarize(
    n_indications = n()
  ) %>%
  dplyr::ungroup() %>%
  DT::datatable(rownames = FALSE)
```
## Create a dataset for the curators
```{r}
# Build the curation template: one row per collapsed indication with empty
# `classification` and `notes` columns for the curators to fill in, plus
# links to the compound and disease pages.
curation_df <- collapsed_df %>%
  dplyr::transmute(
    drug = drugbank_name,
    disease = doid_name,
    classification = '',
    notes = '',
    drug_url = paste0('http://www.drugbank.ca/drugs/', drugbank_id),
    # Percent-encode the first colon of the identifier; fixed = TRUE
    # matches ':' literally instead of compiling it as a regex.
    disease_url = paste0(
      'http://www.disease-ontology.org/term/',
      sub(':', '%3A', doid_id, fixed = TRUE)
    )
  ) %>%
  dplyr::arrange(drug, disease)
curation_df %>% readr::write_tsv(file.path('curation', 'template', 'template.tsv'))

# Pilot set: a reproducible random sample of at most 50 indications.
# The size is capped at the number of available rows because sample_n()
# raises an error when asked for more rows than the table holds.
set.seed(0)
curation_df %>%
  dplyr::sample_n(min(50, nrow(curation_df))) %>%
  dplyr::arrange(drug, disease) %>%
  readr::write_tsv(file.path('curation', 'pilot', 'curation.tsv'))
```