-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path0_replicate_phylo_and_range_shortfalls.R
391 lines (257 loc) · 17.9 KB
/
0_replicate_phylo_and_range_shortfalls.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
# Code taken from https://github.com/pebgroup/Distribution_of_phylogenetic_data/blob/master/03_phylo_know_analysis_2022.R and variable locations changed
# Note also that use the WCVP as opposed to the WCSP which was used in Rudbeck et al 2022. However, the format is the same and so the code is largely unchanged.
library(data.table)
library(rgdal)
library(tidyr)
wcsp.names <- fread("manual_downloads/WCVP/wcvp_names.txt", quote = "") # WCVP names file
wcsp.distribution <- fread("manual_downloads/WCVP/wcvp_distribution.txt", quote = "") # WCVP distribution file
# Exclude fern and moss families
ferns <- fread("manual_downloads/Darwinian_shortfalls/fern_list.txt", quote = "", header = F, col.names = "family") # list of ferns
moss <- fread("manual_downloads/Darwinian_shortfalls/bryophyta_list.txt", quote = "", header = F, col.names = "family") # list of mosses
wcsp.names <- subset(wcsp.names, !family %in% ferns$family)
wcsp.names <- subset(wcsp.names, !family %in% moss$family)
# Include only accepted entries
wcsp.names.species <- wcsp.names[grepl("Accepted", wcsp.names$taxon_status),]
# Synonymize plant name IDs in distribution data to use accepted plant name IDs, and subset to taxa with accepted plant name IDs
# Merge with accepted names:
wcsp.distribution <- merge(wcsp.distribution, wcsp.names.species[,c("accepted_plant_name_id", "plant_name_id", "taxon_rank")],
by="plant_name_id", all.x=TRUE)
# Remove all entries with no accepted plant name ID
wcsp.dist.subset <- wcsp.distribution[!is.na(wcsp.distribution$accepted_plant_name_id),]
# Remove subsp., var., f. etc that was introduced to the data after synonymizing from distribution data
wcsp.dist.subset <- wcsp.dist.subset[wcsp.dist.subset$taxon_rank == "Species", ]
# Remove extinct and introduced species
wcsp.dist.subset <- subset(wcsp.dist.subset, wcsp.dist.subset$extinct == 0)
wcsp.dist.subset <- subset(wcsp.dist.subset, wcsp.dist.subset$introduced == 0)
# Remove or correct mistaken Area codes
shape <- readOGR("level3.shp")
shape <- readOGR("manual_downloads/Darwinian_shortfalls/level3.shp")
wcsp.dist.subset$area_code_l3 = toupper(wcsp.dist.subset$area_code_l3)
wcsp.dist.subset <- wcsp.dist.subset[wcsp.dist.subset$area_code_l3 %in% shape$LEVEL_3_CO,]
# produce table of WCSP ID's and corresponding names
name.id <- wcsp.names[grepl("Accepted", wcsp.names$taxon_status),]
name.id <- name.id[!(name.id$species == ""), ]
name.id <- subset(name.id, select = c(plant_name_id, taxon_name))
# Replace WCSP ID's with accepted names using the ID and Names object
wcsp.dist.subset$accepted_plant_name <- name.id$taxon_name[match(wcsp.dist.subset$accepted_plant_name_id, name.id$plant_name_id)]
wcsp.dist.subset <- na.omit(wcsp.dist.subset)
# Replace "Ã-" with "x" to indicate accepted hybrids, as this is the format used in GenBank
wcsp.dist.subset <- as.data.frame(sapply(wcsp.dist.subset, gsub, pattern = "Ã-", replacement = "x"))
# ---- Format the WCSP subset into a dataframe that includes L3 areas as columns ---- #
# Create vector of unique L3 area codes to use in the loop
l3.vector <- c(as.character(unique(wcsp.dist.subset$area_code_l3)))
# Create an empty list to fill
area.list <- list()
# Loop and search for all species associated with an l3 region and arrange them in a list
for (i in l3.vector){
inventory <- wcsp.dist.subset$accepted_plant_name[wcsp.dist.subset$area_code_l3 == i]
area.list[[i]] <- inventory
}
# Convert the list into the final dataframe format, which will be used in the "diversity_variables" script, where the response variable and biodiversity-related explanatory variables will be produced.
list.equal.length <- lapply(area.list, 'length<-', max(lengths(area.list)))
wcsp.df <- as.data.frame(list.equal.length) # The table is written in the end of the script
# ---- Create a table of synonyms and their accepted names to be used to translate synonyms from Genbank ---- #
# Retrive records for all not-accepted names and remove entries at genus level
wcsp.synonyms <- wcsp.names[!grepl("Accepted", wcsp.names$taxon_status),]
wcsp.synonyms <- wcsp.synonyms[!(wcsp.synonyms$species == ""), ]
# Combine columns: Genus, Genus hybrid marker, Species hybrid marker, Species, Infraspecific rank and infraspecific epithet to producea new column with the full synonym
wcsp.synonyms <- replace(wcsp.synonyms, wcsp.synonyms == "", NA)
# Remove irrelvant columns
wcsp.synonyms <- subset(wcsp.synonyms, select = c(taxon_name,accepted_plant_name_id))
# Substitute plant ID with plant name from "Names and Id's"
wcsp.synonyms$accepted_plant_name_id <- name.id$taxon_name[match(wcsp.synonyms$accepted_plant_name_id, name.id$plant_name_id)]
wcsp.synonyms <- na.omit(wcsp.synonyms)
# Remove cases where synonym = accepted name to reduce computation time during translation
# Note: These cases occur when the only difference between synonym and accepted name is the "author" columns. This is never included in the name on GenBank, so it is safe to remove these)
wcsp.synonyms <- wcsp.synonyms[wcsp.synonyms$taxon_name != wcsp.synonyms$accepted_plant_name_id,]
# Remove duplicated synonyms
# Note: Again the "author" columns is the culprit. These always refer to the same accepted name, so one is expandable to reduce computation time.
wcsp.synonyms <- wcsp.synonyms[!duplicated(wcsp.synonyms$taxon_name),]
# Remove cases where a synonym is the accepted name of another species.
# Note: This one is complicated. Several species names exists as both synonym and accepted name, but for different species. This is, again, due to differences in the author extension to the synonym, which is not included here, as it does not occur on Genbank. -
# - Including these cases would result in the possible translation of an accepted name into another accepted name, which is undesirable
wcsp.synonyms <- wcsp.synonyms[!(wcsp.synonyms$taxon_name %in% intersect(wcsp.synonyms$accepted_plant_name_id, wcsp.synonyms$taxon_name)),]
# Translate "Ã-" to "x" to adapt to the GenBank download
wcsp.synonyms <- as.data.frame(sapply(wcsp.synonyms, gsub, pattern = "Ã-", replacement = "x"))
# Remove cases where the accepted name is "Unplaced Unplaced"
wcsp.synonyms <- wcsp.synonyms[wcsp.synonyms$accepted_plant_name_id != "Unplaced Unplaced",]
# The final synonym translator file can be written in the end of the script
# ---- Write tables for further use ---- #
# Write a table that includes all accepted species and their ID's as metadata (ID's and Names)
write.table(name.id,"data\\ID_and_Names_2022.csv", sep = ",", row.names = F, quote = F)
# Write a table with the relevant subset of the WCSP distribution data as useful metadata (WCSP long format)
write.table(wcsp.dist.subset,"data\\wcsp_long_2022.csv", sep = ",", row.names = F, col.names = T, quote = F)
# Write a table with the relevant subset of the WCSP distribution data in the format necessary for further analysis (WCSP wide format)
write.table(wcsp.df,"data\\wcsp_wide_2022.csv", sep = ",", row.names = F, col.names = T, quote = F)
# Write a table with synonyms and their accepted names to be used to translate the GenBank download into only accepted names (Synonym translator)
write.table(wcsp.synonyms,"data\\synonym_translator_2022.csv", sep = ",", row.names = F, col.names = T, quote = F)
# ---- Additional information of interest included in the paper ---- #
# The total number of species with distribution data according to WCVP
length(unique(wcsp.dist.subset$accepted_plant_name))
# The total number of species in total according to WCVP
wcsp.total <- wcsp.names.species[wcsp.names.species$taxon_rank == "Species", ]
length(unique(wcsp.total$accepted_plant_name_id))
# % of total number of species with distribution data available
length(unique(wcsp.dist.subset$accepted_plant_name))/length(unique(wcsp.total$accepted_plant_name_id))
# Biodiversity variable creation script
#
# This script uses three datasets produced in the "wcvp_subset_2021" script, two subsets of the GenBank download produced in "genbank_download.R, and BIEN data:
# WCVP:
# 1) ID's and Names
# 2) WCVP distribution data in wide format with countries as columns
# 3) WCVP distribution data in a long format with two columns: species and botanical country
# GenBank:
# 4) a list of species that have at least ONE relevant sequence available
# 5) a list of species that preserves duplicate species if they appear with different molecular markers
# BIEN:
# 6) a list of species with BIEN occurrences for each botanical country, translated into WCSP ID's (BIEN_in_WCSP_regions_Sept21_2020download.RData)
# Botanical countries:
# 7) a table of Level 3 botanical counrty codes and their area size
#
# This script produces multiple explanatory variables related to biodiversity and four response variables at the scale of botanical contries, which is all gathered in one table. These variables include:
# 1) Total number of species with relevent molecular data on GenBank (response)
# 2) Inventory completeness of flora with at least one entry with relevant sequence data on GenBank (response)
# 3) Inventory completeness of flora including all relevant molecular data on GenBank (response)
# 4) Inventory completeness of flora with a record occurrence in BIEN (response)
# 5) Species richness
# 6) Mean number of countries a species of a country is present in
# 7) Median number of countries a species of a country is present in
# 8) Mean range size of species
# 9) Total number of endemic species
# 10) Endemics relative to species richness
# 11) Proporption of endemics sequenced
# Total runtime: ~5 minutes
# ---- Read data from Genbank, WCVP and BIEN ---- #
library(data.table)
gen.data <- fread("manual_downloads/Darwinian_shortfalls/genbank_entries_2022.csv", header = T, quote = "", sep = NULL) # As produced in the genbank_download.R script
load("manual_downloads/Darwinian_shortfalls/BIEN_in_WCSP_regions_sep2021.RData") # R workspace with objects "spec.list" and "res". Spec.list is a list of all L3 regions with the species recorded there in BIEN, translated to WCSP ID's. Res is a df with the lengths of the elements of spec.list.
L3.area <- fread("manual_downloads/Darwinian_shortfalls/L3_and_area.csv", header = T) # Botanical countries and their area size
gen.data.duplicate <- fread("manual_downloads/Darwinian_shortfalls/genbank_entries_w_duplicates_2022.csv", quote = "", header = T, sep = NULL) # As produced in the genbank_download.R script
wcsp.data <- fread("data/wcsp_wide_2022.csv", header = T) # As produced in the wcvp_subset_2021.R script
name.id <- fread("data\\ID_and_Names_2022.csv", header = T) # As produced in the wcvp_subset_2021.R script
wcsp.long <- fread("data\\wcsp_long_2022.csv", quote = "") # As produced in the wcvp_subset_2021.R script
# ---- Prepare vectors and dataframes for the variable creation loops ---- #
# Create vector of area codes and taxa with genetic data
area.vector <- c(colnames(wcsp.data))
genbank.vector <- c(as.character(gen.data$species))
genbank.vector.duplicate <- c(as.character(gen.data.duplicate$species))
# Create dataframe with data for distribution range
dist.range <- table(unlist(wcsp.data))
dist.range.df <- data.frame(dist.range)
dist.range.df.endemics <- dist.range.df[dist.range.df$Freq == 1,]
# Create dataframe of species and area, aggregate and sum it, and create a wide-style dataframe with sqkm replacing species.
wcsp.long$area_sq <- L3.area$AREA_SQKM[match(wcsp.long$area_code_l3, L3.area$LEVEL_3_CO)]
L3.area.only <- subset(wcsp.long, select = c("accepted_plant_name" , "area_sq"))
L3.area.sum <- aggregate(. ~ accepted_plant_name, L3.area.only, sum)
L3.area.sum <- as.data.frame(L3.area.sum)
wide.sq <- wcsp.data
wide.sq[] <- lapply(wcsp.data, function(x) L3.area.sum$area_sq[match(x, L3.area.sum$accepted_plant_name)])
# Convert spec.list and wcsp.data into df, as the loop can't handle it as a list
spec.list.equal.length <- lapply(spec.list, 'length<-', max(lengths(spec.list)))
spec.df <- as.data.frame(spec.list.equal.length)
wcsp.data <- as.data.frame(wcsp.data)
wide.sq <- as.data.frame(wide.sq)
# Create vectors to store results
result.list.total <- c()
result.list.relative <- c()
result.list.diversity <- c()
result.list.distrange.mean <- c()
result.list.distrange.median <- c()
result.list.range.area <- c()
result.list.endemism.total <- c()
result.list.endemism.relative <- c()
result.list.bien <- c()
result.list.endemics.list <- c()
result.list.endemics.seq <- c()
result.list.gen.dup <- c()
# ---- Fill vectors with data for desired variables ---- #
# Number of species in each region with genetic data available (Phylogenetic effort)
for (i in area.vector){
vector01 <- c(as.character(wcsp.data[,i]))
result.list.total[[i]] <- length(intersect(genbank.vector, vector01))
}
# Number of species with genetic data relative to total number of species in each region (Phylogenetic knowledge, main response variable)
for (i2 in area.vector){
vector02 <- c(as.character(wcsp.data[,i2]))
result.list.relative[[i2]] <- length(intersect(genbank.vector, vector02))/length(vector02[!is.na(vector02)])
}
# Total species richness
for (i3 in area.vector){
vector03 <- c(as.character(wcsp.data[,i3]))
result.list.diversity[[i3]] <- length(vector03[!is.na(vector03)])
}
# Mean number of regions in which a species occur for each region (mean range)
for (i4 in area.vector){
vector04 <- c(na.omit(as.character(wcsp.data[,i4])))
result.list.distrange.mean[[i4]] <- mean(dist.range.df$Freq[match(vector04, dist.range.df$Var1)])
}
# Median number of regions in which a species occur for each region (median range)
for (i5 in area.vector){
vector05 <- c(na.omit(as.character(wcsp.data[,i5])))
result.list.distrange.median[[i5]] <- median(dist.range.df$Freq[match(vector05, dist.range.df$Var1)])
}
# Total area of the regions in which a species occur (range area)
for (i8 in area.vector){
vector08 <- c(na.omit(as.numeric(wide.sq[,i8])))
result.list.range.area[[i8]] <- mean(vector08)
}
# Number of species that only occur in the specific region (total endemics)
for (i6 in area.vector){
vector06 <- c(na.omit(as.character(wcsp.data[,i6])))
result.list.endemism.total[[i6]] <- sum((dist.range.df$Freq[match(vector06, dist.range.df$Var1)]) == 1)
}
# Number of endemics relative to total number of species in the region (endemics relative to SR)
for (i7 in area.vector){
vector07 <- c(na.omit(as.character(wcsp.data[,i7])))
result.list.endemism.relative[[i7]] <- (sum((dist.range.df$Freq[match(vector07, dist.range.df$Var1)]) == 1))/length(dist.range.df$Freq[match(vector07, dist.range.df$Var1)])
}
# Number of species with available BIEN data in a region relative to SR (Distribution Knowledge)
for (i in area.vector){
wcsp.acc.name <- name.id$taxon_name[match(spec.df[,i], name.id$plant_name_id)]
wcsp.total <- wcsp.data[,i]
#result.list.bien[[i]] <- length(intersect(wcsp.acc.name, wcsp.total))/length(wcsp.total[!is.na(wcsp.total)])#version from Rudbeck et al.
result.list.bien[[i]] <- length(na.omit(intersect(wcsp.acc.name, wcsp.total)))/length(wcsp.total[!is.na(wcsp.total)]) #corrected version
}
# List of all species that are endemic to a single botanical country
for (i9 in area.vector){
vector09 <- c(na.omit(as.character(wcsp.data[,i9])))
result.list.endemics.list[[i9]] <- dist.range.df.endemics$Var1[match(vector09, dist.range.df.endemics$Var1)]
}
# Format the list created above to become fit for looping
endemism.equal.length <- lapply(result.list.endemics.list, 'length<-', max(lengths(result.list.endemics.list)))
endemism.as.df <- as.data.frame(endemism.equal.length)
# % of endemic species sequenced
for (i10 in area.vector){
vector10 <- c(as.character(endemism.as.df[,i10]))
result.list.endemics.seq[[i10]] <- length(intersect(genbank.vector, vector10))/length(vector10[!is.na(vector10)])
}
# Inventory completeness of phylogenetic knowledge, when completeness is achieved by having available sequences for all 128 relevant markers for each species
for (i11 in area.vector){
vector11 <- c(as.character(wcsp.data[,i11]))
result.list.gen.dup[[i11]] <- length(genbank.vector.duplicate[genbank.vector.duplicate %in% intersect(genbank.vector.duplicate, vector11)])/((length(vector11[!is.na(vector11)]))*128)
}
# Aggregate results into table
# convert vectors to df's
total.df <- data.frame(TOTAL=unlist(result.list.total),
RELATIVE=unlist(result.list.relative),
RICHNESS=unlist(result.list.diversity),
RANGE_MEAN=unlist(result.list.distrange.mean),
RANGE_MEDIAN=unlist(result.list.distrange.median),
RANGE_AREA=unlist(result.list.range.area),
ENDEMISM_T=unlist(result.list.endemism.total),
ENDEMISM_R=unlist(result.list.endemism.relative),
BIEN_OCCUR=unlist(result.list.bien),
ENDEMIC_SEQ=unlist(result.list.endemics.seq),
GEN_DUP=unlist(result.list.gen.dup)
)
total.df <- data.frame(LEVEL_3_CO = row.names(total.df), total.df)
# ---- Create table to be used for logistic regression of the sequencing status of species (1 = sequenced, 0 = not sequenced) on their range size, measured as the sum of the areas (in km^2) of all the botanical countries a species occurs in (logit.Rmd)
L3.area.sum$sequenced <- ifelse(L3.area.sum$accepted_plant_name %in% gen.data$species, 1, 0)
# ---- Write results to be mapped and used in the statistical analysis ---- #
write.table(total.df,"data\\variables_2022.csv",sep = ",", col.names = T, row.names = F, quote = F)
write.table(endemics.seq.df,"data\\endemic_completeness_2022.csv",sep = ",", col.names = F, row.names = T, quote = F)
write.table(L3.area.sum,"data\\seq_area_2022.csv", sep = ",", col.names = T, row.names = F, quote = F)
# ---- Additional information of interest included in the paper ---- #
# % of species with phylogentically relevant data
length(intersect(unique(wcsp.long$accepted_plant_name) , unique(gen.data$species))) # total number
length(intersect(unique(wcsp.long$accepted_plant_name) , unique(gen.data$species)))/length(unique(wcsp.long$accepted_plant_name)) # %