-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_countries.R
243 lines (150 loc) · 8.6 KB
/
get_countries.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
library(parzer)
library(sf)
get_countries <- function(useful_md, tdwg){
#Come up with a single, consensus lat/lon
#First, take decimal lat/long where possible
useful_md %>%
mutate(assigned_lat = case_when(!is.na(`Latitude (decimal degrees)`) ~ `Latitude (decimal degrees)`),
assigned_lon = case_when(!is.na(`Longitude (decimal degrees)`) ~ `Longitude (decimal degrees)`))%>%
filter(!is.na(assigned_lon) & !is.na(assigned_lat))%>%
mutate(assigned_lat = as.numeric(assigned_lat),
assigned_lon = as.numeric(assigned_lon))-> useful_md_working
#check that all longitude grid observations are a subset of longitude
if(any(which(!is.na(useful_md$`Longitude grid`) & is.na(useful_md$Longitude)))){stop("write more code")}
#assign any convenient lat/lon values (ignore the dms format for now)
next_batch <-
useful_md%>%
filter(!ObservationID %in% useful_md_working$ObservationID)
next_batch %>%
mutate(assigned_lat = case_when(!is.na(as.numeric(Latitude)) ~ as.numeric(Latitude))) %>%
mutate(assigned_lon = case_when(!is.na(as.numeric(Longitude)) ~ as.numeric(Longitude))) %>%
filter(!is.na(assigned_lon) & !is.na(assigned_lat))-> next_batch
useful_md_working %>%
bind_rows(next_batch) -> useful_md_working
#handle the harder stuff using a parser
next_batch <-
useful_md%>%
filter(!ObservationID %in% useful_md_working$ObservationID)
next_batch$parse_lat <-
parzer::parse_lat(lat = next_batch$Latitude)
next_batch$parse_lon <-
parzer::parse_lon(lon = next_batch$Longitude)
next_batch %>%
mutate(assigned_lat = case_when(!is.na(parse_lat) ~ as.numeric(parse_lat))) %>%
mutate(assigned_lon = case_when(!is.na(parse_lon) ~ as.numeric(parse_lon))) %>%
filter(!is.na(assigned_lon) & !is.na(assigned_lat))-> next_batch
useful_md_working %>%
bind_rows(next_batch %>%
dplyr::select(colnames(useful_md_working))) -> useful_md_working
#give up on anything without lat/long
next_batch <-
useful_md%>%
filter(!ObservationID %in% useful_md_working$ObservationID)
next_batch %>%
filter(is.na(Latitude) | is.na(Longitude))%>%
mutate(assigned_lat = NA,
assigned_lon = NA) -> next_batch
useful_md_working %>%
bind_rows(next_batch %>%
dplyr::select(colnames(useful_md_working))) -> useful_md_working
#whats left: anomalously formatted lat/lons
next_batch <-
useful_md %>%
filter(!ObservationID %in% useful_md_working$ObservationID)
message(round(nrow(next_batch)/nrow(useful_md)*100,digits = 2), " percent of MD being tossed due to unparsable lat/long")
message("==", round(nrow(next_batch)/length(which(!is.na(useful_md$Latitude) & !is.na(useful_md$Longitude)))*100,digits = 2), " percent of MD with coordinates being tossed due to unparsable lat/long")
# Now, assign botanical country to places with coordinates
useful_md_working %>%
dplyr::select(assigned_lat, assigned_lon) %>%
filter(!is.na(assigned_lat) & !is.na(assigned_lon)) %>%
unique()%>%
st_as_sf(coords = c("assigned_lon","assigned_lat"),
crs = st_crs("WGS84"),
remove = FALSE) -> point_data
tdwg_countries <- st_intersection(x = point_data,
y = st_make_valid(tdwg))
tdwg_countries %>%
st_set_geometry(NULL) -> tdwg_countries
useful_md_working %>%
left_join(y = tdwg_countries) -> useful_md_working
rm(tdwg_countries, point_data)
gc()
# use GNRS to clean both the the tdwg and TRY country/state names and try to match them up
library(GNRS)
useful_md_working %>%
dplyr::select(`Location Country`,`Location: state`) %>%
unique() -> try_countries
GNRSed_TRY_countries <-
GNRS_super_simple(country = try_countries$`Location Country`,
state_province = try_countries$`Location: state`)
GNRSed_TRY_countries %>%
filter(!is.na(country)) %>%
filter(country != "") %>%
dplyr::select(country_verbatim, state_province_verbatim, country,state_province) -> GNRSed_TRY_countries
#First, we'll try to match TDWG level names
gnrsed_bot_countries <- GNRS_super_simple(country = tdwg$LEVEL_NAME)
gnrsed_bot_countries %>%
filter(country != "") -> gnrsed_bot_countries
gnrsed_bot_countries <-
gnrsed_bot_countries %>%
filter(match_score_country > .95 | #keep anything with a high country match score
grepl(pattern = "Island",x = country)) %>% #keep islands (the abbreviation hurts the score)
filter(match_score_state_province == "") #this filter excludes NSW from showing up as Wales
#Next, try to get any that correspond to states
gnrsed_bot_countries_v2 <- GNRS_super_simple(country = tdwg$REGION_NAM,state_province = tdwg$LEVEL_NAME)
gnrsed_bot_countries_v2 <-
gnrsed_bot_countries_v2 %>%
filter(!user_id %in% gnrsed_bot_countries$user_id) %>% #toss anything already matched
filter(country != "" & state_province != "") %>% #toss anything that wasn't resolved
filter(!country_verbatim %in% c("Southern Africa")) %>% #erroneously resolved to South Africa
filter(!country %in% c("Mexico")) #toss mexico (erroneously resolves states)
gnrsed_bot_countries %>%
bind_rows(gnrsed_bot_countries_v2) -> gnrsed_bot_countries_total
rm(gnrsed_bot_countries,gnrsed_bot_countries_v2)
#Now merge the TRY countries to the TDWG countries to create a name key
GNRSed_TRY_countries %>%
rename(TRY_country_name = country_verbatim,
TRY_state_name = state_province_verbatim) %>%
merge(y = gnrsed_bot_countries_total %>% rename(TDWG_country_name = country_verbatim, TDWG_state_name = state_province_verbatim ),
by = c("country","state_province") ) -> name_key
rm(GNRSed_TRY_countries, gnrsed_bot_countries_total)
#Merge the useful fields (TDWG_country_name) of the name key to the useful_md_working
name_key %>%
dplyr::select(TRY_country_name,
TRY_state_name,
TDWG_country_name,
TDWG_state_name) %>%
right_join(y = useful_md %>% mutate(`Location: state` = replace_na(`Location: state`,"")),
by = c("TRY_country_name" = "Location Country",
"TRY_state_name" = "Location: state"),
keep=TRUE) -> useful_md_plus
# Merge the useful fields of the extracted bits to the full dataset
useful_md_working %>%
rename("TDWG_country_imputed" = "LEVEL_NAME") %>%
dplyr::select("ObservationID", "TDWG_country_imputed")%>%
right_join(y = useful_md_plus,
by = "ObservationID") -> useful_md_working
rm(useful_md_plus)
# Where we can infer country from multiple sources, how many are accurate?
message("\n ",round(length(which(useful_md_working$TDWG_country_imputed != useful_md_working$TDWG_country_name))/
length(which(!is.na(useful_md_working$TDWG_country_imputed) & !is.na(useful_md_working$TDWG_country_name))),digits = 2)*100,
" percent of countries inferrable from both points and declared country do not match, tossing these")
useful_md_working %>%
filter(is.na(TDWG_country_imputed) | is.na(TDWG_country_name)|
TDWG_country_imputed == TDWG_country_name) -> useful_md_working
#Toss anything where we couldn't infer a country
useful_md_working %>%
filter(!is.na(TDWG_country_imputed) | !is.na(TDWG_country_name)) -> useful_md_working
#Now, assign a single country to the md
useful_md_working %>%
dplyr::select(ObservationID, TDWG_country_imputed, TDWG_country_name) -> useful_md_working
useful_md_working %>%
mutate(LEVEL_NAME = case_when(!is.na(TDWG_country_imputed) ~ TDWG_country_imputed,
!is.na(TDWG_country_name) & is.na(TDWG_country_imputed) ~ TDWG_country_name)) %>%
dplyr::select(ObservationID, LEVEL_NAME) -> useful_md_working
# Append the assigned country to the md and return
useful_md %>%
left_join(y = useful_md_working,
by = "ObservationID")-> useful_md
return(useful_md)
}