-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathPSSSClean.R
82 lines (61 loc) · 4.84 KB
/
PSSSClean.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
#Code used to clean the PSSS and prepare it for analysis
#Created by Danielle Ethier
#December 2024
in.PSSS <- read.csv("Data/PSSS.csv") # reads in back-up copy of database
in.PSSS$SpeciesCode<-as.factor(in.PSSS$SpeciesCode)
in.PSSS <- subset(in.PSSS, !is.na(SpeciesCode))
# filter data by months October to April
in.PSSS <- subset(in.PSSS, MonthCollected %in% c(10:12, 1:4))
# if duplicate records in data file; this keeps only one. There currently are no doublicates.
in.PSSS <- distinct(in.PSSS)
#create day of year column
in.PSSS$doy <- as.numeric(format(as.Date(paste(in.PSSS$YearCollected, in.PSSS$MonthCollected, in.PSSS$DayCollected, sep="-")), "%j"))
# keep only one survey per month by selecting the first survey if there are duplicates. There are currently none.
in.PSSS <- in.PSSS %>% group_by(SurveyAreaIdentifier, YearCollected, MonthCollected) %>% slice_min(doy) %>% ungroup()
# create a new column called wyear, which groups surveys by winter year
# for example, January-April 2005 would be wyear 2004
in.PSSS$wyear <- ifelse(in.PSSS$MonthCollected %in% c(1:4), in.PSSS$YearCollected-1,
in.PSSS$YearCollected)
# Because there are errors in the Duration in Hours column, I will adjust the TimeObservationsEnded and TimeObservationStarted column.
# Specifically, any value that is < 6 AM I will add 12 to bring it into 24 hour time.
# Remove missing TimeObservationsStarted and TimeObservationsEnded
in.PSSS <- in.PSSS %>% filter(!is.na(TimeObservationsStarted) & !is.na(TimeObservationsEnded))
# Remove times with less than 3 nchar
in.PSSS <- in.PSSS %>% filter(nchar(TimeObservationsStarted) > 2 & nchar(TimeObservationsEnded) > 2)
# REmove times with greater tahn 4 nchar
in.PSSS <- in.PSSS %>% filter(nchar(TimeObservationsStarted) < 5 & nchar(TimeObservationsEnded) < 5)
#convert time into decimal hours
in.PSSS <- in.PSSS %>% mutate(DecimalTimeObservationsStarted = (TimeObservationsStarted %/% 100)+((TimeObservationsStarted %% 100)/60), DecimalTimeObservationsEnded = (TimeObservationsEnded %/% 100)+((TimeObservationsEnded %% 100)/60))
# Calculate the duration
in.PSSS$DurationInHours2 <- calculate_duration(in.PSSS$DecimalTimeObservationsStarted, in.PSSS$DecimalTimeObservationsEnded)
# if DurationinHours is negative, we want to swap the survey start time and survey end time as they seem to be reversed.
in.PSSS<- in.PSSS %>% mutate(DecimalTimeObservationsStarted2= ifelse(in.PSSS$DurationInHours2 < 0, DecimalTimeObservationsEnded, DecimalTimeObservationsStarted), DecimalTimeObservationsEnded2= ifelse(in.PSSS$DurationInHours2 < 0, DecimalTimeObservationsStarted, DecimalTimeObservationsEnded))
# recalculate duration in hours using the start and end times
in.PSSS$DurationInHours <- calculate_duration(in.PSSS$DecimalTimeObservationsStarted2, in.PSSS$DecimalTimeObservationsEnded2)
#remove remaining negative DurationInHours
in.PSSS <- in.PSSS %>% filter(DurationInHours >= 0)
#Filter Duration in hours greater than 0.3 and less than 10
in.PSSS<-in.PSSS[in.PSSS$DurationInHours > 0.3 & in.PSSS$DurationInHours < 3,]
# create a new column called wyear, which groups surveys by winter year
# for example, January-April 2005 would be wyear 2004
in.PSSS$wyear <- ifelse(in.PSSS$MonthCollected %in% c(1:4), in.PSSS$YearCollected-1,
in.PSSS$YearCollected)
# filter data by years
in.PSSS <- subset(in.PSSS, wyear >= Y1 & wyear <= Y2)
#Remove ObservationCounts that are NA
in.PSSS <- in.PSSS %>% filter(!is.na(ObservationCount))
#Remove missing SurveyAreaIdentifiers
in.PSSS <- in.PSSS %>% filter(!is.na(SurveyAreaIdentifier))
#Remove missing DecimalLatitude or DecimalLongitude
in.PSSS <- in.PSSS %>% filter(!is.na(DecimalLatitude) & !is.na(DecimalLongitude))
#Filter events to 45.06N to 50.64N latitude and 125.07W to 115.15W longitude
in.PSSS <- in.PSSS %>% filter(DecimalLatitude >= 45.06 & DecimalLatitude <= 50.64 & DecimalLongitude >= -125.07 & DecimalLongitude <= -115.15)
# create an events matrix for future zero filling
event.PSSS <- in.PSSS %>% dplyr::select(ProjectCode, SurveyAreaIdentifier, wyear, YearCollected, MonthCollected, DayCollected, DecimalLatitude, DecimalLongitude, DurationInHours) %>% distinct()
# if there are multiple events in a single day (now caused by Duration in Hours), take the minimum
event.PSSS <- event.PSSS %>% group_by(ProjectCode, SurveyAreaIdentifier, wyear, YearCollected, MonthCollected, DayCollected) %>% slice_min(DurationInHours) %>% ungroup()
# retain columns that are needed for the analysis
in.PSSS <- in.PSSS %>% dplyr::select(ProjectCode, SurveyAreaIdentifier, SpeciesCode, ObservationCount, wyear, YearCollected, MonthCollected, DayCollected)
# write index.data to file
write.csv(in.PSSS, "Data/PSSS.clean.csv")
write.csv(event.PSSS, "Data/PSSS.events.csv")