-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2017-08-18_tidy-verse-dc.R
149 lines (124 loc) · 4.36 KB
/
2017-08-18_tidy-verse-dc.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
##Lesly Lopez
##Tidy data in dc workshop
##08/18/17
## Read in Portal data into R ----
## Attach the tidyverse first so select(), filter() and %>% are available.
## One-time setup if the package is missing: install.packages("tidyverse")
library(tidyverse)

## Survey records; the path is relative to the current working directory.
surveys <- read.csv("data/portal_data_joined.csv")

## Columns: keep only plot_id, species_id and weight (printed, not stored).
surveys %>%
  select(plot_id, species_id, weight)

## Rows: keep only the records where year is 1995 (printed, not stored).
surveys %>%
  filter(year == 1995)
## Pipes: %>% ----
## x %>% g() %>% f() means f(g(x)): the result of each step is handed to the
## next call as its first argument.
## Narrow to the four columns of interest, then keep only the 1995 rows.
surveys1995 <- surveys %>%
  select(year, plot_id, species_id, weight) %>% # select() works on columns
  filter(year == 1995) # filter() works on rows

## Add derived weight columns with mutate() and show the last rows.
surveys %>%
  filter(!is.na(weight)) %>% # drop records with a missing weight
  mutate(weight_kg = weight / 1000) %>% # weight / 1000, labelled as kg
  mutate(weight_kg2 = weight_kg * 2) %>% # a new column can build on an earlier one
  tail()
## Challenge ----
## Build a new data frame from surveys that:
##   * contains only species_id and a new column, hindfoot_half, holding half
##     of hindfoot_length;
##   * has no NAs in hindfoot_half, and only values below 30.
## Tip: names(surveys) or colnames(surveys) lists the available column names.

## Method 1: drop missing lengths first, then derive, threshold and select.
surveys_challenge <- surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(hindfoot_half < 30) %>%
  select(species_id, hindfoot_half)

## Method 2: derive first, then apply both conditions in one filter().
## Comma-separated conditions in filter() are combined with AND, like `&`.
surveys_challenge2 <- surveys %>%
  mutate(hindfoot_half = hindfoot_length / 2) %>%
  filter(!is.na(hindfoot_half), hindfoot_half < 30) %>%
  select(species_id, hindfoot_half)

## Check the result: first rows and dimensions.
surveys_challenge %>% head()
surveys_challenge %>% dim()
## group_by and summarize functions ----
## group_by() defines the groups; summarize() then collapses each group into
## a single row of summary values.
## This is data manipulation: getting a summary out of the raw data.
## See Help -> Cheat sheets -> Data manipulation.

## Mean weight per sex, ignoring missing weights inside mean().
surveys %>%
  group_by(sex) %>%
  summarise(mean_weight = mean(weight, na.rm = TRUE))

## Mean and minimum weight per species and sex. Rows with a missing weight or
## a sex other than "F"/"M" are removed first, so mean() and min() need no
## na.rm here.
surveys %>%
  filter(!is.na(weight)) %>%
  filter(sex %in% c("F", "M")) %>%
  group_by(species_id, sex) %>%
  summarise(
    mean_weight = mean(weight),
    min_weight = min(weight)
  )
## Tally ----
## tally() counts the rows in each group.

## Number of records per sex.
surveys %>%
  group_by(sex) %>%
  tally()

## Challenge
## 1. How many individuals were caught in each plot_type surveyed?
surveys %>%
  group_by(plot_type) %>%
  tally()
## 2. Use group_by() and summarize() to find the mean, min, and
##    max hindfoot length for each species (using species_id).
## Missing lengths are dropped up front, so the summaries need no na.rm.
surveys %>%
  filter(!is.na(hindfoot_length)) %>%
  group_by(species_id) %>%
  summarise(
    mean_hindfoot = mean(hindfoot_length),
    min_hindfoot = min(hindfoot_length),
    max_hindfoot = max(hindfoot_length)
  )
## 3. What was the heaviest animal measured in each year? Return
##    the columns year, genus, species_id, and weight.

## Method 1: remove missing weights, then within each year keep the row(s)
## whose weight equals that year's maximum.
## The NA filter stays a separate step on purpose: max(weight) must only see
## non-missing values, otherwise it would return NA.
surveys_max_weight <- surveys %>%
  filter(!is.na(weight)) %>%
  group_by(year) %>%
  filter(weight == max(weight)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year) # order the result by year

## Rows kept per year (the result is still grouped by year).
surveys_max_weight %>% tally()

## Method 2: top_n() keeps the top n rows of each group, ranked by wt.
surveys %>%
  group_by(year) %>%
  top_n(n = 1, wt = weight) %>%
  select(year, genus, species_id, weight)
## Alternative to Method 1: skip the separate NA filter and let max() ignore
## missing weights itself.
## na.rm = TRUE must be an argument of max(), not of filter(). Passed to
## filter() it never reaches max(), so every year containing a missing weight
## gets max(weight) == NA and loses all of its rows; current dplyr rejects
## the named input outright.
## Rows whose own weight is NA still drop out, because NA == <max> is NA and
## filter() only keeps rows where the condition is TRUE.
surveys %>%
  group_by(year) %>%
  filter(weight == max(weight, na.rm = TRUE)) %>%
  select(year, genus, species_id, weight) %>%
  arrange(year) # order the result by year

## Per-year row counts of the stored Method 1 result (surveys_max_weight),
## for comparison with the rows printed above; the alternative pipeline's
## output is printed only, not stored.
tally(surveys_max_weight)
## 4. You saw above how to count the number of individuals of each sex using
##    a combination of group_by() and tally(). How could you get the same
##    result using group_by() and summarize()? Hint: see ?n.
## n() returns the number of rows in the current group.
surveys %>%
  group_by(sex) %>%
  summarize(n())
## Exporting data and cleaning up data ----
## Keep only complete records. The same cleaning is written two ways; both
## assignments produce the same surveys_complete.

## Version 1: every condition as its own filter() step.
surveys_complete <- surveys %>%
  filter(species_id != "") %>% # drop records without a species_id
  filter(!is.na(weight)) %>% # drop missing weights
  filter(!is.na(hindfoot_length)) %>% # drop missing hindfoot lengths
  filter(sex != "") # drop records without a sex

## Version 2: all conditions in one filter() call, combined with AND.
surveys_complete <- surveys %>%
  filter(
    species_id != "" &
      !is.na(weight) &
      !is.na(hindfoot_length) &
      sex != ""
  )
## Drop species that only appear a few times.
## Count the complete records per species and keep the species with at least
## 50 of them; tally() stores the count in column n.
species_counts <- surveys_complete %>%
  group_by(species_id) %>%
  tally() %>%
  filter(n >= 50)

## Keep only the records of those common species.
## %in% is TRUE for each species_id that appears in species_counts.
surveys_common_species <- surveys_complete %>%
  filter(species_id %in% species_counts$species_id)
## Write the cleaned, common-species records to csv.
## row.names = FALSE stops write.csv() from adding an unnamed leading column
## of row numbers, which would come back as an extra column when the file is
## read in again.
## The data_output/ directory has to exist already; write.csv() does not
## create it.
write.csv(
  surveys_common_species,
  file = "data_output/surveys_complete.csv",
  row.names = FALSE
)