-
Notifications
You must be signed in to change notification settings - Fork 137
/
Copy path8-variable.r
53 lines (40 loc) · 1.75 KB
/
8-variable.r
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
library(ggplot2)
library(plyr)
# Goal of this script: try to find names that vary a lot between the states.
# Load the per-state name counts, keeping text columns as character vectors.
# TRUE/FALSE are spelled out because T and F are ordinary variables that can
# be reassigned.
bnames <- read.csv("baby-names-by-state.csv", stringsAsFactors = FALSE)
# Drop rows whose `number` is missing.
bnames <- subset(bnames, !is.na(number))
# Store state as a factor.
bnames$state <- factor(bnames$state)
# Attach the birth totals to every name record (matched on state, year and
# sex), then express each count as a proportion of those births.
births <- read.csv("births.csv")
bnames <- merge(
  x = bnames,
  y = births,
  by = c("state", "year", "sex")
)
bnames[["prop"]] <- bnames[["number"]] / bnames[["births"]]
# Keep only name/sex combinations that appear in more than 25% of the
# possible year-and-state cells (1250 of them) and whose summed `number`
# exceeds 1e5.
bnames$namesex <- with(bnames, paste(name, sex, sep = "-"))
# One row per name/sex: how many records it has and its total count.
counts <- ddply(
  bnames, c("namesex"), summarise,
  n = length(namesex),
  number = sum(number)
)
# Filter first, then sort by total count, largest first.
counts <- subset(counts, n > 1250 * 0.25 & number > 1e5)
counts <- counts[order(-counts$number), ]
# All original records belonging to the retained name/sex combinations.
top <- bnames[bnames$namesex %in% counts$namesex, ]
# Plot how the proportion for a single name/sex combination changes over the
# years, drawing one line per state.
#
# name: a `namesex` value (name and sex joined by "-") to look up in the
#   global `top` data frame.
# Returns the plot object produced by qplot().
show_name <- function(name) {
  is_match <- top$namesex == name
  selected <- top[is_match, ]
  qplot(year, prop, data = selected, geom = "line", group = state)
}
# Look for names where there is a lot of variation in pattern between states.
# Correlation approach: reshape to one column per state, correlate the state
# columns within each name, and rank names by their smallest correlation.
# NOTE(review): cast() does not appear to come from the packages attached at
# the top of this file; presumably it is reshape::cast -- confirm that the
# reshape package is attached before running this section.
bystate <- cast(top, namesex + year ~ state, value = "prop")
cors <- dlply(bystate, "namesex", function(df) {
  # Drop the first two columns (presumably the namesex and year ids from the
  # cast formula) so only the per-state columns are correlated.
  cor(as.matrix(df[, -(1:2)]), use = "pairwise.complete.obs")
})
# Minimum of each name's correlation matrix, sorted ascending. The result is
# printed, not stored.
arrange(ldply(cors, min, na.rm = TRUE), V1)
# Modelling approach - seems to do much better.
# For each name/sex combination, regress the proportion on year (treated as a
# factor), weighting observations by sqrt(births).
patterns <- dlply(top, c("namesex"), function(df) {
  # lm()'s argument is `weights`; spell it in full instead of relying on
  # partial argument matching.
  lm(prop ~ factor(year), data = df, weights = sqrt(births))
}, .progress = "text")
# Summarise the fit quality of one model.
#
# mod: a fitted model whose summary() contains `r.squared` and `sigma`
#   elements (e.g. an lm fit).
# Returns a one-row data frame with columns `rsq` and `sigma`.
rsq <- function(mod) {
  fit <- summary(mod)
  # Built with base data.frame() and exact `[[` lookups so the result does not
  # depend on which package's summarise() is on the search path.
  data.frame(rsq = fit[["r.squared"]], sigma = fit[["sigma"]])
}
# Compute rsq/sigma for every fitted model, attach the per-name counts, and
# sort by descending R-squared.
qual <- arrange(merge(ldply(patterns, rsq), counts), -rsq)
# Take every fifth name down that ranking, plus "Juan-boy" explicitly.
sub <- c(as.character(qual$namesex[seq(1, nrow(qual), by = 5)]), "Juan-boy")
interesting <- subset(bnames, namesex %in% sub)
# Write comma-separated output without row names. `row.names = FALSE` is
# spelled out in full rather than relying on partial matching and on `F`.
write.table(
  interesting, "interesting-names.csv",
  sep = ",", row.names = FALSE
)