more code, more problems

statcom-um · Dec 2, 2022 · c41ef12 · c41ef12
1 parent eb9cc0b
commit c41ef12
Show file tree

Hide file tree

Showing 14 changed files with 3,813 additions and 13 deletions.
diff --git a/code/2022_11_21_soumik.R b/code/2022_11_21_soumik.R
@@ -51,20 +51,10 @@ covariates <- read_csv(file.path(here(), "data", "created_data", "2022_07_31.csv
 
 data <- covariates %>% 
   left_join(prop) %>% 
-  left_join((my_tenants %>% select(Customer, Property) %>% rename(tid = Customer) %>% unique() %>% drop_na())) %>% 
   drop_na() 
 
-data <- data %>% select(colnames(data)[c(1, 2, 3, 4, 5, 6, 7, 11, 12, 13, 14)])  
-colnames(data) <- c("tid", "age", "sex", "num_hh", "num_dep", "num_br", "income", "rent", "total_trans", "late_trans", "property")
+data <- data %>% select(colnames(data)[c(1, 2, 3, 4, 5, 6, 7, 11, 12, 13)])  
+colnames(data) <- c("tid", "age", "sex", "num_hh", "num_dep", "num_br", "income", "rent", "total_trans", "late_trans")
 
-modified_p <- function(x, n){
-  prop.test(x, n, conf.level = 0.50)$conf.int[2]
-}
-
-data <- data %>% 
-  rowwise() %>% 
-  mutate(mod_p = modified_p(late_trans, total_trans))
+#data %>% write_csv(file.path(here(), "data", "created_data", "2022_11_22.csv"))
 
-data %>% 
-  ggplot(aes(x = property, y = mod_p, fill = property)) + 
-  geom_violin()
diff --git a/code/2022_11_30_jack.R b/code/2022_11_30_jack.R
@@ -0,0 +1,68 @@
+#an exploration of late payments and how to handle them
+
+prop_late_csv <- read.csv(file = "C:/Users/Jack Li/Downloads/2022_11_22.csv")
+
+prop_late_csv_orig <- read.csv(file = "C:/Users/Jack Li/Downloads/2022_03_21.csv")
+
+prop_late_csv$name <- rep("", 1155)
+for (i in 1:1155) {
+  prop_late_csv$name[i] <- prop_late_csv_orig$name[which(prop_late_csv_orig$tid == prop_late_csv$tid[i])]
+}
+
+prop_late_csv$proportion_late <- prop_late_csv$late_trans/prop_late_csv$total_trans #grabbing the proportion of late payments
+#there are results over 1 - we have to adjust for that.
+
+# prop_late_csv$propL_adjusted <- prop_late_csv$proportion_late
+# for (i in 1:1257) {
+#   same_household_max <- max(prop_late_csv$proportion_late[prop_late_csv$name == prop_late_csv$name[i] & prop_late_csv$proportion_late <= 1])
+#   if (prop_late_csv$proportion_late[i] > 1) {
+#     prop_late_csv$propL_adjusted[i] <- same_household_max
+#     print(paste("Changed", i, "from", prop_late_csv$proportion_late[i], "to", same_household_max))
+#   }
+#} #a crude form of adjustment that replaces any proportions over 1 with the highest proportion of late payments <=1 from the same building and does nothing else to the proportions.
+  #DON'T NEED IT ANYMORE - EVERYTHING IS ALREADY UNDER 1
+
+#Lower Wilson score intervals
+  #Wilson scores are a type of binomial confidence interval that adjust for both the observed sample size and the 
+  #we can take the lower bound of the 95% confidence Wilson interval - the lower bound allows for the benefit of the doubt for those with very few recorded payments, even if they have a decently high proportion of them.
+#in other words, it treats 1 month late, 2 months total more leniently than 15 months late, 30 total (the lower interval for the former is 0.025, the latter is 0.331).
+  #this interval can be tuned by the confidence level - if alpha = 0.05 leads to CIs that you think are too generous with their lower bound, let me know and I can make the alpha parameter make a little "more sense" - I started with alpha = 0.05 as the default 95% CI, but setting alpha = 50% instead turns 1 month late, 2 months total with a lower bound of ~0.35, for example, which may be more sensible.
+
+library(Hmisc)
+prop_late_csv$wilson_prop_late <- rep(0, 1155)
+for (i in 1:1155) {
+  x <- prop_late_csv$late_trans[i]
+  n <- prop_late_csv$total_trans[i]
+  if (x > n) n = x #if it has x late payments, there need to be at least x payments total.
+  prop_late_csv$wilson_prop_late[i] <- binconf(x, n, alpha= 0.5, method = "wilson")[2] #lower bound of interval
+  #for a select amount of observations, the wilson interval goes VERY slightly below 0 due to some precision weirdness - I bumped those up to 0 since negative proportions are theoretically impossible to accomplish.
+  if (prop_late_csv$wilson_prop_late[i] < 0 ) {
+    prop_late_csv$wilson_prop_late[i] = 0
+  }
+}
+
+library(ggplot2)
+
+#Mean, median, IQR, SD per building
+#boxplots, violin plots
+lateproportion_summary_stats <- tapply(prop_late_csv$proportion_late, prop_late_csv$name, summary)
+latewilson_summary_stats <- tapply(prop_late_csv$wilson_prop_late, prop_late_csv$name, summary)
+saveRDS(lateproportion_summary_stats, file = "C:/Users/Jack Li/Downloads/raw_late_proportion_summary_stats.RDS")
+saveRDS(latewilson_summary_stats, file = "C:/Users/Jack Li/Downloads/wilson_late_proportion_summary_stats.RDS")
+library(table1)
+
+table1(~ age + factor(sex) + income + num_dep + rent + proportion_late + wilson_prop_late| name, data = prop_late_csv)
+library(flextable)
+library(dplyr)
+tbl1 <- table1(~ age + factor(sex) + income + num_dep + rent + proportion_late + wilson_prop_late| name, data = prop_late_csv)
+write.csv(tbl1, file = "C:/Users/Jack Li/Downloads/STATCOM_prop_late_summary_statistics.csv")
+
+windows()
+ggplot(data = prop_late_csv, aes(x = name, y = proportion_late)) + geom_violin() +theme_classic(base_size = 10) + geom_boxplot(width = 0.1) + ylim(0,1)
+
+windows()
+ggplot(data = prop_late_csv, aes(x = name, y = wilson_prop_late)) + geom_violin() +theme_classic(base_size = 10) + geom_boxplot(width = 0.1) + ylim(0, 1) #a rudimentary violin plot of results when using Wilson scores.
+
+#you'll notice in the summary statistics and in the violin plots that using the Wilson score has the tendency to shrink most proportions to zero (by nature of using the lower bound instead of the point estimate) - this could lead to some trickiness with interpretation as we are using a lower bound of a confidence interval as our response variable, rather than a point estimate. Nonetheless, if we plan to categorize tenants by quartiles and/or in a way that is relative to each other, this should not be a major issue.
+
+#I am more partial to the Wilson score implementation since it can take into account different sample sizes in each point estimate, and I think I can adjust it easily to be more useful, but I do understand how shifting the estimates down is troublesome and how it may be a bit harder to explain - let me know what you think.
diff --git a/code/2022_12_02.R b/code/2022_12_02.R
@@ -0,0 +1,119 @@
+library(pacman)
+
+`%notin%` <- Negate(`%in%`)
+
+p_load(
+  #--- Packages to Fit Models
+  MASS, logistf, survival, fastR2, 
+  #--- Packages to Produce Tables
+  gtsummary, flextable, janitor, broom, officer, kableExtra, reactable, broom.mixed, lmerTest,
+  #--- Packages to Produce Figures
+  crayon, ggsci, ggridges, ggthemes, ggforce, ggpubr, patchwork, grid, gridExtra, plotly,  survminer, viridis, ggridges, hrbrthemes, stickylabeller, latex2exp, scales, glue, 
+  #--- Packages for Data Retrieval & Pre-Processing
+  readxl, here, rdrop2, lubridate, zoo, tidyverse, purrr, data.table, stringr, tidyr, table1
+)
+
+y <- read_csv(file.path(here(), "data", "created_data", "2022_11_21.csv")) %>% 
+  group_by(Customer) %>% 
+  summarise(tid = unique(Customer), total = n(), late = sum(!is.na(lateFees))) %>% 
+  select(tid, total, late) %>% 
+  rename(n_total = "total", 
+         n_late = "late")
+
+covariates <- read_csv(file.path(here(), "data", "created_data", "2022_07_31.csv")) %>% 
+  rename(age = "Age on effective date of action", 
+         sex = "Sex", 
+         n_hh = "Total number in household", 
+         n_dep = "Number of dependents", 
+         n_br = "Number of bedrooms in unit", 
+         income = "Adjusted annual income: 8a minus 8x (if 8x is larger, put 0)", 
+         rent = "Tenant rent: 10d minus 10en If positive or 0, put tenant rent If negative, credit tenant") %>% 
+  select(c(tid, age, sex, n_hh, n_dep, n_br, income, rent))
+
+## some weird duplication issue, dropping duplicates for now
+covariates <- covariates %>% 
+  filter(tid %notin% 
+           (covariates %>% group_by(tid) %>% summarise(n = n()) %>% filter(n > 1) %>% pull(tid)))
+
+buildings <- read_csv(file.path(here(), "data", "created_data", "2022_03_21.csv")) %>% 
+  select(c(tid, name, late_duration, late_amount))
+
+data <- covariates %>% inner_join(y) %>% inner_join(buildings) %>% rowwise() %>% 
+  mutate(n_hh = case_when(n_hh > 3 ~ factor("> 3",  levels = c("1", "2", "3", "> 3")),
+                          TRUE ~ factor(n_hh, levels = c("1", "2", "3", "> 3"))), 
+         n_dep = case_when(TRUE ~ factor(n_dep, levels = c("0", "1", "2"))),
+         n_br = case_when(n_br >= 3 ~ factor(">=3",  levels = c("1", "2", ">=3")),
+                          TRUE ~ factor(n_br, levels = c("1", "2", ">=3"))))
+
+data <- data %>% drop_na() %>% 
+  rowwise() %>%
+  mutate(prop = wilson.ci(x = n_late, n = n_total, conf.level = 0.50)[1]) %>% 
+  mutate(prop = case_when(prop < 0 ~ 0, 
+                          prop > 1 ~ 1, 
+                          TRUE ~ prop))
+
+table1(~ age + sex + n_hh + n_dep + n_br + income| name, data %>% drop_na(), 
+       render.continuous = c(.="Mean (SD)", 
+                             .="Median (IQR)", 
+                             .="[Min, Max]")) 
+
+ggplotly(data %>%  
+  ggplot(aes(y = name, x = age, fill = name)) + 
+  geom_violin(width=1.4) +
+  geom_boxplot(width=0.1, color="black") +
+  theme_bw() + 
+  xlab("Age (years)") + 
+  ylab("") + 
+  theme(legend.position = "bottom") + 
+  labs(fill = "Building"))
+
+data %>% 
+  group_by(name, sex) %>% 
+  summarize(n = n()) %>%
+  drop_na() %>% 
+  ggplot(aes(fill = sex, x = name, y = n)) + 
+  geom_bar(position="fill", stat="identity") +
+  theme_bw() + 
+  ylab("Fraction") + 
+  xlab("") + 
+  theme(legend.position = "bottom",
+        legend.box="vertical", legend.margin=margin(),
+        axis.text.x = element_text(face = "bold", size = 18),
+        axis.text.y = element_text(face = "bold", size = 18),
+        plot.title = element_text(face = "bold", size = 18),
+        plot.subtitle = element_text(size = 14),
+        #axis.text = element_text(size = 12),
+        axis.title = element_text(face = "bold", size = 18),
+        legend.title = element_text(face = "bold", size = 18),
+        legend.text = element_text(size = 18)) + 
+  labs(fill = "Sex") + 
+  coord_flip()
+
+
+data %>% 
+  group_by(name, n_hh) %>% 
+  summarize(n = n()) %>% 
+  ungroup() %>% 
+  drop_na() %>% 
+  ggplot(aes(fill = n_hh, x = name, y = n)) + 
+  geom_bar(position="fill", stat="identity") +
+  theme_bw() + 
+  ylab("Fraction") + 
+  xlab("") + 
+  theme(legend.position = "bottom",
+        legend.box="vertical", legend.margin=margin(),
+        axis.text.x = element_text(face = "bold", size = 18),
+        axis.text.y = element_text(face = "bold", size = 18),
+        plot.title = element_text(face = "bold", size = 18),
+        plot.subtitle = element_text(size = 14),
+        #axis.text = element_text(size = 12),
+        axis.title = element_text(face = "bold", size = 18),
+        legend.title = element_text(face = "bold", size = 18),
+        legend.text = element_text(size = 18)) + 
+  labs(fill = "Sex") + 
+  coord_flip()
+
+op <- tidy(lmerTest::lmer(prop ~ -1 + age + (1 | name), data = data))
+op[-c(nrow(op)-1, nrow(op)), ] %>% kbl() %>% kable_paper("hover", full_width = T)
+
+