more commits more problems urgh

statcom-um · Nov 21, 2022 · eb9cc0b · eb9cc0b
1 parent 415ebfc
commit eb9cc0b
Show file tree

Hide file tree

Showing 54 changed files with 58,965 additions and 3,085 deletions.
diff --git a/code/2022_10_12_rent_summary.R b/code/2022_10_12_rent_summary.R
@@ -51,24 +51,24 @@ source(file.path(here(), "code", "2022_10_12_scrape_rent_ledger.R"))
 
 ## find out how many times tenant had late payment
 clean_rent_late_count = function(dat) {
-  return(left_join(dat |> 
-                     filter(grepl("Tenant Rent", Charge)) |>
-                     filter(Charges > 0) |>
-                     rowwise() |>
-                     mutate(late_by = day(Transaction)) |>
-                     group_by(Customer) |>
+  return(left_join(dat %>% 
+                     filter(grepl("Tenant Rent", Charge)) %>%
+                     filter(Charges > 0) %>%
+                     rowwise() %>%
+                     mutate(late_by = day(Transaction)) %>%
+                     group_by(Customer) %>%
                      summarise(total = n()), 
-                   dat |> 
-                     filter(grepl("Tenant Late Charge", Charge)) |>
-                     rowwise() |>
-                     mutate(late_by = day(Transaction)) |>
-                     group_by(Customer) |>
+                   dat %>% 
+                     filter(grepl("Tenant Late Charge", Charge)) %>%
+                     rowwise() %>%
+                     mutate(late_by = day(Transaction)) %>%
+                     group_by(Customer) %>%
                      summarise(late = n()) 
   ))
 }
 
 late_count <- my_buildings %>% 
-  map_dfr(., clean_rent_late_count) |>
+  map_dfr(., clean_rent_late_count) %>%
   mutate(late = ifelse(is.na(late), 0, late))
 
 tenants = my_buildings[[1]]
@@ -80,32 +80,32 @@ for(i in 2:12){
 ## find out when tenant paid in same month after late fees added
 clean_rent_late_count = function(id) {
 
-  dat <- tenants |>
-    filter(Customer == id) |>
+  dat <- tenants %>%
+    filter(Customer == id) %>%
     mutate(index = seq_along(Charge)) 
 
-  pos <- dat |> 
-    filter(grepl("Tenant Late Charge", Charge)) |>
+  pos <- dat %>% 
+    filter(grepl("Tenant Late Charge", Charge)) %>%
     pull(index) + 1
 
-  month <- dat |> 
-    filter(grepl("Tenant Late Charge", Charge)) |>
-    select(c(index, Transaction, Receipts, Charge, Balance)) |>
-    mutate(month = lubridate::month(Transaction)) |>
-    select(c(index, Transaction, month, Charge)) |>
+  month <- dat %>% 
+    filter(grepl("Tenant Late Charge", Charge)) %>%
+    select(c(index, Transaction, Receipts, Charge, Balance)) %>%
+    mutate(month = lubridate::month(Transaction)) %>%
+    select(c(index, Transaction, month, Charge)) %>%
     rename(month_rent = month)
 
-  payment <- dat |> 
-    select(c(index, Transaction, Receipts)) |>
-    filter(Receipts < 0) |>
+  payment <- dat %>% 
+    select(c(index, Transaction, Receipts)) %>%
+    filter(Receipts < 0) %>%
     mutate(index = index - 1, 
            month_pay = lubridate::month(Transaction))
 
-  same <- left_join(payment, month, by = "index") |>
-    filter(month_pay == month_rent) |> 
+  same <- left_join(payment, month, by = "index") %>%
+    filter(month_pay == month_rent) %>% 
     mutate(late_duration = as.numeric(Transaction.x - Transaction.y), 
-           rent = -Receipts) |>
-    select(c(index, late_duration, rent, Transaction.x)) |>
+           rent = -Receipts) %>%
+    select(c(index, late_duration, rent, Transaction.x)) %>%
     rename(date = Transaction.x)
 
   if(nrow(same) == 0){
@@ -121,29 +121,29 @@ for(id in unique(tenants$Customer)[-1]){
                clean_rent_late_count(as.character(id)))
 }
 
-dat <- full_join(as_tibble(same) |>
+dat <- full_join(as_tibble(same) %>%
                    mutate(V2 = as.numeric(V2), 
                           V3 = as.numeric(V3), 
-                          V4 = as.numeric(V4)) |>
+                          V4 = as.numeric(V4)) %>%
                    rename(same_times = V2, 
                           median_late_duration = V3, 
                           mean_late_amount = V4, 
                           tid = V1), 
-                 late_count |> rename(tid = Customer)) |>
+                 late_count %>% rename(tid = Customer)) %>%
   rename(late_same = same_times, 
          late_duration = median_late_duration, 
-         late_amount = mean_late_amount) |>
+         late_amount = mean_late_amount) %>%
   select(c(tid, total, late, late_same, late_duration, late_amount))
 
-dat <- left_join(tenants |> 
-                   rename(tid = Customer, location = Property) |> 
-                   select(c(location, tid)) |> 
-                   group_by(tid) |> 
+dat <- left_join(tenants %>% 
+                   rename(tid = Customer, location = Property) %>% 
+                   select(c(location, tid)) %>% 
+                   group_by(tid) %>% 
                    summarise(tid = unique(tid), 
-                             location = unique(location)) |>
-                   filter(location != "Grand Total") |>
+                             location = unique(location)) %>%
+                   filter(location != "Grand Total") %>%
                    drop_na(), 
-                 dat) |>
+                 dat) %>%
   drop_na()
 
 loc_code = dat %>% pull(location) %>% unique()
@@ -199,7 +199,7 @@ dat  %>%
 ## scatterplot of delay in rent payment vs amount of rent due
 dat  %>%
   filter(late_same > 0, late_amount > 0, late_duration > 0) %>% 
-  ggplot(aes(x = late_duration, y = late_amount, color = name)) +
+  ggplot(aes(x = late_duration, y = late_amount, color = name)) + 
   geom_point() +
   scale_fill_futurama() +
   theme_bw() +

diff --git a/code/2022_10_12_scrape_rent_ledger.R b/code/2022_10_12_scrape_rent_ledger.R
@@ -1,5 +1,6 @@
 #--- Source Necessary Packages -------------------------------------------------
 
+
 library(pacman)
 
 `%notin%` <- Negate(`%in%`)
@@ -49,16 +50,15 @@ p_load(
 
 #directory based programming
 data_folder = file.path(here::here(),
-                        'data',
-                        'Receivable\ Analytics',
+                        'data', 'Receivable\ Analytics',
                         'DHC\ Owned\ and\ Managed')
 my_files = list.files(data_folder, pattern = '.xlsx')
 
 #get building names
 building_names = word(my_files, 1, sep = '_')
 
 #read in all excel files
-my_buildings = file.path(data_folder, my_files) %>%
+rent_ledger = file.path(data_folder, my_files) %>%
   map(., read_xlsx, col_type = 'text')
 
 #data cleaning as a function as written in 2022_02_25.R
@@ -90,7 +90,7 @@ clean_buildings = function(dat) {
         ),
         NA_character_
       ) %>% factor(),
-      Transaction = Transaction |> as.numeric() |> as_date(origin = "1899-12-30 UTC"),
+      Transaction = Transaction %>% as.numeric() %>% as_date(origin = "1899-12-30 UTC"),
       across(Charges:Balance, as.numeric)
     ) %>%
     #--- Impute Resident T-Codes
@@ -113,4 +113,5 @@ clean_buildings = function(dat) {
     return(new_dat)
 }
 #apply function to list of buildings
-my_buildings = my_buildings %>% map(., clean_buildings)
+my_buildings = rent_ledger %>% map(., clean_buildings)
+my_tenants = rent_ledger %>% map_dfr(., clean_buildings)
diff --git a/code/2022_11_09_jack_scratchpad.R b/code/2022_11_09_jack_scratchpad.R
@@ -0,0 +1,147 @@
+# What this script does:
+# - Roughly calculate proportion/percent of payments per building that are late
+#   (And output histograms to PDF if you uncomment the relevant code)
+# - Determine which date should be "time zero" for longitudinal analysis
+#   (We decided that June 01, 2016 is a good start date)
+# - Creates new columns for each transaction: 
+#     -time elapsed since June 01, 2016 in days and in months
+#     -a TRUE/FALSE column indicating whether this was after start of pandemic
+
+# Necessary packages (a subset of those required in 2022_10_12_scrape_rent_ledger)
+library(pacman) # p_load()
+p_load(
+  here,
+  lubridate,
+  janitor,
+  readxl,
+  purrr,
+  tidyverse
+)
+
+# This loads a .Rdata file containing the building rent payment ledgers 
+# generated from 2022_10_12_scrape_rent_ledger.R
+load(file.path(here(), 
+               "PovertySolution_2022", 
+               "data", 
+               "Receivable Analytics", 
+               "DHC Owned and Managed",
+               "loaded_my_buildings.Rdata"))
+customers <- unique(my_buildings[[1]]$Customer)
+
+# Calculate the ratio of "late" to "rent" charges per customer per building
+#logic - look at the charges - rent vs. late charge
+library(data.table)
+prop_late <- list()
+for (i in 1:12) {
+  customers <- unique(my_buildings[[i]]$Customer)
+  late_over_rent <- vector()
+  for (j in 1:length(customers)) {
+    # Grab all the data for customer j in this building's ledger
+    cust_data <- my_buildings[[i]][my_buildings[[i]]$Customer == customers[j],]
+    # Look only at "legitimate" charges (i.e. no credits or adjustment-type)
+    rent_and_late_charges <- cust_data[(cust_data$Charge %like% "am_rent" | 
+                                          cust_data$Charge %like% "am_late") & 
+                                         cust_data$Charges > 0 & 
+                                         !cust_data$Notes %like% "ADJ" ,1:10]
+    #only looking at charges that were considered rent or late charges and > 0
+    late_charges <- cust_data[(cust_data$Charge %like% "am_late") & cust_data$Charges > 0 ,1:10]
+    rent_charges <- cust_data[(cust_data$Charge %like% "am_rent") & cust_data$Charges > 0 ,1:10]
+    late_over_rent[j] <- dim(late_charges)[1]/dim(rent_charges)[1]
+  }
+  prop_late[[i]] <- late_over_rent
+  print(paste("Step",i,"done"))
+}
+
+#some individuals have more late charges than rent charges, which implies 
+# missing data - I'll just equate them to the most truant renters in each building
+for (i in 1:12) {
+    prop_late[[i]][which(prop_late[[i]] > 1)] <- max(prop_late[[i]][prop_late[[i]] < 1])
+}
+
+# Obtain the file path for the folder containing household data Excel files
+hh_data_folder <- file.path(here::here(), "PovertySolution_2022", "data", "Household Composition Info",
+                            "DHC Owned and Managed")
+
+# Get the names of files (Excel only)
+hh_filenames <- list.files(hh_data_folder, pattern = '.xlsx')
+
+# Get building names
+building_names = word(hh_filenames, 1, sep = '_')
+
+# Uncomment to save to pdf
+#pdf("Percent_Late_payments.pdf") 
+# Make 3 x 4 grid of the next 12 plots we create
+par(mfrow = c(3, 4))
+for (i in 1:12) {
+  #these are VERY skewed distributions.
+  hist(100*prop_late[[i]], 
+       main = building_names[i], 
+       xlim = c(0, 100),
+       xlab = "% late rent payments")
+#I think we need to use kruskal testing)
+}
+# Run this to stop outputting plots to pdf
+#dev.off()
+
+building_no <- vector()
+proportion <- vector()
+counter <- 0
+for (i in 1:12) {
+  for (j in 1:length(prop_late[[i]])) {
+    counter <- counter + 1
+    building_no[counter] <- i
+    proportion[counter] <- prop_late[[i]][j]
+  }
+}
+kruskal_table <- data.frame(as.factor(building_no), proportion)
+
+kruskal.test(prop_late)
+#with a test-statistic of 86.766 and a p-value of 7.153e-14, there is 
+# evidence to suggest that the late payment patterns of individual buildings are not the same.
+
+#Find the earliest time for a security deposit or am_rent
+times <- list()
+for (i in 1:12) {
+  customers <- unique(my_buildings[[i]]$Customer)
+  earliest <- vector(mode = "character")
+  for (j in 1:length(customers)) {
+    cust_data <- my_buildings[[i]][my_buildings[[i]]$Customer == customers[j],]
+    rent_and_sdep_charges <- cust_data[(cust_data$Charge %like% "am_rent" | cust_data$Charge %like% "am_sdep"), 5]
+    #only looking at security deposits or rent charges
+    earliest[j] <- as.character(sort(rent_and_sdep_charges)[1], na.rm = TRUE)
+  }
+  times[[i]] <- earliest
+}
+
+earliest_time <- vector()
+building_id <- vector()
+counter <- 0
+for (i in 1:12) {
+  for (j in 1:length(times[[i]])) {
+    counter <- counter + 1
+    earliest_time[counter] <- times[[i]][j]
+    building_id[counter] <- i
+  }
+}
+time_table <- data.frame(building_id, earliest_time)
+sort(time_table$earliest_time)[1:15] 
+#choosing June 2016 as the 0 time (first time there is less than a month jump to the next time)
+
+library(lubridate)
+# Calculate time passed since June 01, 2016 (in DAYS)
+my_buildings2 <- my_buildings
+for (i in 1:12) {
+  my_buildings2[[i]]$Transaction2 <- ymd(my_buildings2[[i]]$Transaction) - ymd("2016-06-01") #time difference in days of the transaction date from the "0 time"
+}
+
+# Calculate time passed since June 01, 2016 (in MONTHS)
+my_buildings2 <- my_buildings
+for (i in 1:12) {
+  my_buildings2[[i]]$Transaction_m <- interval(ymd(my_buildings2[[i]]$Transaction),  
+                                              ymd("2016-06-01")) %>% 
+    as.numeric("months") #time difference in days of the transaction date from the "0 time"
+}
+ymd("2020-03-01") - ymd("2016-06-01") #1369 days
+for (i in 1:12) {
+  my_buildings2[[i]]$COVID <- my_buildings2[[i]]$Transaction2 > (ymd("2020-03-01") - ymd("2016-06-01"))
+} #separating out the dates before COVID and the dates after COVID
diff --git a/code/2022_11_21_jack_scratchpad.R b/code/2022_11_21_jack_scratchpad.R
@@ -0,0 +1,51 @@
+#an exploration of late payments and how to handle them
+
+prop_late_csv <- read.csv(file = "C:/Users/Jack Li/Downloads/2022_03_21.csv")
+
+prop_late_csv$proportion_late <- prop_late_csv$late/prop_late_csv$total #grabbing the proportion of late payments
+#there are results over 1 - we have to adjust for that.
+
+prop_late_csv$propL_adjusted <- prop_late_csv$proportion_late
+for (i in 1:1257) {
+  same_household_max <- max(prop_late_csv$proportion_late[prop_late_csv$name == prop_late_csv$name[i] & prop_late_csv$proportion_late <= 1])
+  if (prop_late_csv$proportion_late[i] > 1) {
+    prop_late_csv$propL_adjusted[i] <- same_household_max
+    print(paste("Changed", i, "from", prop_late_csv$proportion_late[i], "to", same_household_max))
+  }
+} #a crude form of adjustment that replaces any proportions over 1 with the highest proportion of late payments <=1 from the same building and does nothing else to the proportions.
+
+#Lower Wilson score intervals
+  #Wilson scores are a type of binomial confidence interval that adjust for both the observed sample size and the 
+  #we can take the lower bound of the 95% confidence Wilson interval - the lower bound allows for the benefit of the doubt for those with very few recorded payments, even if they have a decently high proportion of them.
+#in other words, it treats 1 month late, 2 months total more leniently than 15 months late, 30 total (the lower interval for the former is 0.025, the latter is 0.331).
+  #this interval can be tuned by the confidence level - if alpha = 0.05 leads to CIs that you think are too generous with their lower bound, let me know and I can make the alpha parameter make a little "more sense" - I started with alpha = 0.05 as the default 95% CI, but setting alpha = 50% instead turns 1 month late, 2 months total with a lower bound of ~0.35, for example, which may be more sensible.
+
+library(Hmisc)
+prop_late_csv$wilson_prop_late <- rep(0, 1257)
+for (i in 1:1257) {
+  x <- prop_late_csv$late[i]
+  n <- prop_late_csv$total[i]
+  if (x > n) n = x #if it has x late payments, there need to be at least x payments total.
+  prop_late_csv$wilson_prop_late[i] <- binconf(x, n, alpha= 0.05, method = "wilson")[2] #lower bound of interval
+  #for a select amount of observations, the wilson interval goes VERY slightly below 0 due to some precision weirdness - I bumped those up to 0 since negative proportions are theoretically impossible to accomplish.
+  if (prop_late_csv$wilson_prop_late[i] < 0 ) {
+    prop_late_csv$wilson_prop_late[i] = 0
+  }
+}
+
+library(ggplot2)
+
+#Mean, median, IQR, SD per building
+#boxplots, violin plots
+tapply(prop_late_csv$propL_adjusted, prop_late_csv$name, summary)
+tapply(prop_late_csv$wilson_prop_late, prop_late_csv$name, summary)
+
+windows()
+ggplot(data = prop_late_csv, aes(x = name, y = propL_adjusted)) + geom_violin() +theme_classic(base_size = 10) + geom_boxplot(width = 0.1) + ylim(0,1) #a rudimentary violin plot of results when simply pushing everything above 1 below to the maximum value <= 1
+
+windows()
+ggplot(data = prop_late_csv, aes(x = name, y = wilson_prop_late)) + geom_violin() +theme_classic(base_size = 10) + geom_boxplot(width = 0.1) + ylim(0, 1) #a rudimentary violin plot of results when using Wilson scores.
+
+#you'll notice in the summary statistics and in the violin plots that using the Wilson score has the tendency to shrink most proportions to zero (by nature of using the lower bound instead of the point estimate) - this could lead to some trickiness with interpretation as we are using a lower bound of a confidence interval as our response variable, rather than a point estimate. Nonetheless, if we plan to categorize tenants by quartiles and/or in a way that is relative to each other, this should not be a major issue.
+
+#I am more partial to the Wilson score implementation since it can take into account different sample sizes in each point estimate, and I think I can adjust it easily to be more useful, but I do understand how shifting the estimates down is troublesome and how it may be a bit harder to explain - let me know what you think.