-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2022_11_09_jack_scratchpad.R
147 lines (133 loc) · 5.6 KB
/
2022_11_09_jack_scratchpad.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
# What this script does:
# - Roughly calculate proportion/percent of payments per building that are late
# (And output histograms to PDF if you uncomment the relevant code)
# - Determine which date should be "time zero" for longitudinal analysis
# (We decided that June 01, 2016 is a good start date)
# - Creates new columns for each transaction:
# -time elapsed since June 01, 2016 in days and in months
# -a TRUE/FALSE column indicating whether this was after start of pandemic
# Necessary packages (a subset of those required in 2022_10_12_scrape_rent_ledger)
library(pacman) # p_load()
p_load(
here,
lubridate,
janitor,
readxl,
purrr,
tidyverse
)
# This loads a .Rdata file containing the building rent payment ledgers
# generated from 2022_10_12_scrape_rent_ledger.R
load(file.path(here(),
"PovertySolution_2022",
"data",
"Receivable Analytics",
"DHC Owned and Managed",
"loaded_my_buildings.Rdata"))
customers <- unique(my_buildings[[1]]$Customer)
# Calculate the ratio of "late" to "rent" charges per customer per building
#logic - look at the charges - rent vs. late charge
library(data.table)
prop_late <- list()
for (i in 1:12) {
customers <- unique(my_buildings[[i]]$Customer)
late_over_rent <- vector()
for (j in 1:length(customers)) {
# Grab all the data for customer j in this building's ledger
cust_data <- my_buildings[[i]][my_buildings[[i]]$Customer == customers[j],]
# Look only at "legitimate" charges (i.e. no credits or adjustment-type)
rent_and_late_charges <- cust_data[(cust_data$Charge %like% "am_rent" |
cust_data$Charge %like% "am_late") &
cust_data$Charges > 0 &
!cust_data$Notes %like% "ADJ" ,1:10]
#only looking at charges that were considered rent or late charges and > 0
late_charges <- cust_data[(cust_data$Charge %like% "am_late") & cust_data$Charges > 0 ,1:10]
rent_charges <- cust_data[(cust_data$Charge %like% "am_rent") & cust_data$Charges > 0 ,1:10]
late_over_rent[j] <- dim(late_charges)[1]/dim(rent_charges)[1]
}
prop_late[[i]] <- late_over_rent
print(paste("Step",i,"done"))
}
#some individuals have more late charges than rent charges, which implies
# missing data - I'll just equate them to the most truant renters in each building
for (i in 1:12) {
prop_late[[i]][which(prop_late[[i]] > 1)] <- max(prop_late[[i]][prop_late[[i]] < 1])
}
# Obtain the file path for the folder containing household data Excel files
hh_data_folder <- file.path(here::here(), "PovertySolution_2022", "data", "Household Composition Info",
"DHC Owned and Managed")
# Get the names of files (Excel only)
hh_filenames <- list.files(hh_data_folder, pattern = '.xlsx')
# Get building names
building_names = word(hh_filenames, 1, sep = '_')
# Uncomment to save to pdf
#pdf("Percent_Late_payments.pdf")
# Make 3 x 4 grid of the next 12 plots we create
par(mfrow = c(3, 4))
for (i in 1:12) {
#these are VERY skewed distributions.
hist(100*prop_late[[i]],
main = building_names[i],
xlim = c(0, 100),
xlab = "% late rent payments")
#I think we need to use kruskal testing)
}
# Run this to stop outputting plots to pdf
#dev.off()
building_no <- vector()
proportion <- vector()
counter <- 0
for (i in 1:12) {
for (j in 1:length(prop_late[[i]])) {
counter <- counter + 1
building_no[counter] <- i
proportion[counter] <- prop_late[[i]][j]
}
}
kruskal_table <- data.frame(as.factor(building_no), proportion)
kruskal.test(prop_late)
#with a test-statistic of 86.766 and a p-value of 7.153e-14, there is
# evidence to suggest that the late payment patterns of individual buildings are not the same.
#Find the earliest time for a security deposit or am_rent
times <- list()
for (i in 1:12) {
customers <- unique(my_buildings[[i]]$Customer)
earliest <- vector(mode = "character")
for (j in 1:length(customers)) {
cust_data <- my_buildings[[i]][my_buildings[[i]]$Customer == customers[j],]
rent_and_sdep_charges <- cust_data[(cust_data$Charge %like% "am_rent" | cust_data$Charge %like% "am_sdep"), 5]
#only looking at security deposits or rent charges
earliest[j] <- as.character(sort(rent_and_sdep_charges)[1], na.rm = TRUE)
}
times[[i]] <- earliest
}
earliest_time <- vector()
building_id <- vector()
counter <- 0
for (i in 1:12) {
for (j in 1:length(times[[i]])) {
counter <- counter + 1
earliest_time[counter] <- times[[i]][j]
building_id[counter] <- i
}
}
time_table <- data.frame(building_id, earliest_time)
sort(time_table$earliest_time)[1:15]
#choosing June 2016 as the 0 time (first time there is less than a month jump to the next time)
library(lubridate)
# Calculate time passed since June 01, 2016 (in DAYS)
my_buildings2 <- my_buildings
for (i in 1:12) {
my_buildings2[[i]]$Transaction2 <- ymd(my_buildings2[[i]]$Transaction) - ymd("2016-06-01") #time difference in days of the transaction date from the "0 time"
}
# Calculate time passed since June 01, 2016 (in MONTHS)
my_buildings2 <- my_buildings
for (i in 1:12) {
my_buildings2[[i]]$Transaction_m <- interval(ymd(my_buildings2[[i]]$Transaction),
ymd("2016-06-01")) %>%
as.numeric("months") #time difference in days of the transaction date from the "0 time"
}
ymd("2020-03-01") - ymd("2016-06-01") #1369 days
for (i in 1:12) {
my_buildings2[[i]]$COVID <- my_buildings2[[i]]$Transaction2 > (ymd("2020-03-01") - ymd("2016-06-01"))
} #separating out the dates before COVID and the dates after COVID