-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path2022_10_12_scrape_rent_ledger.R
117 lines (102 loc) · 2.99 KB
/
2022_10_12_scrape_rent_ledger.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
#--- Source Necessary Packages -------------------------------------------------
library(pacman)
`%notin%` <- Negate(`%in%`)
p_load(
#--- Packages to Fit Models
MASS,
logistf,
survival,
#--- Packages to Produce Tables
gtsummary,
flextable,
janitor,
broom,
officer,
kableExtra,
#--- Packages to Produce Figures
ggsci,
ggridges,
ggthemes,
ggforce,
ggpubr,
patchwork,
grid,
gridExtra,
survminer,
viridis,
ggridges,
hrbrthemes,
stickylabeller,
#--- Packages for Data Retrieval & Pre-Processing
readxl,
here,
rdrop2,
lubridate,
zoo,
tidyverse,
purrr
)
#directory based programming
data_folder = file.path(here::here(),
'data', 'Receivable\ Analytics',
'DHC\ Owned\ and\ Managed')
my_files = list.files(data_folder, pattern = '.xlsx')
#get building names
building_names = word(my_files, 1, sep = '_')
#read in all excel files
rent_ledger = file.path(data_folder, my_files) %>%
map(., read_xlsx, col_type = 'text')
#data cleaning as a function as written in 2022_02_25.R
#replaced |> pipe with %>% pipe because I'm a bad programmer who is
#behind in R versions
clean_buildings = function(dat) {
new_dat = dat %>%
#-- Slice Off Unnecessary Header Rows and Rename Columns
slice(-1, -3, -4) %>%
row_to_names(1) %>%
#-- String Match Start of New Resident Records and Fix Variables
mutate(
NewRes = grepl("\\(", Property),
# modified to deal with Resident issues in Greenbrooke
#id those where there is a name instead of 'resident'
Identified = case_when(
grepl("\\(", Property) &
!grepl('Resident', Property) ~ 'Identified',
grepl("\\(", Property) &
grepl('Resident', Property) ~ 'Deidentified',
TRUE ~ NA_character_
),
Customer = if_else(
NewRes,
gsub(
"\\(([^()]+)\\)",
"\\1",
str_extract(Property, "\\(([^()]+)\\)")
),
NA_character_
) %>% factor(),
Transaction = Transaction %>% as.numeric() %>% as_date(origin = "1899-12-30 UTC"),
across(Charges:Balance, as.numeric)
) %>%
#--- Impute Resident T-Codes
fill(Customer, Identified) %>%
#--- Drop Extra Rows and Columns
filter(!NewRes) %>%
#drop_na(Property) %>%
select(-(Job:`Cost Code`), -Post, -NewRes) %>%
#--- Impute 'Balance Forward' with First Transaction Date
fill(Transaction, .direction = "up") %>%
#logically identify problem rows and
#replace nonmissing Tenant information that is not Resident with Resident
mutate(
Identified = case_when(Tenant != 'Resident' ~ 'Identified',
TRUE ~ Identified),
Tenant = case_when(Tenant != 'Resident' ~ 'Resident', TRUE ~
Tenant)
) %>%
#return new data
return(new_dat)
}
#apply function to list of buildings
my_buildings = rent_ledger %>% map(., clean_buildings)
my_tenants = rent_ledger %>% map_dfr(., clean_buildings)