main.py
from datetime import datetime
import json
from pathlib import Path
import shutil
import typing

import pandas as pd

POC: Path
"""Prepended to all of the configs ending with `_PATH`
to point to the correct input/output paths for each of the POCs."""

INPUT_BANK_ACCOUNTS_DATA_PATH: typing.Final = Path("./inputs/bankaccounts.json")
INPUT_EMPLOYEE_DATA_PATH: typing.Final = Path("./inputs/ee_data.json")
INPUT_PAYRUN_DATA_PATH: typing.Final = Path("./inputs/payrun_data.json")
INPUT_PAYRUN_MAPPING_PATH: typing.Final = Path("./mapping.json")
INPUT_PAYRUN_XLSX_PATH: typing.Final = Path("./inputs/payrun_file.xlsx")

OUT_DATE_FORMAT: typing.Final = "%d.%m.%Y"  # a two-digit '%y' would break work start dates at year 1900
OUT_TEMPLATE_PATH: typing.Final = Path("./template/TEST PAYROLL INSTRUCTIONS 12 2022 NEW Employee Form.xlsx")
OUT_RESULT_PATH: typing.Final = Path("./Employee Form.xlsx")


class OutLocation(typing.TypedDict):
    """The `sheet` and top-left cell coordinates where a given `DataFrame` is written in some Excel file."""

    sheet: str
    row: int
    col: int


OUT_EMPLOYEES_INFO_LOCATION: typing.Final[OutLocation] = {"sheet": "Employees", "row": 0, "col": 1}
OUT_EMPLOYEES_GREEN_LOCATION: typing.Final[OutLocation] = {"sheet": "Employees", "row": 5, "col": 0}
OUT_EMPLOYEES_RED_LOCATION: typing.Final[OutLocation] = {"sheet": "Employees", "row": 5, "col": 14}
OUT_EMPLOYEE_FORM_LOCATION: typing.Final[OutLocation] = {"sheet": "Employee form", "row": 2, "col": 0}
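
# NOTE: `row`/`col` are the 0-based `startrow`/`startcol` later passed to `DataFrame.to_excel`,
# so e.g. OUT_EMPLOYEES_RED_LOCATION ({"row": 5, "col": 14}) starts writing at cell O6
# of the "Employees" sheet.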


def read_bank_accounts(*, employee_ids: pd.Index, payrun_meta: dict[str, typing.Any]) -> pd.DataFrame:
    """Read the `bankaccounts.json` file."""

    def select_row_from_duplicates(rows: pd.DataFrame) -> pd.DataFrame:
        if len(rows) == 1:
            return rows
        # NOTE: this would also include other filters / business logic, e.g.:
        # - rows["bankCountry"] == payrun_meta["countryAbbr"]
        # - rows["primaryBankAccount"] == True
        # but the actual requirements are unknown, so only the IBAN check is applied here.
        rows = rows[~rows["iban"].isna()]
        return rows

    df_ba = pd.read_json(POC.joinpath(INPUT_BANK_ACCOUNTS_DATA_PATH)).set_index("employee")
    df_ba = df_ba[(df_ba.index.isin(employee_ids)) & (df_ba["company"] == payrun_meta["companyId"])]
    df_ba = df_ba.groupby(df_ba.index.name, sort=False).apply(select_row_from_duplicates).droplevel(level=0)
    return df_ba.reindex(employee_ids, axis="index").sort_index()
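
# An illustrative (hypothetical) duplicate case for `select_row_from_duplicates`: if employee 42
# has two bank accounts in `bankaccounts.json`, one with "iban": null and one with a filled-in IBAN,
# only the row with the non-null IBAN is kept; a single row per employee is passed through as-is.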


def read_employees(*, employee_ids: pd.Index, payrun_meta: dict[str, typing.Any]) -> pd.DataFrame:
    """Read the `ee_data.json` file."""
    # NOTE: with the provided test data these two filters don't change the result;
    # in practice more (or different) filters may be needed.

    def in_company(companies: pd.Series) -> pd.Series:
        return pd.Series(
            index=companies.index,
            data=[
                ref["id"] == payrun_meta["companyId"]
                for ref in companies
            ],
        )

    def in_country(countries: pd.Series) -> pd.Series:
        return pd.Series(
            index=countries.index,
            data=[
                # 'country_code' is not present in 'payrun_data.json', so match on name and abbreviation:
                ref["name"] == payrun_meta["country"] and ref["abbreviature"] == payrun_meta["countryAbbr"]
                for ref in countries
            ],
        )

    df_ee = pd.read_json(POC.joinpath(INPUT_EMPLOYEE_DATA_PATH)).set_index("id").filter(employee_ids, axis="index")
    df_ee = df_ee[
        (in_company(df_ee["companyRef"])) & (in_country(df_ee["countryRef"]))
    ]
    df_ee.index.set_names("id", inplace=True)
    return df_ee.sort_index()


def read_payrun_data() -> tuple[dict[str, typing.Any], pd.DataFrame, pd.DataFrame]:
    """Read the `payrun_data.json` file.

    :return: a `tuple` of three elements:
        - the payrun's metadata as a `dict`
        - the payrun's data as a `pd.DataFrame`
        - the payrun's global mappings as a `pd.DataFrame`
    """
    # TODO: this seems a non-optimal approach:
    with open(POC.joinpath(INPUT_PAYRUN_DATA_PATH), "r") as f:
        payrun_data: dict[str, typing.Any] = json.load(f)
    df_pr_data = pd.DataFrame(payrun_data.pop("payrun_data"))
    df_pr_map = pd.DataFrame(payrun_data.pop("mappings"))
    return payrun_data, df_pr_data, df_pr_map


def read_payrun_data_mapping() -> pd.DataFrame:
    """Read the `mapping.json` file.

    Create an empty scaffold if the file doesn't already exist and
    proceed with an implicitly empty `globalName` (e.g. `null`, `""`) for all of the `templateName`s.
    Filling in some/all of the mappings in the generated file and re-running this script will fill the output columns.

    Mappings for `templateName`s that don't exist as columns in the current template are completely ignored.
    Duplicate mappings, each defining a different `globalName` for the same (existing) `templateName`, are allowed,
    but only the `globalName` of the last mapping parsed from the file is used.
    Missing mappings for one or more `templateName`s implicitly specify an empty `globalName` (e.g. `null`, `""`)
    for that `templateName` and will just show up in the result as `None` values. Mappings explicitly specifying
    an empty `globalName` (e.g. `null`, `""`) show up in the result the same way.
    """
    mapping_path = POC.joinpath(INPUT_PAYRUN_MAPPING_PATH)
    col_names = pd.read_excel(
        POC.joinpath(OUT_TEMPLATE_PATH),
        sheet_name=OUT_EMPLOYEES_INFO_LOCATION["sheet"],
        # TODO: the following three args could probably be more dynamic:
        usecols="O:Y",
        header=4,
        nrows=1,
    ).columns
    if mapping_path.exists():
        df_pr_map = (
            pd.read_json(mapping_path)
            .drop_duplicates(subset="templateName", keep="last")
            .set_index("templateName")
            .filter(col_names, axis="index")
        )
    else:
        template = [{"globalName": None, "templateName": col} for col in col_names]
        with open(mapping_path, "w") as f:
            json.dump(template, f, indent=4)
        df_pr_map = pd.DataFrame(template)
    # TODO: this calculation could also be more dynamic:
    df_pr_map["colPos"] = range(ord("O") - ord("A"), ord("Y") - ord("A") + 1)
    return df_pr_map
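
# An illustrative (assumed) example of a filled-in `mapping.json` as parsed by this function;
# the `templateName` values below are hypothetical column headers, not taken from the real template:
#
#     [
#         {"globalName": null, "templateName": "Base Salary"},
#         {"globalName": "BASE_PAY", "templateName": "Base Salary"},
#         {"globalName": "BONUS", "templateName": "Bonus"}
#     ]
#
# Per the docstring above, the duplicate "Base Salary" mapping resolves to "BASE_PAY" (the last one wins),
# and any entry whose `templateName` is not a template column is ignored.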


def read_payrun_excel() -> tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
    """Read the 4 tabs/pages of the `payrun_file.xlsx` file, each in its own `DataFrame`."""
    pr_xls_file = pd.ExcelFile(POC.joinpath(INPUT_PAYRUN_XLSX_PATH))
    main = pd.read_excel(pr_xls_file, "Payrun file", header=1)
    main = main.loc[~main["System Employee ID"].isna()].set_index("System Employee ID")
    changes = pd.read_excel(pr_xls_file, "P&T Changes", header=1)
    notes = pd.read_excel(pr_xls_file, "Notes", header=0)
    updates = pd.read_excel(pr_xls_file, "General updates", header=0)
    return main.sort_index(), changes, notes, updates


def create_out_employees_info(payrun_meta: dict[str, typing.Any]) -> pd.DataFrame:
    """Data for the top-left cells (column 'B') in the `Employees` sheet in the Excel output."""
    keys = ("company", "payDate", "month")
    values = []
    for k in keys:
        v = payrun_meta.get(k)
        if v is None:
            raise KeyError(f"Could not find required key '{k}' in file '{POC.joinpath(INPUT_PAYRUN_DATA_PATH)}'")
        values.append(v)
    values[1] = datetime.strptime(payrun_meta["payDate"], "%d/%b/%Y").strftime(OUT_DATE_FORMAT)
    return pd.DataFrame(values)
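
# For example (hypothetical value): a "payDate" of "25/Dec/2022" in `payrun_data.json`
# is re-formatted via OUT_DATE_FORMAT to "25.12.2022" in the output cell.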


def create_out_employees_green(df_pr_xls_main: pd.DataFrame, df_ee: pd.DataFrame, df_ba: pd.DataFrame) -> pd.DataFrame:
    """Data for the green columns in the `Employees` sheet in the Excel output."""
    df = pd.DataFrame([
        df_pr_xls_main.index,
        df_ee["fullname"],
        df_ee["position"],
        df_pr_xls_main["Department"],  # or from df_ee["department"], or df_ee["department"] as a fallback?
        df_ba["accountNumber"],
        df_ba["iban"],
        df_ee["workType"],  # 'Percentage of employment'?
        df_pr_xls_main["Annual Salary"],
        pd.to_datetime(df_pr_xls_main["Effective Date"]).dt.strftime(OUT_DATE_FORMAT),  # 'Start date for new yearly salary'?
        df_pr_xls_main.filter(like="Car Allowance", axis=1).iloc[:, 0],
        df_pr_xls_main.filter(like="Car Value", axis=1).iloc[:, 0],
        df_ee["rgNumber"],  # 'Registration Number'? - couldn't find anything else
        df_ee["startDate"].apply(lambda x:
            None if x["date"] is None
            else datetime.fromisoformat(x["date"]).date().strftime(OUT_DATE_FORMAT)  # validate 'not after current payrun'?
        ),
        pd.Series([None] * len(df_pr_xls_main.index)),  # unclear what the 'Comments' column is supposed to hold
    ]).T.set_axis(df_pr_xls_main.index, axis="index")
    return df.sort_index()


def create_out_employees_red(
    df_pr_xls_main: pd.DataFrame,
    df_pr_data: pd.DataFrame,
    df_pr_map: pd.DataFrame,
    df_pr_map_local: pd.DataFrame,
) -> pd.DataFrame:
    """Data for the red columns in the `Employees` sheet in the Excel output."""
    # NOTE: this will probably need to parse 'df_pr_data["elements"]' into a separate DataFrame
    #       to read the fields for each employee.
    # NOTE: is the global mapping "df_pr_map" actually needed here? What for?
    return pd.DataFrame()


def create_out_employee_form() -> pd.DataFrame:
    """Data for the `Employee form` sheet in the Excel output."""
    # NOTE: the intended logic for this sheet is unclear.
    # instructions.txt says 'payrun_data.xlsx' - if that is supposed to be:
    # - 'payrun_data.json' -> where are changes of any kind specified in it?
    # - 'payrun_file.xlsx' -> assuming the 'P&T Changes' sheet should be used,
    #   how exactly are the 'Pay elements' changes in it supposed to be the core of what is written here?
    return pd.DataFrame()


def get_excel_writer() -> pd.ExcelWriter:
    """A write handle for the file specified by `OUT_RESULT_PATH`.

    Creates the initial scaffold by copying the file specified by `OUT_TEMPLATE_PATH`.
    """
    dst = POC.joinpath(OUT_RESULT_PATH)
    shutil.copyfile(POC.joinpath(OUT_TEMPLATE_PATH), dst)
    return pd.ExcelWriter(
        dst,
        engine="openpyxl",
        mode="a",
        if_sheet_exists="overlay",
        engine_kwargs={
            # "keep_vba": True,
            # "rich_text": True,
        },
    )


def write_to_excel(w: pd.ExcelWriter, df: pd.DataFrame, loc: OutLocation) -> None:
    """Write `df` to `loc`, using `w`."""
    df.to_excel(
        w,
        index=False,
        header=False,
        sheet_name=loc["sheet"],
        startrow=loc["row"],
        startcol=loc["col"],
    )


def do_poc():
    df_pr_xls_main, _, _, _ = read_payrun_excel()
    payrun_meta, df_pr_data, df_pr_map = read_payrun_data()
    df_pr_map_local = read_payrun_data_mapping()
    df_ba = read_bank_accounts(
        employee_ids=df_pr_xls_main.index,
        payrun_meta=payrun_meta,
    )
    df_ee = read_employees(
        employee_ids=df_pr_xls_main.index,
        payrun_meta=payrun_meta,
    )

    out_employees_info = create_out_employees_info(payrun_meta)
    out_employees_green = create_out_employees_green(df_pr_xls_main, df_ee, df_ba)
    out_employees_red = create_out_employees_red(df_pr_xls_main, df_pr_data, df_pr_map, df_pr_map_local)
    out_employee_form = create_out_employee_form()

    with get_excel_writer() as w:
        write_to_excel(w, out_employees_info, OUT_EMPLOYEES_INFO_LOCATION)
        write_to_excel(w, out_employees_green, OUT_EMPLOYEES_GREEN_LOCATION)
        write_to_excel(w, out_employees_red, OUT_EMPLOYEES_RED_LOCATION)
        write_to_excel(w, out_employee_form, OUT_EMPLOYEE_FORM_LOCATION)


if __name__ == "__main__":
    # NOTE: the "special logic for the column Position" for "poc_2" is not done.
    # for p in ("./poc_1", "./poc_2"):
    for p in ("./poc_1",):
        POC = Path(p)
        do_poc()
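
# Assumed on-disk layout for a single POC directory, derived from the `*_PATH` constants above
# (actual file names/locations in the real data set may differ):
#
#     poc_1/
#         mapping.json                  (generated on the first run if missing)
#         inputs/
#             bankaccounts.json
#             ee_data.json
#             payrun_data.json
#             payrun_file.xlsx
#         template/
#             TEST PAYROLL INSTRUCTIONS 12 2022 NEW Employee Form.xlsx
#         Employee Form.xlsx            (written by this script)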