-
Notifications
You must be signed in to change notification settings - Fork 7
/
Copy pathfico.py
78 lines (60 loc) · 2.18 KB
/
fico.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
"""Reading FICO Data"""
from __future__ import print_function

import os

import numpy as np
import pandas as pd
# Directory prefix the loaders use when the caller does not pass data_dir.
DATA_DIR = '../'

# Names of the CSV files, relative to the data directory.
PERF = 'transrisk_performance_by_race_ssa.csv'
CDF_BY_RACE = 'transrisk_cdf_by_race_ssa.csv'
OVERALL = 'totals.csv'

# Logical table name -> CSV file name.
FILES = {
    'cdf_by_race': CDF_BY_RACE,
    'performance_by_race': PERF,
    'overview': OVERALL,
}
def cleanup_frame(frame):
    """Return a copy of ``frame`` restricted to the four race columns.

    The ``'Non- Hispanic white'`` column is renamed to ``'White'`` and the
    result is reindexed to the column order Asian, White, Hispanic, Black.
    Columns outside that list are dropped by the reindex; listed columns
    that are absent from ``frame`` appear filled with NaN.  The input frame
    itself is not modified.
    """
    column_order = ['Asian', 'White', 'Hispanic', 'Black']
    renamed = frame.rename(columns={'Non- Hispanic white': 'White'})
    return renamed.reindex(columns=column_order)
def read_totals(data_dir=DATA_DIR):
    """Read the total number of people of each race.

    Args:
        data_dir: Directory containing the ``FILES['overview']`` CSV file.

    Returns:
        A dict mapping each column name produced by ``cleanup_frame``
        (Asian, White, Hispanic, Black) to the value found in that column
        on the row labelled ``'SSA'``.

    Raises:
        KeyError: If the CSV file has no row labelled ``'SSA'``.
    """
    # os.path.join keeps the path valid whether or not data_dir ends with a
    # separator (plain string concatenation required the trailing slash).
    path = os.path.join(data_dir, FILES['overview'])
    # DataFrame.from_csv was removed in pandas 1.0.  read_csv with
    # index_col=0 keeps the first column as the row labels; from_csv's
    # implicit date parsing is not reproduced because the rows are looked
    # up by the plain string label 'SSA'.
    frame = cleanup_frame(pd.read_csv(path, index_col=0))
    return {race: frame[race]['SSA'] for race in frame.columns}
def convert_percentiles(idx):
    """Convert percentiles to scores on the 300-850 scale.

    Each percentile is located in a fixed table of score bins and mapped to
    a score by linear interpolation inside the bin it falls in.

    Args:
        idx: Iterable of numeric percentiles (0-100).

    Returns:
        A numpy array with one score per element of ``idx``.  Percentiles
        above the table's accumulated total map to the top score, 850.
    """
    # (score at the lower edge of the bin, percentage of mass in the bin).
    # The final entry only supplies the upper edge of the last real bin.
    pdf = [(300, 2.1),
           (350, 4.2),
           (400, 5.4),
           (450, 6.5),
           (500, 7.9),
           (550, 9.6),
           (600, 12.0),
           (650, 13.8),
           (700, 17.0),
           (750, 15.8),
           (800, 5.7),
           (850, 0),
           ]

    def convert_one(x):
        """Interpolate the score for a single percentile ``x``."""
        partial = 0
        for (v, s), (v2, _) in zip(pdf, pdf[1:]):
            if partial + s >= x:
                return v + (v2 - v) * (x - partial) / s
            partial += s
        # No bin matched: x is above the accumulated total of the table.
        # That can happen for x == 100 purely through floating-point
        # rounding of the running sum; without this return the function
        # would yield None and the result array would get dtype object.
        return float(pdf[-1][0])

    return np.array([convert_one(x) for x in idx])
def parse_data(data_dir=DATA_DIR, filenames=None):
    """Read the per-race CDF and performance tables from CSV.

    Args:
        data_dir: Directory containing the CSV files.
        filenames: Optional pair ``[cdf_file, performance_file]`` of file
            names inside ``data_dir``.  Defaults to the ``cdf_by_race`` and
            ``performance_by_race`` entries of ``FILES``.

    Returns:
        A tuple ``(cdfs, performance)`` of DataFrames whose columns are the
        ones produced by ``cleanup_frame``.  ``cdfs`` is the first table
        divided by 100.  ``performance`` is ``100 - table`` for the second
        table, also divided by 100 (presumably turning percentages into
        fractions -- confirm against the CSV contents).
    """
    if filenames is None:
        filenames = [FILES['cdf_by_race'], FILES['performance_by_race']]
    # os.path.join keeps the paths valid whether or not data_dir ends with
    # a separator (plain string concatenation required the trailing slash).
    cdf_path = os.path.join(data_dir, filenames[0])
    performance_path = os.path.join(data_dir, filenames[1])
    # DataFrame.from_csv was removed in pandas 1.0.  read_csv with
    # index_col=0 keeps the first column as the row index; from_csv's
    # implicit date parsing is not reproduced because get_FICO_data feeds
    # this index to convert_percentiles, which compares it numerically.
    cdfs = cleanup_frame(pd.read_csv(cdf_path, index_col=0))
    performance = 100 - cleanup_frame(
        pd.read_csv(performance_path, index_col=0))
    return (cdfs / 100., performance / 100.)
def get_FICO_data(data_dir=DATA_DIR, do_convert_percentiles=True):
    """Load the CDF table, the performance table and the per-race totals.

    Args:
        data_dir: Directory handed to ``parse_data`` and ``read_totals``.
        do_convert_percentiles: When true, the index of both tables is
            replaced by ``convert_percentiles`` applied to that index.

    Returns:
        A tuple ``(cdfs, performance, totals)``: the two DataFrames from
        ``parse_data`` and the dict from ``read_totals``.
    """
    cdfs, performance = parse_data(data_dir)
    totals = read_totals(data_dir)

    if not do_convert_percentiles:
        return cdfs, performance, totals

    # Re-label the rows of both tables; each table's own index is converted.
    for table in (cdfs, performance):
        table.index = convert_percentiles(table.index)
    return cdfs, performance, totals