Skip to content

Commit

Permalink
First commit.
Browse files Browse the repository at this point in the history
  • Loading branch information
l0o0 committed Mar 12, 2018
0 parents commit cdff00e
Show file tree
Hide file tree
Showing 9 changed files with 319 additions and 0 deletions.
3 changes: 3 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
*.pyc
/dist/
/*.egg-info
21 changes: 21 additions & 0 deletions LICENSE
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
MIT License

Copyright (c) 2018 Linxzh

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
1 change: 1 addition & 0 deletions MANIFEST.in
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
include README.rst
Empty file added README.rst
Empty file.
5 changes: 5 additions & 0 deletions rosaceae/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
# -*- coding: utf-8 -*-

from . import bins
from . import scorecard
from . import utils
89 changes: 89 additions & 0 deletions rosaceae/bins.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
# -*- coding: UTF-8 -*-
"""
rosaceae.bins
~~~~~~~~~~~~~
This module implements data binning.
"""


import numpy as np
import pandas as pd



def bin_var(xarray, border=None):
    '''Split a numeric variable into consecutive half-open bins.

    Args:
        xarray: the values to bin. When `border` is not supplied this must
            offer `.describe()` (e.g. a pandas.Series); with an explicit
            `border` any numpy-comparable array works.
        border: optional ascending sequence of bin borders. When omitted or
            empty, borders are derived from the median and a step based on
            the inter-quartile range or the standard deviation.

    Returns:
        dict mapping a 'low,high' label to the numpy array of positions
        whose value v satisfies low <= v < high. The first label is
        '-inf,<border[0]>' and the last is '<border[-1]>,inf'.

    Note:
        The first bin only collects values >= 0 although it is labelled
        '-inf'; negative values therefore end up in no bin.
    '''
    # `border is None or len(...) == 0` instead of `not border` so that a
    # numpy array can be passed as border without a truth-value error.
    if border is None or len(border) == 0:
        des = xarray.describe()
        print(des)
        if des['75%'] < 7:
            # Narrow value range: use a third of the inter-quartile range.
            step = (des['75%'] - des['25%']) / 3
        else:
            # Wide value range: half a standard deviation, truncated to int.
            step = int(des['std'] / 2)
        step = np.round(step, 3)
        # Six candidate borders spread around the median.
        border = [des['50%'] + (i - 3) * step for i in range(6)]
        border = [b for b in border if b >= 0]
        if len(border) != 6:
            # NOTE(review): this pads the list up to five borders, not six;
            # kept as in the original -- confirm the intended bin count.
            added = [border[-1] + i * step for i in range(1, 6 - len(border))]
            border.extend(added)
        print('border:%s, step: %s' % (border, step))
    else:
        print('border:%s, step: Set' % (border, ))

    out = {}
    for i, upper in enumerate(border):
        if i == 0:
            key = '-inf,%s' % upper
            mask = np.logical_and(xarray >= 0, xarray < upper)
        else:
            lower = border[i - 1]
            key = '%s,%s' % (lower, upper)
            mask = np.logical_and(xarray >= lower, xarray < upper)
        out[key] = np.where(mask)[0]
        print('%s %s %s' % (i, upper, key))
    # Open-ended last bin: everything at or above the last border.
    out['%s,inf' % border[-1]] = np.where(xarray >= border[-1])[0]

    return out


# Bin a variable according to its quantile values.
def bin_quantile(xarray, border=None):
    '''Split a variable into bins whose borders are data quantiles.

    Args:
        xarray: pandas.Series of numeric values (`.unique()` is required
            when `border` is not supplied).
        border: optional ascending sequence of bin borders. When omitted or
            empty the borders are the sorted distinct values if there are
            fewer than 7 of them, otherwise the 5th, 20th, 50th, 80th and
            95th percentiles.

    Returns:
        dict mapping a 'low,high' label to the numpy array of positions
        whose value v satisfies low <= v < high. The first label is
        '-inf,<border[0]>' and the last is '<border[-1]>,inf'.

    Note:
        The first bin only collects values >= 0 although it is labelled
        '-inf'.
    '''
    if border is None or len(border) == 0:
        distinct = xarray.unique()
        if len(distinct) < 7:
            border = sorted(distinct.tolist())
        else:
            # np.percentile expects q on a 0-100 scale, not a 0-1 fraction.
            border = [np.percentile(xarray, q) for q in (5, 20, 50, 80, 95)]

    out = {}
    for i, upper in enumerate(border):
        if i == 0:
            key = '-inf,%s' % upper
            mask = np.logical_and(xarray >= 0, xarray < upper)
        else:
            lower = border[i - 1]
            key = '%s,%s' % (lower, upper)
            mask = np.logical_and(xarray >= lower, xarray < upper)
        out[key] = np.where(mask)[0]
    # Open-ended last bin: everything at or above the last border.
    out['%s,inf' % border[-1]] = np.where(xarray >= border[-1])[0]
    return out



def bin_scatter(xarray, border=None):
    '''Group positions of a discrete variable by exact value.

    Args:
        xarray: array-like of values (numpy array or pandas.Series).
        border: optional sequence of the values to collect. When omitted or
            empty, every distinct value of `xarray` is used, in sorted order.

    Returns:
        dict mapping each value to the numpy array of positions where
        `xarray` equals that value.
    '''
    # Explicit None/empty test: `not border` raises for numpy arrays.
    if border is None or len(border) == 0:
        border = sorted(set(xarray))
    out = {}
    for value in border:
        out[value] = np.where(xarray == value)[0]
    return out
81 changes: 81 additions & 0 deletions rosaceae/scorecard.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
# -*- coding: utf-8 -*-
'''
rosaceae.scorecard
~~~~~~~~~~~~~~~~~~
This module provides functions for credit risk scorecard.
'''

from math import log, e


def getWOE(c, y):
    '''Calculate the WOE (weight of evidence) value of every bin.

    A label of 1 (any non-zero value) indicates a good case, 0 a bad case.
    For each bin:

        WOE = ln((bad_in_bin / total_bad) / (good_in_bin / total_good))

    Args:
        c: dictionary, result of a bin function; maps a bin label to the
            positions (integer array) of the samples in that bin.
        y: pandas.Series or numpy.array, label of every sample.

    Returns:
        dict mapping bin label to its WOE value. Bins that contain no good
        or no bad case are skipped because their WOE is undefined.
    '''
    # numpy is not imported at module level in scorecard.py.
    import numpy as np

    # Bin members are positions (from np.where), so index positionally
    # rather than by a Series' index labels.
    labels = np.asarray(y)
    totalgood = np.count_nonzero(labels)
    totalbad = len(labels) - totalgood

    out = {}
    for k in c:
        region = labels[c[k]]
        good = np.count_nonzero(region)
        bad = len(region) - good
        if bad == 0 or good == 0:
            continue
        out[k] = log((float(bad) / totalbad) / (float(good) / totalgood))
    return out


def get_constant(theta, pdo, basescore):
    '''Calculate Shift and Slope.

    The score of an individual i is given by the formula:

        Score(i) = Shift + Slope*(b0 + b1*WOE1(i) + b2*WOE2(i) + ... + bp*WOEp(i))

    where bj is the coefficient of the j-th variable in the model,
    and WOEj(i) is the Weight of Evidence (WOE) value for the
    i-th individual corresponding to the j-th model variable.

    In short:

        Score       = Shift + Slope*ln(Good/Bad)
        Score + PDO = Shift + Slope*ln(2 * Good/Bad)

    hence Slope = PDO / ln(2) and Shift = Score - Slope*ln(Good/Bad).

    Args:
        theta: the ratio of Good/Bad. If the good ratio is p, the bad ratio
            is (1-p) and theta = p/(1-p).
        pdo: Points to Double the Odds. When the odds double, the score
            increases by pdo.
        basescore: the score assigned when the Good/Bad ratio equals theta.

    Returns:
        tuple (shift, slope).
    '''
    slope = pdo / log(2)
    shift = basescore - slope * log(float(theta))
    return (shift, slope)


def getScore(woe_table, xarray):
    '''Sum the WOE values of the bins that one sample's values fall into.

    Args:
        woe_table: dict mapping a variable name to a dict of
            {'low,high': woe}; 'low' and 'high' may be '-inf' / 'inf'.
            The legacy 'low:high' spelling is accepted as well.
        xarray: pandas.Series holding one sample, indexed by variable name.
            The first two entries are skipped (presumably an id and the
            label -- TODO confirm against callers). Missing values are
            treated as 0.

    Returns:
        The sum of the matching WOE values; a variable whose value falls
        into no bin contributes nothing.
    '''
    # Work on a filled copy so the caller's Series is not modified.
    filled = xarray.fillna(0)
    score = 0
    for idx in filled.index[2:]:
        value = filled[idx]
        tmp_woe = woe_table[idx]
        for k in tmp_woe:
            # The bin functions emit 'low,high' labels; tolerate ':' too.
            parts = k.replace(':', ',').split(',')
            lower, upper = float(parts[0]), float(parts[1])
            if lower <= value < upper:
                score += tmp_woe[k]
                break
    return score
100 changes: 100 additions & 0 deletions rosaceae/utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
# -*- coding: utf-8 -*-
"""
rosaceae.utils
~~~~~~~~~~~~~~
This module provides utility functions that are used within Rosaceae,
including visualization and summary functions.
"""

import seaborn as sns

from itertools import combinations


# Exhaustively evaluate combinations of model variables and collect the
# results in a DataFrame.
def model_selecter(x_train, x_test, y_train, y_test, start=1, end=None, verbose=False):
    '''Fit a logistic regression on every combination of columns.

    For every combination size n in [start, end] and every n-column
    combination, a model is fitted on the training data and its ROC AUC is
    computed on both the training and the test data.

    Args:
        x_train, x_test: pandas.DataFrame with the same (string) column names.
        y_train, y_test: labels matching x_train / x_test.
        start: smallest combination size to try.
        end: largest combination size to try; defaults to the number of
            columns of x_train when falsy.
        verbose: when true print one tab-separated line per combination,
            otherwise only print each combination size as it is started.

    Returns:
        pandas.DataFrame with columns 'Var_No', 'Vars', 'train_score',
        'test_score', 'coef' and 'inter', one row per combination.
    '''
    # pandas is not imported at module level in utils.py.
    import pandas as pd
    # NOTE(review): LogisticRegression and roc_auc_score are not imported
    # anywhere in this module either; they presumably come from
    # sklearn.linear_model and sklearn.metrics -- the module-level imports
    # still have to be added.

    result_df = pd.DataFrame(columns=['Var_No', 'Vars', 'train_score', 'test_score', 'coef', 'inter'])
    if not end:
        end = x_train.shape[1]
    cols = x_train.columns
    for n in range(start, end + 1):
        if not verbose:
            print(n)

        for t in combinations(cols, n):
            tmp_train = x_train.loc[:, x_train.columns.isin(t)]
            tmp_test = x_test.loc[:, x_test.columns.isin(t)]
            clf = LogisticRegression(random_state=0)
            clf.fit(tmp_train, y_train)
            train_roc_score = roc_auc_score(y_train, clf.decision_function(tmp_train))
            test_roc_score = roc_auc_score(y_test, clf.decision_function(tmp_test))
            if verbose:
                print("%s\t%s\t%s\t%s" % (n, ','.join(t), train_roc_score, test_roc_score))

            # Rows are numbered 0, 1, 2, ... so the next label is the length.
            row = len(result_df)
            result_df.loc[row] = [n, ','.join(t), train_roc_score, test_roc_score, clf.coef_, clf.intercept_]

    return result_df


#####################################################################
# visualization functions
#####################################################################
# TODO(l0o0): KS plot is needed.

def bin_plot(out):
    '''Draw a bar plot of the number of samples in each bin.

    Args:
        out: dict returned by a bin function, mapping a bin label
            ('low,high' or a plain value) to the positions in that bin.

    Returns:
        The object returned by `sns.barplot`, with its x tick labels
        rotated by 30 degrees.
    '''
    # pandas is not imported at module level in utils.py.
    import pandas as pd

    # Order the bins by their numeric lower border ('-inf' sorts first).
    ordered = sorted(out.keys(), key=lambda x: float(str(x).split(',')[0]))
    df = pd.DataFrame([(k, len(out[k])) for k in ordered], columns=['Range', 'Number'])
    print(df)
    p = sns.barplot(x='Range', y='Number', data=df)
    p.set_xticklabels(p.get_xticklabels(), rotation=30)
    return p


# Plot the WOE values computed for the bins of each feature.
def woe_plot(fea_woe):
    '''Draw a bar plot of the WOE values of every feature.

    Args:
        fea_woe: dict mapping a feature name to a dict of {bin label: woe},
            where a bin label is 'low,high' or a plain value.

    Returns:
        The object returned by the last `sns.barplot` call, or None when
        `fea_woe` is empty.
    '''
    p = None  # stays None when there is nothing to plot
    for f in fea_woe:
        # Order the bins by their numeric lower border ('-inf' sorts first).
        pairs = sorted(fea_woe[f].items(), key=lambda kv: float(str(kv[0]).split(',')[0]))
        x = [kv[0] for kv in pairs]
        y = [kv[1] for kv in pairs]
        print(f)
        p = sns.barplot(x=x, y=y)
        p.set_xticklabels(p.get_xticklabels(), rotation=30)
    return p


#####################################################################
# summary function
#####################################################################
# TODO: feature importance and IV

def frequent_table(xarray, label, steps):
    '''Build a frequency table of a variable over consecutive intervals.

    Args:
        xarray: pandas.Series of numeric values.
        label: pandas.Series of class labels aligned with `xarray`.
        steps: ascending sequence of interval borders; every pair of
            neighbouring borders (a, b) forms the half-open interval
            a <= x < b, so values equal to the last border are not counted.

    Returns:
        pandas.DataFrame with one row per interval and the columns 'Bins',
        'Percent', 'Cumulative_percent', 'Counts', 'Cumulative_Counts',
        followed by one count column per distinct label value (sorted).
    '''
    # pandas is not imported at module level in utils.py.
    import pandas as pd

    # Sorted so the column order does not depend on set iteration order.
    label_values = sorted(set(label))
    cols = ['Bins', 'Percent', 'Cumulative_percent', 'Counts', 'Cumulative_Counts']
    cols.extend(label_values)
    fre_df = pd.DataFrame(columns=cols)

    total_length = float(len(xarray))
    sum_length = 0

    for i, border in enumerate(zip(steps[:-1], steps[1:])):
        value_idx = (xarray >= border[0]) & (xarray < border[1])
        tmp_length = len(xarray[value_idx])
        sum_length += tmp_length
        label_counts = label[value_idx].value_counts()
        label_counts_dict = dict(zip(label_counts.index, label_counts))
        row = [str(border),
               "%f%%" % (tmp_length / total_length * 100),
               "%f%%" % (sum_length / total_length * 100),
               tmp_length,
               sum_length]
        # One count per label value, so any number of classes is supported.
        row.extend(label_counts_dict.get(value, 0) for value in label_values)

        fre_df.loc[i] = row
    return fre_df
19 changes: 19 additions & 0 deletions setup.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,19 @@
from setuptools import setup


def readme():
    """Return the text of the README.rst that sits next to this file.

    The path is resolved relative to this file rather than the current
    working directory, and the file is decoded as UTF-8, so the result does
    not depend on where the script is run from or on the platform's default
    encoding.
    """
    import io
    import os

    here = os.path.dirname(os.path.abspath(__file__))
    # io.open accepts `encoding` on both Python 2 and Python 3.
    with io.open(os.path.join(here, 'README.rst'), encoding='utf-8') as handle:
        return handle.read()

# Package metadata handed to setuptools; the long description is the text
# returned by readme().
setup(
    name='rosaceae',
    version='0.0.1',
    description='Python package for credit risk scorecards',
    long_description=readme(),
    author='l0o0',
    author_email='[email protected]',
    license='MIT',
    keywords=['scorecards', 'woe'],
    url='',
    install_requires=['numpy', 'pandas', 'seaborn'],
)

0 comments on commit cdff00e

Please sign in to comment.