diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..737cce8 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.pyc +/dist/ +/*.egg-info diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..bae9605 --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Linxzh + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..9561fb1 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include README.rst diff --git a/README.rst b/README.rst new file mode 100644 index 0000000..e69de29 diff --git a/rosaceae/__init__.py b/rosaceae/__init__.py new file mode 100644 index 0000000..073138a --- /dev/null +++ b/rosaceae/__init__.py @@ -0,0 +1,5 @@ +# -*- coding: utf-8 -*- + +from . import bins +from . import scorecard +from . 
# -*- coding: UTF-8 -*-
"""
rosaceae.bin
~~~~~~~~~~~~

This module implements data binning.

Every binning function returns a dict that maps a bin key to a numpy array
holding the *positions* (0-based offsets, not index labels) of the values
that fall into that bin.
"""

import numpy as np
import pandas as pd


def _bins_from_border(xarray, border):
    """Split `xarray` into half-open intervals delimited by `border`.

    Keys are strings of the form ``'lower,upper'``; the first key starts with
    ``-inf`` and the last one ends with ``inf``.

    NOTE(review): the comparison operators of the interval loop were garbled
    in the source this was restored from. The ``[lower, upper)`` form and the
    ``'%s,inf'`` key of the last bin are reconstructed from what remained
    (``xarray>=0``, ``=border[i-1]``, ``=border[-1]``) -- please confirm.

    Args:
        xarray: 1-D array-like of numbers.
        border: non-empty, ascending sequence of cut points.

    Returns:
        dict mapping bin key -> numpy array of positions.

    Raises:
        ValueError: if `border` is empty.
    """
    if len(border) == 0:
        raise ValueError('border must contain at least one value')
    values = np.asarray(xarray)

    out = {}
    first = border[0]
    # The first bin is labelled '-inf' but only collects values >= 0, exactly
    # as the source did: negative values end up in no bin at all.
    out['-inf,%s' % first] = np.where((values >= 0) & (values < first))[0]
    for lower, upper in zip(border[:-1], border[1:]):
        key = '%s,%s' % (lower, upper)
        out[key] = np.where((values >= lower) & (values < upper))[0]
    last = border[-1]
    out['%s,inf' % last] = np.where(values >= last)[0]
    return out


def bin_var(xarray, border=None):
    '''Bin a numeric variable, deriving the borders when none are given.

    Without `border`, six equally spaced borders are centred around the
    median. The step is a third of the inter-quartile range when the 75%
    quantile is below 7, otherwise half of the standard deviation. Negative
    borders are dropped and the list is topped up to six again at the upper
    end.

    Args:
        -xarray : a pandas Series or numpy array of numbers
        -border : optional list of borders; used as-is when given

    Returns:
        dict mapping 'lower,upper' keys to arrays of positions.
    '''
    if border is None or len(border) == 0:
        des = pd.Series(xarray).describe()
        print(des)
        if des['75%'] < 7:
            raw_step = (des['75%'] - des['25%']) / 3
        else:
            raw_step = des['std'] / 2
        step = int(raw_step)
        if step == 0:
            # Truncation would make all six borders identical; fall back to
            # the 3-decimal rounding the original applied after int().
            step = float(np.round(raw_step, 3))
        median = des['50%']
        border = [float(median + (i - 3) * step) for i in range(6)]
        border = [b for b in border if b >= 0]
        if len(border) != 6:
            # The original used range(1, 6 - len(border)), which stops one
            # border short of the six the check above asks for.
            border.extend([border[-1] + i * step
                           for i in range(1, 7 - len(border))])
        print('border:%s, step: %s' % (border, step))
    else:
        print('border:%s, step: Set' % (border,))
    return _bins_from_border(xarray, border)


# Bin according to the quantiles of the data.
def bin_quantile(xarray, border=None):
    '''Bin a numeric variable at its 5/20/50/80/95 percentiles.

    When the data has fewer than seven distinct values, those values
    themselves (sorted) are used as borders instead.

    Args:
        -xarray : a pandas Series or numpy array of numbers
        -border : optional list of borders; when given it is used as-is
                  (the original silently ignored this argument)

    Returns:
        dict mapping 'lower,upper' keys to arrays of positions.
    '''
    if border is None:
        series = pd.Series(xarray)
        distinct = series.unique()
        if len(distinct) < 7:
            border = sorted(distinct.tolist())
        else:
            # np.percentile expects percents in [0, 100]; the original passed
            # the fractions 0.05 ... 0.95 and therefore cut at the 0.05th ...
            # 0.95th percentile.
            border = [float(np.percentile(series, q))
                      for q in (5, 20, 50, 80, 95)]
    return _bins_from_border(xarray, border)


def bin_scatter(xarray, border=None):
    '''Put every distinct value into its own bin.

    Args:
        -xarray : a pandas Series or numpy array
        -border : optional list of values to bin on; defaults to the sorted
                  distinct values of `xarray`

    Returns:
        dict mapping each value to the array of positions where it occurs.
    '''
    if border is None or len(border) == 0:
        border = sorted(set(xarray))
    values = np.asarray(xarray)
    return {value: np.where(values == value)[0] for value in border}
# -*- coding: utf-8 -*-
'''
rosaceae.scorecard
~~~~~~~~~~~~~~~~~~

This module provides functions for credit risk scorecard.
'''

from math import log, e
from itertools import combinations

import numpy as np
import pandas as pd

try:
    import seaborn as sns
except ImportError:
    # Only the plotting helpers of rosaceae.utils need seaborn; scoring and
    # model selection stay importable without it.
    sns = None


def getWOE(c, y):
    '''Calculate the WOE (weight of evidence) of every bin.

    1 (any non-zero label) indicates a good case, 0 indicates a bad case.
    For a bin k:

        WOE(k) = ln((bad_k / total_bad) / (good_k / total_good))

    Bins without any good case or without any bad case are left out of the
    result, because the logarithm is undefined for them.

    NOTE(review): the original referenced undefined totals `b` and `g`, and
    counted non-zero labels as "bad" inside a bin while counting them as
    "good" in the totals. This version follows the docstring convention
    (non-zero = good) in both places -- confirm this is the intended sign.

    Args:
        -c : dictionary, result of a bin function (key -> positions).
        -y : pandas.Series or numpy.array, label.

    Returns:
        dict mapping bin key -> WOE value (float).
    '''
    # Bin functions return positions, so index positionally rather than by
    # the Series' index labels.
    labels = np.asarray(y)
    totalgood = np.count_nonzero(labels)
    totalbad = len(labels) - totalgood

    out = {}
    for k in c:
        region = labels[c[k]]
        good = np.count_nonzero(region)
        bad = len(region) - good
        if bad == 0 or good == 0:
            continue
        out[k] = log((float(bad) / totalbad) / (float(good) / totalgood))
    return out


def get_constant(theta, pdo, basescore):
    '''Calculate Shift and Slope.

    The score of an individual i is given by the formula:

       Score(i) = Shift + Slope*(b0 + b1*WOE1(i) + b2*WOE2(i)+ ... +bp*WOEp(i))

    where bj is the coefficient of the j-th variable in the model,
    and WOEj(i) is the Weight of Evidence (WOE) value for the
    i-th individual corresponding to the j-th model variable.

    In short formula:

       Score = Shift + Slope*ln(Good/Bad)
       Score + PDO = Shift + Slope*ln(2* Good/Bad)

    where Slope = PDO / ln(2), Shift = Score - Slope*ln(Good/Bad).

    Args:
        theta: the ratio of Good/Bad. Let good ratio be p, then bad ratio is
            (1-p), theta = p/(1-p).
        pdo: Point-to-Double Odds. When the odds are doubled, the score
            increases by pdo.
        basescore: When the ratio of Good/Bad is theta, the score is
            basescore.

    Returns:
        tuple (shift, slope).
    '''
    slope = pdo / log(2)
    # The original multiplied by an undefined name `B`; the formula above
    # shows the factor is the slope.
    shift = basescore - slope * log(float(theta))
    return (shift, slope)


def _in_bin(key, value):
    '''Tell whether `value` belongs to the bin identified by `key`.

    String keys of the form 'lower,upper' (as produced by the binning
    functions) or 'lower:upper' (the separator the original getScore split
    on) are half-open numeric ranges; any other key matches by equality.
    '''
    if hasattr(key, 'split'):
        parts = key.replace(':', ',').split(',')
        if len(parts) == 2:
            try:
                lower, upper = float(parts[0]), float(parts[1])
            except ValueError:
                pass
            else:
                return lower <= value < upper
    return key == value


def getScore(woe_table, xarray):
    '''Sum the WOE values that apply to one record.

    Missing values are treated as 0. The caller's Series is not modified
    (the original filled it in place).

    Args:
        woe_table: dict mapping a feature name to a dict of
            bin key -> WOE value.
        xarray: pandas.Series holding one record. The first two entries are
            skipped. NOTE(review): presumably an id and a label column --
            confirm against the caller.

    Returns:
        The sum of the WOE values of the first matching bin of each feature;
        a feature whose value matches no bin contributes nothing.
    '''
    filled = xarray.fillna(0)
    score = 0
    for idx, value in filled.iloc[2:].items():
        tmp_woe = woe_table[idx]
        for k in tmp_woe:
            if _in_bin(k, value):
                score += tmp_woe[k]
                break
    return score


# ---------------------------------------------------------------------------
# rosaceae.utils
#
# This module provides utility functions that are used within Rosaceae,
# including visualization and summary functions.
# ---------------------------------------------------------------------------

# Fit a model on every combination of variables and collect the results in a
# DataFrame.
def model_selecter(x_train, x_test, y_train, y_test, start=1, end=None, verbose=False):
    '''Fit a logistic regression on every combination of columns.

    For each combination size n in [start, end] and each n-column subset of
    `x_train`, a model is fitted and scored with ROC AUC on both data sets.

    NOTE(review): `LogisticRegression` and `roc_auc_score` are not imported
    anywhere in the visible source (presumably scikit-learn's
    sklearn.linear_model.LogisticRegression and
    sklearn.metrics.roc_auc_score); the module must import them before this
    function can run.

    Args:
        x_train, x_test: pandas.DataFrame of features.
        y_train, y_test: labels matching the rows of the feature frames.
        start: smallest combination size to try.
        end: largest combination size; defaults to the number of columns.
        verbose: when true, print one line per combination; otherwise print
            only the current combination size.

    Returns:
        pandas.DataFrame with the columns Var_No, Vars, train_score,
        test_score, coef and inter, one row per combination.
    '''
    columns = ['Var_No', 'Vars', 'train_score', 'test_score', 'coef', 'inter']
    if not end:
        end = x_train.shape[1]
    cols = x_train.columns

    # Rows are gathered in a list and turned into a frame once; growing a
    # DataFrame row by row copies it on every insert.
    rows = []
    for n in range(start, end + 1):
        if not verbose:
            print(n)

        for t in combinations(cols, n):
            tmp_train = x_train.loc[:, x_train.columns.isin(t)]
            tmp_test = x_test.loc[:, x_test.columns.isin(t)]
            clf = LogisticRegression(random_state=0)
            clf.fit(tmp_train, y_train)
            train_roc_score = roc_auc_score(y_train, clf.decision_function(tmp_train))
            test_roc_score = roc_auc_score(y_test, clf.decision_function(tmp_test))
            if verbose:
                print("%s\t%s\t%s\t%s" % (n, ','.join(t), train_roc_score, test_roc_score))

            rows.append([n, ','.join(t), train_roc_score, test_roc_score,
                         clf.coef_, clf.intercept_])

    return pd.DataFrame(rows, columns=columns)
#####################################################################
# visualization function
#####################################################################
# TODO(l0o0): KS plot is needed.

def bin_plot(out):
    '''Draw a bar plot of the number of samples in every bin.

    Bins are ordered by the number in front of the first comma of their key.

    Args:
        out: dict mapping bin key -> positions, as returned by the binning
            functions.

    Returns:
        The object returned by seaborn.barplot.
    '''
    # Imported here because this module never imported pandas, and so that
    # seaborn is only required when a plot is actually drawn.
    import pandas as pd
    import seaborn as sns

    ordered = sorted(out, key=lambda k: float(str(k).split(',')[0]))
    df = pd.DataFrame([(k, len(out[k])) for k in ordered],
                      columns=['Range', 'Number'])
    print(df)
    p = sns.barplot(x='Range', y='Number', data=df)
    p.set_xticklabels(p.get_xticklabels(), rotation=30)
    return p


# Plot the WOE values computed for the bins.
def woe_plot(fea_woe):
    '''Draw a bar plot of the WOE values of every feature.

    NOTE(review): indentation was lost in the source this was restored from;
    the `return` is assumed to follow the loop, so every feature is drawn
    and the plot of the last one is returned -- confirm.

    Args:
        fea_woe: dict mapping feature name -> dict of bin key -> WOE value.

    Returns:
        The object returned by the last seaborn.barplot call, or None when
        `fea_woe` is empty (the original raised on the unbound name).
    '''
    import seaborn as sns

    p = None
    for f in fea_woe:
        pairs = sorted(fea_woe[f].items(),
                       key=lambda kv: float(str(kv[0]).split(',')[0]))
        x = [key for key, _ in pairs]
        y = [value for _, value in pairs]
        print(f)
        p = sns.barplot(x=x, y=y)
        p.set_xticklabels(p.get_xticklabels(), rotation=30)
    return p


#####################################################################
# summary function
#####################################################################
# TODO: feature importance and IV

def frequent_table(xarray, label, steps):
    '''Build a frequency table of `xarray` over the intervals in `steps`.

    Each pair of consecutive steps defines a half-open interval
    [lower, upper). For every interval the table holds its share and count
    of samples, the running totals of both, and the number of samples per
    label value (one column per distinct label, sorted; the original only
    worked with exactly two label values).

    Args:
        xarray: pandas.Series of numbers.
        label: pandas.Series of labels, indexed like `xarray`.
        steps: ascending sequence of interval borders.

    Returns:
        pandas.DataFrame with the columns Bins, Percent, Cumulative_percent,
        Counts, Cumulative_Counts followed by one column per label value.
    '''
    import pandas as pd

    classes = sorted(set(label))
    cols = ['Bins', 'Percent', 'Cumulative_percent', 'Counts',
            'Cumulative_Counts'] + classes

    total_length = float(len(xarray))
    sum_length = 0
    rows = []
    for lower, upper in zip(steps[:-1], steps[1:]):
        value_idx = (xarray >= lower) & (xarray < upper)
        tmp_length = len(xarray[value_idx])
        sum_length += tmp_length
        label_counts = label[value_idx].value_counts()
        label_counts_dict = dict(zip(label_counts.index, label_counts))
        # An empty input has no meaningful share; report 0 instead of
        # dividing by zero.
        percent = tmp_length / total_length * 100 if total_length else 0.0
        cum_percent = sum_length / total_length * 100 if total_length else 0.0
        rows.append([str((lower, upper)),
                     "%f%%" % percent,
                     "%f%%" % cum_percent,
                     tmp_length,
                     sum_length] +
                    [label_counts_dict.get(c, 0) for c in classes])

    return pd.DataFrame(rows, columns=cols)


# ---------------------------------------------------------------------------
# setup.py
# ---------------------------------------------------------------------------

def readme():
    '''Return the content of README.rst from the current directory.'''
    with open('README.rst') as handle:
        return handle.read()


# Guarded so that importing this file does not start a build; build tools
# execute setup.py with __name__ set to '__main__'.
if __name__ == '__main__':
    from setuptools import setup

    setup(
        name='rosaceae',
        version='0.0.1',
        description='Python package for credit risk scorecards',
        long_description=readme(),
        author='l0o0',
        author_email='linxzh1989@gmail.com',
        license='MIT',
        keywords=['scorecards', 'woe'],
        url='',
        install_requires=['numpy', 'pandas', 'seaborn']
    )