First commit.

l0o0 · Mar 12, 2018 · cdff00e · cdff00e
commit cdff00e
Show file tree

Hide file tree

Showing 9 changed files with 319 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,3 @@
+*.pyc
+/dist/
+/*.egg-info
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Linxzh
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/MANIFEST.in b/MANIFEST.in
@@ -0,0 +1 @@
+include README.rst
diff --git a/README.rst b/README.rst
diff --git a/rosaceae/__init__.py b/rosaceae/__init__.py
@@ -0,0 +1,5 @@
+# -*- coding: utf-8 -*-
+
+from . import bins
+from . import scorecard
+from . import utils
diff --git a/rosaceae/bins.py b/rosaceae/bins.py
@@ -0,0 +1,89 @@
+# -*- coding: UTF-8 -*-
+"""
+rosaceae.bins
+~~~~~~~~~~~~~
+
+This module implements data binning.
+"""
+
+
+import numpy as np
+import pandas as pd
+
+
+
+def bin_var(xarray, border=None):
+    '''Split a numeric variable into consecutive half-open bins.
+
+    Args:
+        -xarray : pandas.Series of numeric values. A numpy array also works
+            when ``border`` is supplied, because ``describe()`` is only
+            called to derive the cut points.
+        -border : optional ascending list of cut points. When omitted or
+            empty, six cut points are derived from ``xarray.describe()``.
+
+    Returns:
+        dict mapping a label ``'low,high'`` to the numpy array of positions
+        in ``xarray`` whose value satisfies ``low <= value < high``. The
+        first label starts at ``-inf`` and the last one ends at ``inf``.
+
+    Raises:
+        ValueError: if every derived cut point is negative.
+    '''
+    # Length check instead of truthiness so numpy arrays are accepted too.
+    if border is None or len(border) == 0:
+        des = xarray.describe()
+        print(des)
+        if des['75%'] < 7:
+            # Small-valued variable: a third of the inter-quartile range.
+            step = (des['75%'] - des['25%']) / 3
+        else:
+            # Otherwise half a standard deviation, truncated to an integer.
+            step = int(des['std'] / 2)
+        step = np.round(step, 3)
+        # Six candidate cut points spread around the median.
+        border = [des['50%'] + (i - 3) * step for i in range(6)]
+        border = [b for b in border if b >= 0]
+        if not border:
+            raise ValueError('no non-negative cut point could be derived')
+        # Negative candidates were dropped above; pad on the right so that
+        # six cut points remain (the previous range() stopped one short).
+        missing = 6 - len(border)
+        if missing:
+            last = border[-1]
+            border.extend([last + i * step for i in range(1, missing + 1)])
+        print('border:%s, step: %s' % (border, step))
+    else:
+        print('border:%s, step: Set' % (border, ))
+
+    out = {}
+    for i, upper in enumerate(border):
+        if i == 0:
+            # The label promises an open lower end, so do not cut at zero.
+            k = '-inf,%s' % upper
+            tmp = np.where(xarray < upper)[0]
+        else:
+            lower = border[i - 1]
+            k = '%s,%s' % (lower, upper)
+            tmp = np.where(np.logical_and(xarray >= lower, xarray < upper))[0]
+        out[k] = tmp
+        print('%s %s %s' % (i, upper, k))
+    out['%s,inf' % border[-1]] = np.where(xarray >= border[-1])[0]
+
+    return out
+
+
+# Bin a variable by the quantiles of its data.
+def bin_quantile(xarray, border=None):
+    '''Split a variable into half-open bins located at its quantiles.
+
+    Args:
+        -xarray : pandas.Series of numeric values (``unique()`` is called
+            on it when ``border`` is not given).
+        -border : optional ascending list of cut points. When omitted or
+            empty, the sorted distinct values are used if there are fewer
+            than seven of them, otherwise the 5th, 20th, 50th, 80th and
+            95th percentiles.
+
+    Returns:
+        dict mapping a label ``'low,high'`` to the numpy array of positions
+        in ``xarray`` whose value satisfies ``low <= value < high``. The
+        first label starts at ``-inf`` and the last one ends at ``inf``.
+    '''
+    if border is None or len(border) == 0:
+        distinct = xarray.unique()
+        if len(distinct) < 7:
+            border = sorted(distinct.tolist())
+        else:
+            # np.percentile expects q on the 0-100 scale, not 0-1 fractions.
+            border = [np.percentile(xarray, q) for q in (5, 20, 50, 80, 95)]
+
+    out = {}
+    for i, upper in enumerate(border):
+        if i == 0:
+            # The label promises an open lower end, so do not cut at zero.
+            k = '-inf,%s' % upper
+            tmp = np.where(xarray < upper)[0]
+        else:
+            lower = border[i - 1]
+            k = '%s,%s' % (lower, upper)
+            tmp = np.where(np.logical_and(xarray >= lower, xarray < upper))[0]
+        out[k] = tmp
+    if len(border) > 0:
+        out['%s,inf' % border[-1]] = np.where(xarray >= border[-1])[0]
+    return out
+
+
+
+def bin_scatter(xarray, border = None):
+    '''Group the positions of ``xarray`` by exact value.
+
+    Args:
+        -xarray : array-like of discrete values.
+        -border : optional list of values to look up. When omitted or
+            empty, every distinct value of ``xarray`` is used, sorted.
+
+    Returns:
+        dict mapping each value to the numpy array of positions in
+        ``xarray`` that are equal to it.
+    '''
+    if not border:
+        border = sorted(set(xarray))
+    return {value: np.where(xarray == value)[0] for value in border}
diff --git a/rosaceae/scorecard.py b/rosaceae/scorecard.py
@@ -0,0 +1,81 @@
+# -*- coding: utf-8 -*-
+'''
+rosaceae.scorecard
+~~~~~~~~~~~~~~~~~~
+
+This module provides functions for credit risk scorecard.
+'''
+
+from math import log, e
+
+
+def getWOE(c, y):
+    '''Calculate the WOE (weight of evidence) value of every bin.
+
+    1 indicates good case, 0 indicates bad case. For each bin::
+
+        WOE = ln((bad_in_bin / total_bad) / (good_in_bin / total_good))
+
+    Args:
+        -c : dictionary, result of a bin function. Maps a bin label to the
+            array of positions that fall into that bin.
+        -y : pandas.Series or numpy.array, label.
+
+    Returns:
+        dict mapping each bin label to its WOE value. Bins that contain
+        no good case or no bad case are left out, because their WOE is
+        undefined.
+    '''
+    # Local import: this module only imports from math at the top.
+    import numpy as np
+
+    # Bins hold positions, so index a plain array rather than a Series
+    # (which would look the positions up as index labels).
+    labels = np.asarray(y)
+    totalgood = np.count_nonzero(labels)
+    totalbad = len(labels) - totalgood
+
+    out = {}
+
+    for k in c:
+        region = labels[c[k]]
+        # Non-zero labels are the good cases, matching the totals above.
+        good = np.count_nonzero(region)
+        bad = len(region) - good
+        if bad == 0 or good == 0:
+            continue
+        woe = log((float(bad) / totalbad) / (float(good) / totalgood))
+        out[k] = woe
+    return out
+
+
+def get_constant(theta, pdo, basescore):
+    '''Calculate Shift and Slope.
+
+    The score of an individual i is given by the formula:
+
+        Score(i) = Shift + Slope*(b0 + b1*WOE1(i) + b2*WOE2(i)+ ... +bp*WOEp(i))
+
+    where bj is the coefficient of the j-th variable in the model,
+    and WOEj(i) is the Weight of Evidence (WOE) value for the
+    i-th individual corresponding to the j-th model variable.
+
+    In short formula:
+
+        Score = Shift + Slope*ln(Good/Bad)
+        Score + PDO = Shift + Slope*ln(2* Good/Bad)
+
+    where Slope = PDO / ln(2), Shift = Score - Slope*ln(Good/Bad).
+
+    Args:
+        theta: the ratio of Good/Bad. If the good ratio is p, then the bad
+            ratio is (1-p) and theta = p/(1-p).
+        pdo: Points to Double the Odds. When the odds double, the score
+            increases by pdo.
+        basescore: the score assigned when the Good/Bad ratio equals theta.
+
+    Returns:
+        tuple ``(shift, slope)``.
+    '''
+    slope = pdo / log(2)
+    # Shift = Score - Slope*ln(Good/Bad), evaluated at the base odds.
+    shift = basescore - slope * log(float(theta))
+    return (shift, slope)
+
+
+def getScore(woe_table, xarray):
+    '''Sum the WOE values that match one record.
+
+    Args:
+        woe_table: dict mapping a variable name to a dict of interval
+            label -> WOE value. Labels look like ``'low,high'`` as
+            produced by the bin functions; ``'low:high'`` is accepted too.
+        xarray: pandas.Series holding one record, indexed by variable
+            name. Missing values are treated as 0. The first two entries
+            are skipped.
+            NOTE(review): presumably they hold an identifier and the
+            label -- confirm against the caller.
+
+    Returns:
+        The sum of the WOE values of the first interval that contains each
+        variable's value. Variables matching no interval contribute 0.
+    '''
+    score = 0
+    # Fill a copy so the caller's record is left untouched.
+    record = xarray.fillna(0)
+    for idx in record.index[2:]:
+        value = record[idx]
+        tmp_woe = woe_table[idx]
+        for k in tmp_woe:
+            # Bin labels are comma separated; keep ':' as a fallback.
+            sep = ',' if ',' in k else ':'
+            border = [float(part) for part in k.split(sep)]
+            if value >= border[0] and value < border[1]:
+                score += tmp_woe[k]
+                break
+    return score
diff --git a/rosaceae/utils.py b/rosaceae/utils.py
@@ -0,0 +1,100 @@
+# -*- coding: utf-8 -*-
+"""
+rosaceae.utils
+~~~~~~~~~~~~~~
+
+This module provides utility functions that are used within Rosaceae.
+It includes visualization and summary functions.
+"""
+
+import seaborn as sns
+
+from itertools import combinations
+
+
+# Evaluate every combination of model variables and save the results in a DataFrame.
+def model_selecter(x_train, x_test, y_train, y_test, start=1, end=None, verbose=False):
+    '''Fit a logistic regression on every column combination and record its ROC AUC.
+
+    NOTE(review): ``pd``, ``LogisticRegression`` and ``roc_auc_score`` are
+    not imported in the visible part of this module -- presumably pandas
+    and scikit-learn; confirm and import them before use.
+
+    Args:
+        x_train: DataFrame of training features.
+        x_test: DataFrame of test features with the same columns.
+        y_train: training labels.
+        y_test: test labels.
+        start: smallest number of columns per combination.
+        end: largest number of columns per combination (inclusive). A falsy
+            value means all columns of ``x_train``.
+        verbose: when true, print one tab-separated line per combination;
+            otherwise print only the current combination size.
+
+    Returns:
+        DataFrame with one row per combination and the columns ``Var_No``,
+        ``Vars`` (comma-joined column names), ``train_score``,
+        ``test_score``, ``coef`` and ``inter``.
+    '''
+    result_df = pd.DataFrame(columns=['Var_No', 'Vars', 'train_score', 'test_score','coef', 'inter'])
+    if not end:
+        end = x_train.shape[1]
+    cols = x_train.columns
+    for n in range(start, end+1):
+        cols_try = combinations(cols, n)
+
+        if not verbose:
+            print n
+
+        for t in cols_try:
+            # Select the columns named in t from both the train and test frames.
+            tmp_train = x_train.loc[:, x_train.columns.isin(t)]
+            tmp_test = x_test.loc[:, x_test.columns.isin(t)]
+            clf = LogisticRegression(random_state=0)
+            clf.fit(tmp_train, y_train)
+            train_roc_score = roc_auc_score(y_train, clf.decision_function(tmp_train))
+            test_roc_score = roc_auc_score(y_test, clf.decision_function(tmp_test))
+            if verbose:
+                print "%s\t%s\t%s\t%s" % (n, ','.join(t), train_roc_score, test_roc_score)
+
+            # Append at the next integer index (0 for the first row).
+            row = 0 if pd.isna(result_df.index.max()) else result_df.index.max() + 1
+            result_df.loc[row] = [n, ','.join(t), train_roc_score, test_roc_score, clf.coef_, clf.intercept_]
+
+    return result_df
+
+
+#####################################################################
+# visualization function
+#####################################################################
+# TODO(l0o0): KS plot is needed.
+
+def bin_plot(out):
+    '''Draw a bar plot of the number of samples in each bin.
+
+    Args:
+        out: dict returned by a bin function, mapping a bin label to the
+            array of positions that fall into that bin.
+
+    Returns:
+        The object returned by ``sns.barplot``, with x tick labels rotated
+        by 30 degrees.
+    '''
+    # Local import: pandas is not imported at module level in this file.
+    import pandas as pd
+
+    # Order the bins by their lower bound ('-inf' parses as a float).
+    ordered = sorted(out.keys(), key=lambda x: float(str(x).split(',')[0]))
+    df = pd.DataFrame([(k, len(out[k])) for k in ordered], columns=['Range', 'Number'])
+    print(df)
+    p = sns.barplot(x='Range', y='Number', data=df)
+    p.set_xticklabels(p.get_xticklabels(), rotation=30)
+    return p
+
+
+# Plot the WOE values computed for the bins of each feature.
+def woe_plot(fea_woe):
+    '''Draw a bar plot of the WOE values of every feature.
+
+    Args:
+        fea_woe: dict mapping a feature name to a dict of
+            bin label -> WOE value.
+
+    Returns:
+        The object returned by the last ``sns.barplot`` call, or None when
+        ``fea_woe`` is empty.
+    '''
+    # NOTE(review): no axes is passed to sns.barplot, so successive
+    # features presumably draw onto the same current axes -- confirm.
+    p = None
+    for f in fea_woe:
+        # Order the bins by their lower bound; float() parses '-inf' and
+        # needs no pandas, which this module does not import.
+        tmp = sorted(fea_woe[f].items(), key=lambda x: float(str(x[0]).split(',')[0]))
+        x = [i[0] for i in tmp]
+        y = [i[1] for i in tmp]
+        print(f)
+        p = sns.barplot(x=x, y=y)
+        p.set_xticklabels(p.get_xticklabels(), rotation=30)
+    return p
+
+
+#####################################################################
+# summary function
+#####################################################################
+# TODO: feature importance and IV 
+
+def frequent_table(xarray, label, steps):
+    '''Build a frequency table of ``xarray`` over the bins given by ``steps``.
+
+    Args:
+        xarray: pandas.Series of numeric values.
+        label: pandas.Series of class labels aligned with ``xarray``.
+        steps: ascending sequence of bin edges; consecutive pairs form the
+            half-open bins ``[steps[i], steps[i+1])``.
+
+    Returns:
+        pandas.DataFrame with one row per bin: the bin edges, its share and
+        cumulative share of all values (as percent strings), its count and
+        cumulative count, followed by one count column per distinct label.
+    '''
+    # Local import: pandas is not imported at module level in this file.
+    import pandas as pd
+
+    classes = list(set(label))
+    cols = ['Bins', 'Percent', 'Cumulative_percent', 'Counts', 'Cumulative_Counts']
+    cols.extend(classes)   # one count column per distinct label value
+    fre_df = pd.DataFrame(columns=cols)
+
+    total_length = float(len(xarray))
+    sum_length = 0
+
+    for i in range(len(steps) - 1):
+        border = (steps[i], steps[i+1])
+        value_idx = (xarray >= border[0]) & (xarray < border[1])
+        tmp_length = len(xarray[value_idx])
+        sum_length += tmp_length
+        label_counts = label[value_idx].value_counts()
+        label_counts_dict = dict(zip(label_counts.index, label_counts))
+        # Report 0% instead of dividing by zero when xarray is empty.
+        percent = tmp_length / total_length * 100 if total_length else 0.0
+        cum_percent = sum_length / total_length * 100 if total_length else 0.0
+        row = [str(border),
+               "%f%%" % percent,
+               "%f%%" % cum_percent,
+               tmp_length,
+               sum_length]
+        # One count per label class, so any number of classes fits the columns.
+        row.extend(label_counts_dict.get(cls, 0) for cls in classes)
+
+        fre_df.loc[i] = row
+    return fre_df
diff --git a/setup.py b/setup.py
@@ -0,0 +1,19 @@
+from setuptools import setup
+
+
+def readme():
+    """Return the text of README.rst, used as the package long description."""
+    with open('README.rst') as readme_file:
+        text = readme_file.read()
+    return text
+
+# Package metadata; README.rst supplies the long description.
+setup(
+    name='rosaceae',
+    version='0.0.1',
+    description='Python package for credit risk scorecards',
+    long_description=readme(),
+    author='l0o0',
+    author_email='[email protected]',
+    license='MIT',
+    keywords=['scorecards', 'woe'],
+    url='',
+    install_requires=['numpy', 'pandas', 'seaborn']
+)