From 599323b8d95bbdf4418cf56d30d2ae392b89ea1d Mon Sep 17 00:00:00 2001 From: zhaoxingfeng Date: Sat, 22 Dec 2018 16:10:21 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E5=A4=8D=E5=8E=9F=E5=A7=8B=E7=89=B9?= =?UTF-8?q?=E5=BE=81=E5=80=BC=E5=90=91woe=E8=BD=AC=E6=8D=A2=E7=9A=84bug?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- result/woe_rule.csv | 2 +- result/woe_rule.pkl | Bin 0 -> 26112 bytes woe.py | 20 ++++++++++++-------- 3 files changed, 13 insertions(+), 9 deletions(-) create mode 100644 result/woe_rule.pkl diff --git a/result/woe_rule.csv b/result/woe_rule.csv index c7eeae1..2b004e1 100644 --- a/result/woe_rule.csv +++ b/result/woe_rule.csv @@ -1,4 +1,4 @@ -var_name,bin_value_list,split_left,split_right,sub_sample_cnt,sub_sample_bad_cnt,sub_sample_good_cnt,woe,iv,iv_sum +var_name,bin_value_list,split_left,split_right,sub_sample_cnt,sub_sample_bad_cnt,sub_sample_good_cnt,woe,iv_list,iv_sum PAY_0,,-inf,-1.0,8445,1319,7126,-0.4282,0.0455,0.8694 PAY_0,,-1.0,0.0,14737,1888,12849,-0.6591,0.1749,0.8694 PAY_0,,0.0,1.0,3688,1252,2436,0.5931,0.0501,0.8694 diff --git a/result/woe_rule.pkl b/result/woe_rule.pkl new file mode 100644 index 0000000000000000000000000000000000000000..126ebae40771408cd9c13cb546f0b1b653f6cb50 GIT binary patch literal 26112 zcmeG^33wDmvkQa}!V&I6ZsZ6cfrKQYh`pQxgv*6R^mko0VPO(-)oj8g%cYD6$RQwy za>*qKh~W}JCR`B|(TB*5B8U743LfBv`l_dEdKUx?zx?n0?|s?*zU}Vn>h7xQuA1uJ znjT#^$?Z<_B&WOG;g4r{J9<4?*_yYfmXYN3`NF~0XbcqcL*Yng>8y+_PpW%hMn->k zQhF8?>DK-q&Qh74EY&?E!|T?vQ@s%4EU$Te*{NC0EKpREZ!i>fma@w3{>e#M5bB3w zkXC zd3z5XN-B2yXwlr*Gvc`oC-RsMr4)Nd3%Gdl_uWz|)zqQ1GU^I7XL+T*j+}HpM285k zvq-NFeW8pC%0{}NTvB>=TIR5*qzug)m4>_|do;~6EZmpmN%d&qP+ruB_s<%Z=?#Yp z&cewpp<;q9q0sYC$pMuk6N)$sJpxs_ckkZi5B!)v3AQY#n&=EGEJnA%eL7TgKy{B| ztWVp0sHOgd4mDg*Q!TGnc7_$Quxja0+W~bv3VYXYbQ<-+OFGo`I*ayf^MpGd?sq}G zNF|qW6(hGu#}lrk@jBF342w(S;PsxJDqSh2Lj#2nZHlq?{oH5ErO(!(VURqoE^O%N z-EXB1ku*XqG;%@X;1TkYSGV1{_p%O6f<`!2Ha6s|E|YY4AZUdB^-easZ^1Slng)$< zZD7k9$c7G4G(sFSb3yaq5q2E*RY8h%hz=Uz$nqLr&OP{m4lzNU87n`W|LoZYI<%n9 zVj$K9alxJKU-J6)-g|W&;)6On`p(mFY{WM@Jg6`!rK3MBoZ9qMDIHn{jqp>*m<*eG zLWhTFgck6y3t9z_u+N&iM}kH;KCb55k6j$6L+hXs4xF7l0X|F7;nAQGuD-JLOs^k5 z(t*(k&B5-1Hj%-d4P5wh@j>tC&^AcA=PLI-UDb;MX%`rQo&IIj(SJVLR)_Y15$v|r zZ!B=W(prZOUT5*Pox65*x9Qe94m!F(L8Tko&s-A<+dA7FdTmBW9XcuY#p4Cg*yPSQ zQ0u#MI&@a-dxS12#s?#({xPdwmJVGM`=N_JhPTdKqOiO@>mTY{CQ*k3#eQMp@0@#u zZK>Ywfu+NT>d;lOU;DUx3;Za?F1{K=C+pBHK%|l58By)nW2h0`gNTfU9xmvKL_YWX z0RBsql+ta(p1;0X2d85Hd3fC^Peoo+jJLYC`Y`VGzB=>? zBGOpG&h4#HQHRHah>VB6E_i}V@`5?%3R|7iv~-`4syaNW*q3Y;lp7l@ZTN>`fA5>B zbLPyMs~G!`=c@ZrGoL~ttH%i<_rANd?5n2+>)=93maP0#fB691*b~DF>*4LpQ64|wLx6ZxYZQFDOj|&o!$TKvv?A*2A>iiMa zS%)OW{%vI;H1_fFiL0RGIvx5eZ3br=-XHrE-$*M$zL zL2Y9o%?0V$_S$-N-@bhxDs2AzYGHE=XX}s=)OIJ=tNllH$n-kHxV6Mp9}nOHEi#}+ z>~O;^%`Wvhq=QefPZ-cT69ucV9T8Js-ZFcS4q1wQft_TtVt+AIhT1x0EA}@DXl!qN zbX4L-l+9qpZdky>33W{k!*8Xba+Ow&uC90 zuQHV=RfiElT8flBmz5IJXXlxCM6*I|5soEGqc3nm1UbIP)06VXH! zj}V%%Xp?CPb(o~s-V>U>dHlUT<(aeo0}cO}nA!@RDMml_~mTnl@m1O4W@SX$qAUE$@j?Z2fyf3gM&mZWp&jBBz7r4p$+k|7A zuj;Vh1s@44$sEqWe>va)V(p%$YHMrn*5MOxwzEXQ^Bw>PlQKQ&{XIV6+z!w@Y2NVm zo-9vCGY6ldvwz4Bha;WE-D%z|Pk*d}BQ7}V3~{5k3&)(rImm1W$DKvpCc=puI!#W` z@@naxR9|@8)QqG--8|`@fnIc*Ph#Jnp=>{oya~O(cc|AFmFV$#!`6^O=_|6&bNlncd;!aERWWx6^ zWG9X;e4CwzTk|Gmr2De8?4&H*k=@)=p#ihGN#2od%V?Sx_Z0${AwP^+R`j@LzSnHTBlZMf0Q&^oBe5FD;rM{qPgc zv@gq}Wx>x0HfM-0%bN+mBoyMgj7<17Az3xMz3zbDye{}XQspb;55HzZwT)EIt6!;K zCxj#<>slcMsF&3%>bL55>OG%9KE&UjLvyd#97?O^S8N?)AVM|2qSyk;sODD`W1*~S zeyx3GIo13IJCx~f1q|#TyVxh7szY-b`RaE(v&^V~7 z%CDd>4(?O&8*Im`srVJ=3)NNm4c-COP=mgKig?_D2Ymw-u~175`UWbZp|&c&;vk`p zS|{lFR#&YXz>b0Y)%yd*7En*Ehx+%ge_T*M@-}9}I`Z5+_n^OurxLA#qeZI8QO2Xw?v zm755`bB4|i=z=u~k)lfS-W9i~4(NVUS3Pv->43+OatAe#O!dNzr30Lzfh4;R?iU@< z*UQOP{*dPlR>m4Gu#J~Exl|W83y{YZe-lRzdWU}gj3alCa>poliE?KscZ0HL>B~z0 zRr;&}9Lv*|-nI0or57!cC1G+CART|{?B4>}+Q7(7i`>S^&B-lVS(`$+5e&4lHhi~e zW$oPpt*kwo+?UB#xx|m0XRhJj<$yVAFy=KqSm1)Bd$`W9$@7y2f*@}}ZR zeGBwu>6-;XU;T;1DDFRNh4De@*NWgTccCkz3tfb}&>;v2xX|OX6Mw5MR@^lDLr5(x z$`6+mX&#kd6DO3m$*rlrtVpj$h%|=kXHz|t5%BAjK3GYV6}w0`mJ(^L@*+*8`uR+h zi_!Rpsr^HQx01#=Mdd{5C$@@cSG}T0$F~(}d7^I^@ll?}`GwL^kpgcZ@za{{deS&c zqD1{gsy|;slvAi2R#lW6Q2KK@QJzETO2W&dc15d;`VB(Z#*D_RJh5FC!DDX0)MEVl_?Kz+1)!rlEmq>28F2bk&8j@$; zub6CI0MCZqZ_X^=nB)mDq z&jjlKGS&AW{Jn&?i~4hu{H789VZxhD?P}6^pHuxgYPXKcGYEet;pb5QFOwXcgjbZt z*-H4634avTTk^R?>%1b3TbS0{r_|osul1&O3Dn;_N=wr`&n5gbln$Z#ENb@v!CMnQ zDO8_8^^D+?Zi1htdOOt@rg9G9{X+egB6vGWBPrcT?LH;^%aksm`f((;C@PO7{2vIf z7WKc5*2_b*{>qV@j*u%3?IyZP)D!g{s{e`PQ;q5w^^-{a&n3Jt zs{e$_QN+i2!aGECZ>92i!Yf1cAEbIe@%t^!+eYfA7x8Z={1*w{j{2!f?LL24&|l?# zk?w0G(tdit~P7?7omhj3^If3Lhi|D#W_1OeZCpo=GaIoY; z>c!!gk{G4Z>eb?dwyTLeHxv7CnSto$7z4{!6tKd_G8W z8XYalyBdj}sU;xBgs@sxc)ORX4|%!4V7hrKwo;=I-j zq0z!Ii>~GkCsbMq1SVSUV@WLB;6A=HsQAwd2Oc*b@K!zDxAHlo?W6xZy6TtHto19- z1#hgj+4a4>x4*t*zwylo7Wec!M~zaanoeE#(^2DuYRecC@)LVPZ+U2U+CF34#mC;R zIqtZT@)=B+eD;9x(T}~(EPdrFyS#U2+ttZ=#-uJ^g~XOUW^7wLudrqlyU-vyY*qCG#)VEC;gHcgtKO_Xrk*ywnwf)`=h)yLH7libTWmbrtn0Gy zjwcP?O8MeTLn$(}M7_wxo2utQ(IoE0fPp;pp2E7@3I#~If` z-!m$jj2|@~ztjv-&lrdL@XTShXZDttr$+2xwGU*ti|t>*)@)21b+q+%BWK>_<#`E* zjnYkd?yO<5pM8X-4jsPqfp)8ysNcj6$6jvPLD^xPzQ7qiXiVwF1#s3tAy)An-eEj& zaPF5imLD-1oPBowhbs*x;O`q(Hy@kun|Bl2U3;yjPTS1}HMutHK#D?ca>@m0$zn{F=2su9~^6OH0jC19ly_KE&t`<_h$uPw}@&hBWOy7sr@5y5( znbo(j$)S8gcNzCRcC6Y6PmWPFqz|&Rnu+<}XcXID6>IhxgHOF$cImr&*|5H^$9>pp z8{=G7znEjx4;?eJ-NdcNiNj-1k{g*h{I)%YIn3ZShB=KR_p!~_pa0?b;kTICy!%^5 z#Tt9hu_cC)^v#pFl()0*Cv(BPX}oaYvr{Fe?`0piKXT=VE-P5t?%#8s`EISjXS-;* zU4~f4>y4K_7==soUG@n_Uy)->I?fmRdX`;yZdkSX^V!k;d|~f2aPd9aztsZ9xr=e; z(!4EapB3f{AdiW4v4Zj1=8ey=UN6^~oDI(~A%}U)1g}`f9yJ+1PU~P86LMe3s-c#I zj#*0U;!`Hp5wQ3VTXp}hX|FQ<$xpVPsW6YVHYfTM_6%R%!~3&xOW)mIxxsX1PE*&J zZ1@*$kDbeseiM&a8cJ|B2l&x9QRmOXn8#a+OZ@i-X$L(Kd>u@r94q7SkM89?><|j+?6Z7So z`naT7-QTWdpL%r>jXQwk*-n->BQ=#eO(lAkuvVY= z(b`pet%aAux|A&3Nsd(A; zKE*B3PWD&n7Oq~Ea@k*{;`z6Qn))LPTc5{kC9(($|+3eb>>1OijKI8KD`P_Fzxi=mSu6(p!h0QC z5%RwpTY-nF03sQMBF-ZqznqDtR+A)IQM_iNQnh!jo@j`|-h95XJa zbRaG!LdM11ztqA_Yh%Yrc$s4XkA{_T{A@~Y} z)(G1$@pt%N{}#yk z_+S4Pt@81|{w-SN<9~hGDwp_?vxBQSFn4$LzrO6VAdW@|HSnU2VJ%0wdYAt&H$n~t0XISp1pzlg4g~=>LJkE1H$n~t0XISp1pzlg4*#hEH|QE}$uR?3A`uv90Sa|sjt+DX( z8(L%G-5gqDm2VEMvC8=kt+DX#HnhgVzuV9n3;+L0h)uRP8?P8?h3;NrcA_&p4gcU5 z7kFroAgtTlC7clfm|OGUH7l*LLSdWDn9I4&P+pxpSbNlj+7r7S_;$s0X9!+hGC*Sp zQ}KEk&5jwr)&_qg)g$Ur^_Y5GJ)z>|Fj`voA{t5zSIE+2=+9V6;R=whymSDbBA{5!cK_1l_N4L-a-`UOth}}i@{OdOdS5AWp~W^wH{_!yl7b{p{lO+L_k1P=*DCD zjvuT3KP}}~Y6gGIz>>$1O26j7Un{VrFlV|K0s>6m>-E8hxCa-s;Dt%?vDO7G;+mHH z{03jg5`S9SqOICE-6G*#5dy;ctGW!pcHNostnD=Wg6C#)`JWhL>qsoo^ zJy(Xm%`9Fz7BG&bgN$PY&TScsZ8vh^(E8yohPHVyNf9A!+@3|I@L;wS{sc)8acoH( z>eu?4m4Uy_@UC8d5aLC!LmR-+<= zccWmhhqnCk^cpjBK644#nDE#%fw9&jRcg)6M#;=XVUrTo| z9{X8d`)@PK_rilUuSQ45wS&AQ6CGE1c_%rp$mesMlT#@|qT@_1g+z7C`?cpyh?MAf zjxG8AONqXK=nicH{(Ah{L`=x)Pr@u&eLnMK^!1si?StW80%Lz~L6rO}^e zHh76QAhU*3GaZWo67RK9`yZQ{Y!Bbo%|K=n)nm?3p@-T`&fE`yhphH80#KDpiA(MSSsFaZ^xjddxc$8#|6*XH6c>T!yx#wSupGhg9Qm-Z?qa(&D*F~$0r zk9ohgz=VkPQDL06kYfk04{NVNujSk5^>-V+CU3J|x2M;hZS*>`jb3}U(d*1MdOg~D zE#F43S=;C}ZCe={h+fCG(d)}LdJWx1uhHA+b#ohCA3&PF2+4M6ui>x9r7h+>aLr$W z8Nap^f34+(94_OvlEZtwyeixacPwy$cxkbLh!A^%=6K&HZ6L~uObaL{GO<9Lj5uqP z5s#Y;`S^;)jfO15;ATS>THuC57Gi@p9eBNCpe7zS9o9x68uuJlJ_ffOR=x%9IIKKg z+bHvKK^u;EsDoZ*-NY!Ir{#PmEX(z}IZI+?uE4xsTWLb%%6x-k%Xve~^?ze!u0paM z+G_ko3%UjqvY>BbmMkdOP_m$FG3(dfGV6jZXa(*V<{;Rit>f6zg043kY~T&Zg5nb2 hh{b@Y3YPf)(h?U>fK50V_ykB1mOn+<`|QN1{{~#0$%6m@ literal 0 HcmV?d00001 diff --git a/woe.py b/woe.py index fbec6b3..3ee0dee 100644 --- a/woe.py +++ b/woe.py @@ -2,8 +2,8 @@ """ @Time: 2018/8/21 11:34 @Author: zhaoxingfeng -@Function:Weight of Evidence,根据iv值最大思想求最优分箱 -@Version: V1.2 +@Function:Weight of Evidence,基于iv值最大思想求最优分箱 +@Version: V1.3 参考文献: [1] kingsam_. 数据挖掘模型中的IV和WOE详解[DB/OL].https://blog.csdn.net/kevin7658/article/details/50780391/. [2] boredbird. woe[DB/OL].https://github.com/boredbird/woe. @@ -12,6 +12,7 @@ import pandas as pd import matplotlib.pyplot as plt import copy +from sklearn.externals import joblib pd.set_option('display.max_rows', 500) pd.set_option('display.width', 1000) pd.set_option('display.max_columns', 1000) @@ -120,7 +121,7 @@ def fit(self, dataset): self.woe_rule_df = var_df if self.woe_rule_df.empty else pd.concat([self.woe_rule_df, var_df], ignore_index=1) cols = ['var_name', 'bin_value_list', 'split_left', 'split_right', 'sub_sample_cnt', 'sub_sample_bad_cnt', - 'sub_sample_good_cnt', 'woe', 'iv', 'iv_sum'] + 'sub_sample_good_cnt', 'woe', 'iv_list', 'iv_sum'] self.woe_rule_df = self.woe_rule_df.sort_values(by=['var_name', 'split_left'], ascending=True) self.woe_rule_df = self.woe_rule_df.sort_values(by=['iv_sum', 'var_name'], ascending=False) self.woe_rule_df = self.woe_rule_df[cols].reset_index(drop=True) @@ -131,6 +132,7 @@ def fit(self, dataset): self.woe_rule_dict[var] = list(zip(grp.bin_value_list, grp.woe)) else: self.woe_rule_dict[var] = list(zip(grp.split_right, grp.woe)) + del self.dataset # 处理连续型变量 def fit_continous(self, dataset, var): @@ -146,9 +148,9 @@ def fit_continous(self, dataset, var): "sub_sample_bad_cnt": [x['sub_sample_bad_cnt'] for x in woe_iv_list], "sub_sample_good_cnt": [x['sub_sample_good_cnt'] for x in woe_iv_list], "woe": [x['woe'] for x in woe_iv_list], - "iv": [x['iv'] for x in woe_iv_list] + "iv_list": [x['iv'] for x in woe_iv_list] }) - var_df['iv_sum'] = var_df['iv'].sum() + var_df['iv_sum'] = var_df['iv_list'].sum() return var_df # 处理连续型变量 @@ -208,9 +210,9 @@ def fit_discrete(self, dataset, var): "sub_sample_bad_cnt": [x['sub_sample_bad_cnt'] for x in woe_iv_list], "sub_sample_good_cnt": [x['sub_sample_good_cnt'] for x in woe_iv_list], "woe": [x['woe'] for x in woe_iv_list], - "iv": [x['iv'] for x in woe_iv_list] + "iv_list": [x['iv'] for x in woe_iv_list] }) - var_df['iv_sum'] = var_df['iv'].sum() + var_df['iv_sum'] = var_df['iv_list'].sum() return var_df # 处理离散型变量 @@ -320,7 +322,7 @@ def transform(self, dataset): @staticmethod def _transform_continous(sub_woe_rule, value): for rule in sub_woe_rule: - if rule[0] > value: + if rule[0] >= value: return rule[1] return -99 @@ -339,6 +341,8 @@ def _transform_discrete(sub_woe_rule, value): min_sample_rate=0.1, min_iv=0.0005) woe.fit(df) + joblib.dump(woe, "result/woe_rule.pkl") + woe = joblib.load("result/woe_rule.pkl") print(woe.woe_rule_df) woe.plot_woe_structure() df_woed = woe.transform(df)