本文主要介绍卡方最优分箱,希望能为大家解决编程问题提供一定的参考,需要的开发者们跟着小编一起来学习吧!
```python
def Chi2(df, total_col, bad_col, overallRate):
    """Compute a chi-square statistic of observed bad counts against an overall bad rate.

    For every row the expected bad count is ``total * overallRate``; the
    statistic is the sum of ``(expected - observed) ** 2 / expected`` over
    all rows.

    :param df: DataFrame with one row per distinct value (or group).
    :param total_col: name of the column holding the total count per row.
    :param bad_col: name of the column holding the bad count per row.
    :param overallRate: overall proportion of bad records.
    :return: the chi-square value (``0`` for an empty frame).
    """
    # Work on plain Python scalars; the input frame is never modified.
    expected_counts = (df[total_col] * overallRate).tolist()
    observed_counts = df[bad_col].tolist()

    chi2 = 0
    for expected, observed in zip(expected_counts, observed_counts):
        if expected == 0:
            # A zero expectation would divide by zero. When nothing was
            # observed either, the term is 0; otherwise the deviation from
            # the expectation is unbounded.
            if observed != 0:
                return float('inf')
            continue
        chi2 += (expected - observed) ** 2 / expected
    return chi2
# Chi-square binning limited by a maximum number of bins.
def ChiMerge_MaxInterval_Original(df, col, target, max_interval=5):
    """Bin a feature by repeatedly merging the bin with the smallest chi-square.

    Every distinct value of ``col`` starts as its own bin. While there are
    more than ``max_interval`` bins, the bin with the smallest chi-square
    value (as computed by ``Chi2`` against the overall bad rate) is merged
    with its neighbour; an interior bin picks the neighbour with the smaller
    chi-square, preferring the left one on ties.

    :param df: input DataFrame.
    :param col: name of the feature column to be binned.
    :param target: name of the target column holding 0/1 values.
    :param max_interval: maximum number of bins to keep.
    :return: list of cut-off points, i.e. the largest value of every bin
        except the last one. When ``col`` has no more than ``max_interval``
        distinct values, all distinct values except the largest are returned.
    :raises ValueError: if merging is required but ``max_interval`` is
        smaller than 1.
    """
    levels = sorted(set(df[col]))
    if len(levels) <= max_interval:
        print("the row is cann't be less than interval numbers")
        return levels[:-1]

    if max_interval < 1:
        # Without this guard the loop below would try to merge the single
        # remaining bin with a non-existent neighbour.
        raise ValueError("max_interval must be at least 1")

    # Per distinct value: number of records ('total') and sum of the target
    # ('bad'); the feature value becomes a regular column again.
    regroup = (
        df.groupby(col)[target]
        .agg(['count', 'sum'])
        .rename(columns={'count': 'total', 'sum': 'bad'})
        .reset_index()
    )
    total_count = int(regroup['total'].sum())
    bad_count = float(regroup['bad'].sum())
    overallRate = bad_count / total_count

    bins = [[level] for level in levels]
    while len(bins) > max_interval:
        chi_values = [
            Chi2(regroup.loc[regroup[col].isin(members)], 'total', 'bad', overallRate)
            for members in bins
        ]
        weakest = chi_values.index(min(chi_values))
        last = len(bins) - 1

        # Edge bins have only one neighbour; interior bins take the
        # neighbour with the smaller chi-square (left one on ties).
        if weakest == 0:
            neighbour = 1
        elif weakest == last:
            neighbour = last - 1
        elif chi_values[weakest - 1] <= chi_values[weakest + 1]:
            neighbour = weakest - 1
        else:
            neighbour = weakest + 1

        # Merge the two bins and drop the absorbed one by position.
        bins[weakest] = bins[weakest] + bins[neighbour]
        del bins[neighbour]

    # Merging may have appended a left neighbour after its right one.
    bins = [sorted(members) for members in bins]
    print(bins)
    return [members[-1] for members in bins[:-1]]
```
这篇关于卡方最优分箱的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!