本文主要是介绍N175_标记异常值_自动汇聚某单位的住房公积金 有补发有增减 如何批量找出来,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.rcParams['font.sans-serif'] = ['SimHei'] #定义使其正常显示中文字体黑体
plt.rcParams['axes.unicode_minus'] = False #用来正常显示表示负号
data = pd.read_excel(r'.\N175_标记异常值_数据源.xlsx')
import seaborn as sns
# 使用seaborn获取颜色
cm = sns.light_palette("green", as_cmap=True)
# 色阶实现
data.head(100).style.background_gradient(cmap=cm,subset=["发生额"])
# data
laiyuan | 年 | 月 | 日期 | 交易流水号 | 单位代码 | 公积金账号 | 交易号 | 客户编号 | 客户姓名 | 发生额 | 余额 | 摘要代码 | 摘要 | 凭证种类 | 凭证号 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ./数据源/2012\201208.xls | 2012 | 8 | 2012-08-01 00:00:00 | 流水号0000001 | 3267 | 27234556 | 缴存 | 210102198901101814#0 | 姓名001 | 9340 | 39720.410000 | 1072.000000 | 单位补缴201208 | nan | nan |
1 | ./数据源/2012\201208.xls | 2012 | 8 | 2012-08-01 00:00:00 | 流水号0000002 | 3267 | 27234557 | 缴存 | 43282819810118001X#0 | 姓名002 | 990 | 47718.610000 | 1072.000000 | 单位补缴201208 | nan | nan |
2 | ./数据源/2012\201208.xls | 2012 | 8 | 2012-08-01 00:00:00 | 流水号0000003 | 3267 | 27234558 | 缴存 | 432801198108011031#0 | 姓名003 | 1106 | 42583.660000 | 1072.000000 | 单位补缴201208 | nan | nan |
3 | ./数据源/2012\201208.xls | 2012 | 8 | 2012-08-01 00:00:00 | 流水号0000004 | 3267 | 27234559 | 缴存 | 432801198710011099#0 | 姓名004 | 748 | 32592.650000 | 1072.000000 | 单位补缴201208 | nan | nan |
4 | ./数据源/2012\201208.xls | 2012 | 8 | 2012-08-01 00:00:00 | 流水号0000005 | 3267 | 27234560 |
limit_output extension: Maximum message size of 10000 exceeded with 132577 characters
df = data[["laiyuan","日期","客户姓名","发生额"]]
df.head(100).style.background_gradient(cmap=cm,subset=["发生额"]) #主要是靠这些列
laiyuan | 日期 | 客户姓名 | 发生额 | |
---|---|---|---|---|
0 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名001 | 9340 |
1 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名002 | 990 |
2 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名003 | 1106 |
3 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名004 | 748 |
4 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名005 | 1070 |
5 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名006 | 1196 |
6 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名007 | 826 |
7 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名008 | 854 |
8 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名009 | 832 |
9 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名010 | 1060 |
10 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名011 | 798 |
11 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名012 | 808 |
12 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名013 | 840 |
13 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名014 | 790 |
14 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名015 | 760 |
15 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 | 姓名016 | 940 |
16 | ./数据源/2012\201208.xls | 2012-08-01 00:00:00 |
limit_output extension: Maximum message size of 10000 exceeded with 43779 characters
# 透视表 交叉统计
tongji01=pd.pivot_table(data, index=['laiyuan','日期'], columns='客户姓名', values='发生额',margins=False, aggfunc=[np.sum])
# 最后,要扁平化MultiIndex列,我们可以只连接元组中的值:
tongji01.columns = ['_'.join(col).replace("sum_","") for col in tongji01.columns.values]
tongji01 = tongji01.reset_index()
tongji01.head(10)
laiyuan | 日期 | 姓名001 | 姓名002 | 姓名003 | 姓名004 | 姓名005 | 姓名006 | 姓名007 | 姓名008 | ... | 姓名073 | 姓名074 | 姓名075 | 姓名076 | 姓名077 | 姓名078 | 姓名079 | 姓名080 | 姓名081 | 姓名082 | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | ./数据源/2012\201208.xls | 2012-08-01 | 9340.0 | 990.0 | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
1 | ./数据源/2012\201209.xls | 2012-09-01 | 954.0 | 990.0 | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
2 | ./数据源/2012\201210.xls | 2012-10-01 | 934.0 | 990.0 | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
3 | ./数据源/2012\201211.xls | 2012-11-01 | 934.0 | 990.0 | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
4 | ./数据源/2012\201212.xls | 2012-12-01 | NaN | NaN | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
5 | ./数据源/2013\201301.xls | 2013-01-01 | NaN | NaN | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
6 | ./数据源/2013\201302.xls | 2013-02-01 | NaN | NaN | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
7 | ./数据源/2013\201303.xls | 2013-03-01 | NaN | NaN | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
8 | ./数据源/2013\201304.xls | 2013-04-01 | NaN | NaN | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
9 | ./数据源/2013\201305.xls | 2013-05-01 | NaN | NaN | 1106.0 | 748.0 | 1070.0 | 1196.0 | 826.0 | 854.0 | ... | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN | NaN |
10 rows × 84 columns
# 按列找异常值,并用语言描述出来
import pandas as pd
cun = tongji01.copy() #将数据块复制,后面发现的异常值进行覆盖
hangzuobiao = tongji01.laiyuan.to_list() # 将来源列转列表
suo = tongji01 #.iloc[:,1:4] # 为了缩小范围,提高实验速度
print(suo.shape)
lieming = suo.columns.to_list() #将列标题转列表
for index,col in enumerate(lieming): # 遍历列名,lie_max = suo[col].value_counts().index[:1].values # 关键点,找到该列频率最高的值gailie_max = lie_max[0] # 取当前值,上面计算出来的是列表for hang in range(suo.shape[0]): # 按行总数遍历 ,if suo[col].dtype in ["int64", "float64"]: #判断是否都是数字danqian = suo.loc[hang,col] # 取出当前值if pd.isnull(danqian) == True or danqian == gailie_max: # 为空跳过,相等跳过continue
# print("当前值:"+str(danqian))
# print("当列出现次数最多的:"+str(gailie_max))yichang = (gailie_max-danqian)/gailie_max
# print(yichang)if abs(yichang)>0.6: # 差额相除取绝对值,看是否超过6成zhi = "异常"+str(hangzuobiao[hang])[:10]+"_"+str(col)+"_:"+str(suo.loc[hang,col])cun.iloc[hang,index] = zhi
cun.to_excel("N175_标记异常值_结果.xlsx")
(107, 84)
from styleframe import StyleFrame
import openpyxl
#可以自我覆盖,但是这样有风险,为了列宽自适应,筛选,冻结窗口
file_path = "N175_标记异常值_结果.xlsx"
df=pd.read_excel(file_path)
col = df.columns.values.tolist()
excel_writer = StyleFrame.ExcelWriter(file_path)
sf = StyleFrame(df)
sf.to_excel(excel_writer=excel_writer, best_fit=col,columns_and_rows_to_freeze='a2', row_to_add_filters=0,
)
excel_writer.save()
# 将句柄关闭,否则是只读格式,打不开
excel_writer.handles = None #这句代码有用,很久才找到
print("well done")
well done
原始数据
处理后的数据
这篇关于N175_标记异常值_自动汇聚某单位的住房公积金 有补发有增减 如何批量找出来的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!