本文主要介绍 Python 数据预处理练习,希望能为大家解决编程问题提供一定的参考,有需要的开发者可以跟随小编一起学习!
[python] view plain copy
- # -*- coding: utf-8 -*-
- import math
- import re
- import csv
def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a two-dimensional list.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed to ``open`` (the caller in this file
            passes "r").

    Returns:
        A list with one entry per CSV record; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block closes the handle even when parsing raises part-way through.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))
def getLine(inList, Line):
    """Return the row stored at index ``Line`` of the matrix ``inList``."""
    selected = inList[Line]
    return selected
def getRow(inList, Row):
    """Return column ``Row`` of ``inList`` as a new list, one value per row."""
    return [record[Row] for record in inList]
def setLine(inList, childList, Line):
    """Replace the row at index ``Line`` of ``inList`` with ``childList``.

    The matrix is modified in place; nothing is returned.
    """
    inList[Line] = childList
    return None
def setRow(inList, chikdList, Row):
    """Overwrite column ``Row`` of ``inList`` with the values of ``chikdList``.

    Value number ``k`` of ``chikdList`` is written to row ``k``; rows beyond
    the length of ``chikdList`` are left as they are. The matrix is modified
    in place.
    """
    for position, value in enumerate(chikdList):
        inList[position][Row] = value
def addLine(inList, childLine):
    """Append ``childLine`` to ``inList`` as a new last row (in place)."""
    inList.append(childLine)
    return None
def addRow(inList, childRow):
    """Append a new column to ``inList`` in place.

    Row number ``k`` of the matrix receives ``childRow[k]`` as its new last
    cell, so ``childRow`` needs at least one value per row.
    """
    for position, record in enumerate(inList):
        record.append(childRow[position])
def getAVG(inList):
    """Return the arithmetic mean of the numeric cells of ``inList``.

    Cells that cannot be read as a finite number (header text, blanks,
    NaN/inf spellings, ...) are skipped.

    Args:
        inList: Iterable of cells, typically the strings of one CSV column.

    Returns:
        The mean as a float, or the message string "当前特征无平均值" when the
        column holds no numeric cell.
    """
    total = 0.0
    count = 0
    for cell in inList:
        # Let float() decide what is numeric: the former r'[0-9]+' prefix
        # test skipped negatives and ".5", and crashed on cells such as
        # "12abc" that merely start with a digit.
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        total += value
        count += 1
    if count == 0:
        return "当前特征无平均值"
    return total / count
def getAVE(inList):
    """Return the population variance of the numeric cells of ``inList``.

    Cells that cannot be read as a finite number are skipped.

    Args:
        inList: Iterable of cells, typically the strings of one CSV column.

    Returns:
        The variance (mean squared deviation from the mean, divided by the
        number of numeric cells) as a float, or the message string
        "当前特征无方差" when the column holds no numeric cell.
    """
    values = []
    for cell in inList:
        # float() decides what is numeric; the former r'[0-9]+' prefix test
        # skipped negatives and crashed on cells like "12abc".
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        values.append(value)
    if not values:
        return "当前特征无方差"
    mean = sum(values) / len(values)
    squared_deviation = sum((value - mean) ** 2 for value in values)
    # The previous code returned sqrt(sum of squared deviations), which is
    # neither the variance nor the standard deviation; zScore takes the
    # square root of this result, so the variance is what it needs.
    return squared_deviation / len(values)
def average(seq, total=0.0):
    """Return the mean of the items of ``seq``.

    ``total`` is the starting value of the running sum; the result is the
    final sum divided by the number of items in ``seq``.
    """
    count = 0
    for count, item in enumerate(seq, 1):
        total = total + item
    return total / count
def getQUANTILE(inList, inlocaltion):
    """Return a quantile of the numeric cells of ``inList``.

    Cells that cannot be read as a finite number are skipped.

    Args:
        inList: Iterable of cells, typically the strings of one CSV column.
        inlocaltion: Requested quantile position, ``0 <= inlocaltion < 1``.

    Returns:
        For 0.5 the median (mean of the two middle values when the count is
        even); otherwise the sorted value at index ``int(count * position)``.
        The message string "输入的分位数数值错误" is returned for a position
        outside ``[0, 1)`` and "当前特征不可求中位数" when the column holds no
        numeric cell.
    """
    if not 0 <= inlocaltion < 1:
        return "输入的分位数数值错误"
    values = []
    for cell in inList:
        # float() decides what is numeric; the former r'[0-9]+' prefix test
        # skipped negatives and crashed on cells like "12abc".
        try:
            value = float(cell)
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        values.append(value)
    if not values:
        return "当前特征不可求中位数"
    values.sort()
    size = len(values)
    if inlocaltion == 0.5:
        middle = size // 2
        if size % 2 == 1:
            return values[middle]
        return (values[middle - 1] + values[middle]) / 2.0
    # Clamp so float rounding of size * position can never index past the end.
    return values[min(int(size * inlocaltion), size - 1)]
def fileREAD(fileURL, access):
    """Read a CSV file and return its contents as a two-dimensional list.

    NOTE: this is a second definition of ``fileREAD`` in the same file; being
    the later one, it is the definition that stays bound to the name.

    Args:
        fileURL: Path of the CSV file to read.
        access: Mode string handed to ``open`` (the caller in this file
            passes "r").

    Returns:
        A list with one entry per CSV record; each entry is the list of
        string fields produced by ``csv.reader``.
    """
    # ``open`` replaces the Python-2-only ``file`` builtin, and the ``with``
    # block closes the handle even when parsing raises part-way through.
    with open(fileURL, access) as csvfile:
        return list(csv.reader(csvfile))
def removeNoiseAuto(inList):
    """Blank out outliers of a column using the interquartile range.

    A cell at index >= 1 (index 0 is skipped, presumably the column header)
    is replaced by '' when it lies more than 1.5 * (Q3 - Q1) above Q3 or
    below Q1, with both quartiles taken from ``getQUANTILE``.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object.
    """
    Q3 = getQUANTILE(inList, 0.75)
    Q1 = getQUANTILE(inList, 0.25)
    # getQUANTILE reports failure with a message string; without quartiles
    # nothing can be classified, so leave the column untouched instead of
    # failing on ``str - str``.
    if isinstance(Q1, str) or isinstance(Q3, str):
        return inList
    fence = 1.5 * (Q3 - Q1)
    for i in range(1, len(inList)):
        # Blank or textual cells are not outliers; float('') used to crash.
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if value - Q3 > fence or Q1 - value > fence:
            inList[i] = ''
    return inList
def removeNoiseByThresholdMin(inList, inThresholdMin):
    """Blank out the cells of a column that fall below a minimum threshold.

    Cells at index >= 1 (index 0 is skipped, presumably the column header)
    whose numeric value is smaller than ``inThresholdMin`` are replaced by
    ''. Cells that are not numeric are left unchanged.

    Args:
        inList: Column as a list of cells; modified in place.
        inThresholdMin: Smallest value that is kept.

    Returns:
        The same list object.
    """
    for i in range(1, len(inList)):
        # Cells already blanked (or holding text) used to crash float().
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if value < inThresholdMin:
            inList[i] = ''
    return inList
def removeNoiseByThresholdMax(inList, inThresholdMax):
    """Blank out the cells of a column that exceed a maximum threshold.

    Cells at index >= 1 (index 0 is skipped, presumably the column header)
    whose numeric value is larger than ``inThresholdMax`` are replaced by ''.
    Cells that are not numeric are left unchanged.

    Args:
        inList: Column as a list of cells; modified in place.
        inThresholdMax: Largest value that is kept.

    Returns:
        The same list object.
    """
    for i in range(1, len(inList)):
        # Cells already blanked (or holding text) used to crash float().
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if value > inThresholdMax:
            inList[i] = ''
    return inList
def autoPaddingByAVG(inList):
    """Fill the blank cells of a column with the column mean.

    Every cell at index >= 1 that equals '' is replaced by ``str(mean)``,
    where the mean comes from ``getAVG(inList)``.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object.
    """
    avg = getAVG(inList)
    # getAVG answers with a message string when the column has no numeric
    # cell; writing that message into the data would corrupt it, so the
    # column is returned as it is.
    if not isinstance(avg, (int, float)):
        return inList
    filler = str(avg)
    for i in range(1, len(inList)):
        if inList[i] == '':
            inList[i] = filler
    return inList
def autoPaddingByMedian(inList):
    """Fill the blank cells of a column with the column median.

    Every cell at index >= 1 that equals '' is replaced by ``str(median)``,
    where the median comes from ``getQUANTILE(inList, 0.5)``.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object.
    """
    median = getQUANTILE(inList, 0.5)
    # getQUANTILE answers with a message string when the column has no
    # numeric cell; writing that message into the data would corrupt it, so
    # the column is returned as it is.
    if not isinstance(median, (int, float)):
        return inList
    filler = str(median)
    for i in range(1, len(inList)):
        if inList[i] == '':
            inList[i] = filler
    return inList
def binningWidth(inList, width):
    """Discretize a column by width-limited bins, smoothing by bin mean.

    The numeric cells at index >= 1 (index 0 is skipped, presumably the
    column header) are sorted by value. A bin starts at the smallest value
    not yet assigned and takes every following value that is at most
    ``width`` larger than that first value. Each cell is then overwritten
    with the mean of its bin (a float). Cells that are not finite numbers
    are left unchanged.

    Args:
        inList: Column as a list of cells; modified in place.
        width: Maximum distance between a bin's first value and any other
            member of that bin.

    Returns:
        The same list object.
    """
    # (value, index) pairs so each mean can be written back to the cell it
    # came from once the values have been sorted.
    indexed = []
    for i in range(1, len(inList)):
        # Blank or textual cells used to abort the whole call in float().
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        indexed.append((value, i))
    indexed.sort()

    bin_start = 0
    for pos in range(1, len(indexed) + 1):
        # Close the running bin at the end of the data or as soon as the
        # next value lies more than ``width`` above the bin's first value.
        if pos == len(indexed) or indexed[pos][0] - indexed[bin_start][0] > width:
            members = indexed[bin_start:pos]
            mean = sum(value for value, _ in members) / len(members)
            for _, cell_index in members:
                inList[cell_index] = mean
            bin_start = pos
    return inList
def binningDeep(inList, deep1):
    """Discretize a column by equal-frequency bins, smoothing by bin mean.

    The numeric cells at index >= 1 (index 0 is skipped, presumably the
    column header) are sorted by value and cut into consecutive bins of
    ``deep1 - 1`` values each; the last bin holds whatever remains. Each
    cell is then overwritten with the mean of its bin (a float). Cells that
    are not finite numbers are left unchanged.

    Args:
        inList: Column as a list of cells; modified in place.
        deep1: One more than the number of values per bin.

    Returns:
        The same list object.

    Raises:
        ValueError: If ``deep1`` is smaller than 2 (no value per bin).
    """
    # NOTE(review): a bin holds deep1 - 1 values, not deep1. This is kept
    # because existing callers get that bin size today - confirm the intent.
    deep = deep1 - 1
    if deep < 1:
        raise ValueError("deep1 must be at least 2")

    # (value, index) pairs so each mean can be written back to the cell it
    # came from once the values have been sorted.
    indexed = []
    for i in range(1, len(inList)):
        # Blank or textual cells used to abort the whole call in float().
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        indexed.append((value, i))
    indexed.sort()

    # Slicing in steps of ``deep`` also covers the cases the index
    # arithmetic used to miss: a final bin that is exactly full (it was
    # never smoothed) and fewer values than one bin (IndexError).
    for start in range(0, len(indexed), deep):
        members = indexed[start:start + deep]
        mean = sum(value for value, _ in members) / len(members)
        for _, cell_index in members:
            inList[cell_index] = mean
    return inList
def oneHot(inList, Row):
    """One-hot encode column ``Row`` of the matrix ``inList`` in place.

    Row 0 is treated as the header row. For every distinct value found in
    the column below the header, one new column is appended to the matrix:
    its header cell is the value itself and each data row receives '1' when
    its cell equals that value, otherwise '0'. New columns are added in the
    order in which the values first appear. Every row of the resulting
    matrix is then printed.

    Args:
        inList: Matrix as a list of row lists; modified in place.
        Row: Index of the column to encode.

    Returns:
        None.
    """
    if not inList:
        # Nothing to encode; reading the header cell used to raise IndexError.
        return
    column = [record[Row] for record in inList]
    values = column[1:]

    # Distinct values in first-seen order, so the layout of the new columns
    # is deterministic instead of following dict ordering.
    categories = []
    seen = set()
    for value in values:
        if value not in seen:
            seen.add(value)
            categories.append(value)

    for category in categories:
        inList[0].append(category)
        for record, value in zip(inList[1:], values):
            record.append('1' if value == category else '0')

    # Printing is kept for backward compatibility; the call form works on
    # Python 2 and 3 alike.
    for record in inList:
        print(record)
def minMax(inList):
    """Min-max normalize the numeric cells of a column in place.

    Each numeric cell at index >= 1 (index 0 is skipped, presumably the
    column header) is replaced by ``(value - min) / (max - min)`` formatted
    with four decimals. Cells that are not finite numbers are left
    unchanged. When all numeric cells share one value they all become
    '0.0000'.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object.
    """
    numeric = []  # (index, value) for every cell that parses as a number
    for i in range(1, len(inList)):
        # float() decides what is numeric; the former r'[0-9]+' prefix test
        # skipped negatives and crashed on cells like "12abc".
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        numeric.append((i, value))
    if not numeric:
        # max()/min() of an empty sequence used to raise ValueError.
        return inList

    values = [value for _, value in numeric]
    low = min(values)
    span = max(values) - low
    for i, value in numeric:
        # A constant column has zero span; map it to 0 instead of dividing
        # by zero.
        scaled = (value - low) / span if span else 0.0
        inList[i] = "%.4f" % scaled
    return inList
def zScore(inList):
    """Z-score normalize the numeric cells of a column in place.

    Each numeric cell at index >= 1 (index 0 is skipped, presumably the
    column header) is replaced by ``(value - mean) / std`` formatted with
    four decimals, where mean and population standard deviation are taken
    over those same cells. Cells that are not finite numbers are left
    unchanged. When the standard deviation is zero every numeric cell
    becomes '0.0000'. The incoming list is printed first.

    Args:
        inList: Column as a list of cells; modified in place.

    Returns:
        The same list object.
    """
    # Printing is kept for backward compatibility; the call form works on
    # Python 2 and 3 alike.
    print(inList)

    numeric = []  # (index, value) for every cell that parses as a number
    for i in range(1, len(inList)):
        # float() decides what is numeric; the former r'[0-9]+' prefix test
        # skipped negatives and crashed on cells like "12abc".
        try:
            value = float(inList[i])
        except (TypeError, ValueError):
            continue
        if math.isnan(value) or math.isinf(value):
            continue
        numeric.append((i, value))
    if not numeric:
        return inList

    # Mean and deviation are computed here so the result does not depend on
    # what the separate variance helper returns.
    count = len(numeric)
    mean = sum(value for _, value in numeric) / count
    variance = sum((value - mean) ** 2 for _, value in numeric) / count
    stand = math.sqrt(variance)
    for i, value in numeric:
        # Zero spread: every value equals the mean, so its score is 0.
        score = (value - mean) / stand if stand else 0.0
        inList[i] = "%.4f" % score
    return inList
def similarityDistance(inList1, inList2, n):
    """Return the Minkowski distance of order ``n`` between two columns.

    Cells at index >= 1 (index 0 is skipped, presumably the column header)
    are compared pairwise: the result is the ``n``-th root of the sum of
    ``|a - b| ** n``; ``n = 2`` gives the Euclidean distance.

    Args:
        inList1: First column; its length decides how many cells are read.
        inList2: Second column, at least as long as ``inList1``.
        n: Order of the distance, a positive number.

    Returns:
        The distance as a float.

    Raises:
        ValueError: If ``n`` is not positive, or a compared cell is not
            numeric.
    """
    if n <= 0:
        raise ValueError("n must be positive")
    total = sum(
        abs(float(inList1[i]) - float(inList2[i])) ** n
        for i in range(1, len(inList1))
    )
    # The root has to match the exponent; a fixed square root was only
    # right for n == 2.
    return pow(total, 1.0 / n)
def similaritySim(inList1, inList2):
    """Return the cosine similarity between two columns.

    Cells at index >= 1 (index 0 is skipped, presumably the column header)
    are read as floats. The dot product runs over the indexes of
    ``inList1``; each norm runs over the full length of its own column.

    Args:
        inList1: First column.
        inList2: Second column, at least as long as ``inList1``.

    Returns:
        The cosine of the angle between the two vectors, or 0.0 when either
        vector has zero length.
    """
    dot = sum(
        float(inList1[i]) * float(inList2[i]) for i in range(1, len(inList1))
    )
    norm1 = math.sqrt(sum(float(cell) ** 2 for cell in inList1[1:]))
    norm2 = math.sqrt(sum(float(cell) ** 2 for cell in inList2[1:]))
    denominator = norm1 * norm2
    if denominator == 0:
        # The angle to a zero vector is undefined; report no similarity
        # instead of dividing by zero.
        return 0.0
    return dot / denominator
- # Top-level statement: read the training CSV into fileInput (a list of rows).
- # NOTE(review): the path is a hard-coded absolute Windows location.
- fileInput = fileREAD("D:\\PythonWorkSpace\\ExternalFile\\train.csv","r")
- # The commented-out calls below are usage examples for the helpers defined
- # above; they are written with Python 2 print statements.
- # # Get one row of data
- # print getLine(fileInput,1)
- #
- # # Get one column of data
- # print getRow(fileInput,0)
- # # Set one row of data
- # print "设置前:"
- # print getLine(fileInput,1)
- # setLine(fileInput,getLine(fileInput,2),1)
- # print "设置后:"
- # print getLine(fileInput,1)
- # # Set one column of data
- # print "设置前:"
- # print getRow(fileInput,1)
- # setRow(fileInput,getRow(fileInput,2),1)
- # print "设置后:"
- # print getRow(fileInput,1)
- # # Mean
- # print getAVG(getRow(fileInput,9))
- # # Variance
- # print getAVE(getRow(fileInput,9))
- # # Quantile
- # print getQUANTILE(getRow(fileInput,9),0.5)
- # # Noise filtering 1
- # print removeNoiseAuto(getRow(fileInput,1))
- #
- # # Noise filtering 2
- # print removeNoiseByThresholdMin(getRow(fileInput,0),10)
- #
- # # Noise filtering 3
- # print removeNoiseByThresholdMax(getRow(fileInput,0),10)
- # # Missing-value imputation 1
- # print autoPaddingByAVG(getRow(fileInput,0))
- #
- # # Missing-value imputation 2
- # print autoPaddingByMedian(getRow(fileInput,0))
- # # Equal-width binning
- # print binningWidth(getRow(fileInput,0),3)
- #
- # # Equal-frequency binning
- # print binningDeep(getRow(fileInput,0),3)
- # # One-hot encoding
- # oneHot(fileInput,1)
- # for i in fileInput:
- # print i
- # # Min-max normalization
- # print minMax(getRow(fileInput,0))
- #
- # # z-score normalization
- # print zScore(getRow(fileInput,0))
- # # Distance similarity
- # print similarityDistance(getRow(fileInput,0),getRow(fileInput,0),2)
- # # Cosine similarity
- # print similaritySim(getRow(fileInput,0),getRow(fileInput,1))
原文地址:http://blog.csdn.net/u012155582/article/details/52051776
这篇关于 Python 数据预处理练习的文章就介绍到这儿,希望我们推荐的文章对各位程序员有所帮助!