使用Python整理数据集，规范化数据

本文主要是介绍使用Python整理数据集，规范化数据，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

写在前面的话：经过大量的采图实验，数据散落各处，设备也调整过，标定参数之类的也都不一致，以前整理数据都是用的C/C++傻乎乎地system(“cp str str”)虽然知道shell更简单，但是毕竟懒，现在发现PY真是一把利器。

1.0按照指定的文件路径组织文件

已知部分（多数数据）数据目录结构如下：

用下面的方式整理实验数据，写完这个也可以当是PY学习的小例子：

import os
import shutildirroot = "O:\\380A"
dir1 = [d for d in os.listdir(dirroot) if os.path.isdir(os.path.join(dirroot, d))]
dir1 = [os.path.join(dirroot, d) for d in dir1]
dir1 = [os.path.join(d, "1") for d in dir1]
dirs = []
for d in dir1:dirs.extend( os.path.join(d, dd) for dd in os.listdir(d))
dirtmp = dirs.copy()
dirs.clear()
for d in dirtmp:dirs.extend(os.path.join(d, dd) for dd in os.listdir(d))tardir = "O:\\chengdudongdataset\\380Adata"
calibdir = "O:\\chengdudongdataset\\calib\\chengdudongIU20181122bd"
cooresponding = []
error_s = []
for index, d in enumerate(dirs):if not os.path.isdir(d):continuedir_d = os.path.join(tardir, ("%05d" % index))os.mkdir(dir_d)##vector<string> robot1{ "1-r.jpg","1-t.jpg","2-r.jpg", "2-t.jpg","1-g.jpg", "2-g.jpg","2d.jpg"};#vector<string> robot2{ "3-r.jpg","3-t.jpg","4-r.jpg", "4-t.jpg","3-g.jpg", "4-g.jpg","2d.jpg"};robot1_b = os.path.exists(os.path.join(d, "1-r.jpg"))robot2_b = os.path.exists(os.path.join(d, "3-r.jpg"))try:# FileNotFoundError: [Errno 2] No such file or directory: '# O:\\380A\\20190311-081911 (CRH380A-2867(B)) {10560529268737}# \\1\\1\\01.04.15.23.1-2\\1-t.jpg'if robot1_b:shutil.copyfile(os.path.join(d, "1-r.jpg"), os.path.join(dir_d, "img2r.jpg"))shutil.copyfile(os.path.join(d, "1-t.jpg"), os.path.join(dir_d, "img2t.jpg"))shutil.copyfile(os.path.join(d, "1-g.jpg"), os.path.join(dir_d, "img2g.jpg"))shutil.copyfile(os.path.join(d, "2-r.jpg"), os.path.join(dir_d, "img1r.jpg"))shutil.copyfile(os.path.join(d, "2-t.jpg"), os.path.join(dir_d, "img1t.jpg"))shutil.copyfile(os.path.join(d, "2-g.jpg"), os.path.join(dir_d, "img1g.jpg"))shutil.copyfile(os.path.join(d, "2d.jpg"), os.path.join(dir_d, "imgcom.jpg"))shutil.copyfile(os.path.join(calibdir, "para_stero_1_2.xml"), os.path.join(dir_d, "para_stereo.xml"))shutil.copyfile(os.path.join(calibdir, "para1.xml"), os.path.join(dir_d, "cam1.xml"))shutil.copyfile(os.path.join(calibdir, "para2.xml"), os.path.join(dir_d, "cam2.xml"))elif robot2_b:shutil.copyfile(os.path.join(d, "3-r.jpg"), os.path.join(dir_d, "img2r.jpg"))shutil.copyfile(os.path.join(d, "3-t.jpg"), os.path.join(dir_d, "img2t.jpg"))shutil.copyfile(os.path.join(d, "3-g.jpg"), os.path.join(dir_d, "img2g.jpg"))shutil.copyfile(os.path.join(d, "4-r.jpg"), os.path.join(dir_d, "img1r.jpg"))shutil.copyfile(os.path.join(d, "4-t.jpg"), os.path.join(dir_d, "img1t.jpg"))shutil.copyfile(os.path.join(d, "4-g.jpg"), os.path.join(dir_d, "img1g.jpg"))shutil.copyfile(os.path.join(d, "2d.jpg"), os.path.join(dir_d, "imgcom.jpg"))shutil.copyfile(os.path.join(calibdir, "para_stero_3_4.xml"), os.path.join(dir_d, "para_stereo.xml"))shutil.copyfile(os.path.join(calibdir, "para3.xml"), os.path.join(dir_d, "cam1.xml"))shutil.copyfile(os.path.join(calibdir, "para4.xml"), os.path.join(dir_d, "cam2.xml"))else:passcoor = (("%05d" % index)+'\t'+d+'\n')cooresponding.append(coor)except:coor = (("%05d" % index)+'\t'+d+'\n')error_s.append(coor)
cooresponding[-1] = cooresponding[-1][:-1]with open(os.path.join(tardir, "cooresponding.txt"), 'w') as file_obj:for coor in cooresponding:file_obj.write(coor)if len(error_s):error_s[-1] = error_s[-1][:-1]with open(os.path.join(tardir, "error_s.txt"), 'w') as file_error_obj:for coor in error_s:file_error_obj.write(coor)

整理完就是：
整理后

2.0 制作简单GUI工具筛选数据。使用tkinter

先设计一个大概的应用界面：
GUI设计
这是最终完成的样子：
最终设计结果
Code如下：通过这个例子就可以基本摸清楚tkinter的套路了，和HTML，Android XML这些常见GUI很类似
纠正：做标记时候建议用：1、2、4、8、来做标记，因为这样方便位运算。我用了1/2/3来做标记也可以用，但是不建议这样做

# -*- coding: utf-8 -*-
"""
Created on Wed Jul 17 16:57:48 2019@author: frank
"""
import os
import tkinter as tk
from PIL import Image, ImageTk
window = tk.Tk()
window.title('西南交大光电工程研究所数据分类工具_ClassifyTo3 - - - Quasimo')
window.geometry('1000x730')# 撤销操作的标记，这个标记用于记录上一次的按键操作是啥，所有的操作会被记录进optList,
# optList的标记分别对应dirs的每个目录的数据是什么# 各个标记的定义
TRAIN_CLASS = 1
NOT_WELL_CLASS = 2
TOO_MUCH_NOISE = 3
SPECIAL_PART = 8
# 当前图像的标记
optAndMark = 0dirs = []global dirRoot
dirRoot = r'D:\chengdudongdataset\stereodata'
with open(os.path.join(dirRoot, "dirList_windows.txt"), 'r') as file_obj:dirs = file_obj.readlines()
for idx, d in enumerate(dirs):dirs[idx] = dirs[idx].rstrip()trainDirList = []
notVeryWell = []
tooMuchNoise = []
specialPart = []
global optMarks, isWheelFlag
optMarks = ['0']*len(dirs)
isWheelFlag = Falseglobal indexOfDirs
indexOfDirs = 0
global dispImg
global img_show
pathBingLookVar = tk.StringVar()
pathBingLookVar.set('this is path')if os.path.exists(os.path.join(dirRoot, "optMarks.txt")):with open(os.path.join(dirRoot, "optMarks.txt"), 'r') as file_obj:optMarks = file_obj.readlines()for idx, d in enumerate(optMarks):optMarks[idx] = optMarks[idx].rstrip()
indexOfDirs = optMarks.index('0')def openImgAndShow():bkgimg = Image.open(os.path.join(dirs[indexOfDirs], 'dispRainbowBlack.png'))pathBingLookVar.set(dirs[indexOfDirs])global dispImgdispImg = ImageTk.PhotoImage(bkgimg)global img_showimg_show.configure(image=dispImg)progressbarVar.set(str(indexOfDirs) +' / ' + str(len(dirs)))#canvas = tk.Canvas(frame2, bg='green', height=540, width=960)#image = canvas.create_image(0, 0, anchor='NW',image=image_file) def saveBtFun():#保存当前进度global dirRootglobal optMarkswith open(os.path.join(dirRoot, "optMarks.txt"), 'w') as file_obj:for recor in optMarks:file_obj.writelines(recor+"\n")returndef revocationBtFun():#撤销一步global indexOfDirsindexOfDirs -= 1global optMarks,isWheelFlagoptMarks[indexOfDirs] = str(0)isWheelFlag = FalseopenImgAndShow()returndef wheelSurfBtFun():# 标记这是 : 车轮踏面global isWheelFlagisWheelFlag = SPECIAL_PARTreturndef tooMuchNoiseBtFun():# 标记这个 : 太多噪声global indexOfDirs,dirsglobal optMarks,isWheelFlagoptMarks[indexOfDirs] = str(TOO_MUCH_NOISE | isWheelFlag)isWheelFlag = FalseindexOfDirs += 1if indexOfDirs == len(dirs):saveBtFun()exit()openImgAndShow()returndef notVeryWellBtFun():# 标记这个 : 不是很好global indexOfDirsglobal optMarks,isWheelFlagoptMarks[indexOfDirs] = str(NOT_WELL_CLASS | isWheelFlag)isWheelFlag = FalseindexOfDirs += 1if indexOfDirs == len(dirs):saveBtFun()exit()openImgAndShow()return
global switchFlag
switchFlag = True
def switchPic():# 查看一下实际图，global switchFlagglobal img_showglobal dispImgif switchFlag:bkgimg = Image.open(os.path.join(dirs[indexOfDirs], 'img1tCV8UC3.png'))dispImg = ImageTk.PhotoImage(bkgimg)img_show.configure(image=dispImg)switchFlag = not switchFlagelse:bkgimg = Image.open(os.path.join(dirs[indexOfDirs], 'dispRainbowBlack.png'))dispImg = ImageTk.PhotoImage(bkgimg)img_show.configure(image=dispImg)switchFlag = not switchFlagreturn
def trainDataBtFun():# 标记这个 : 数据可用global indexOfDirsglobal optMarks,isWheelFlagoptMarks[indexOfDirs] = str(TRAIN_CLASS | isWheelFlag)isWheelFlag = FalseindexOfDirs += 1if indexOfDirs == len(dirs):saveBtFun()exit()openImgAndShow()returnframe = window
frame1 = tk.Frame(master=frame, bg='#000fff000', borderwidth=10)
frame2 = tk.Frame(master=frame, borderwidth=10)
frame3 = tk.Frame(master=frame, bg='red', borderwidth=10)
frame4 = tk.Frame(master=frame, borderwidth=10)#progressbar = tk.Scale(window)#不用canvas了pathBingLook = tk.Label(master=frame1,font=('Arial', 12), width =35, textvariable = pathBingLookVar).pack(side='left', fill='x',expand='yes')
progressbarVar = tk.StringVar()
progressbarVar.set('step / Num');
progressbar = tk.Label(master=frame1, bg='red', font=('Arial', 12), width =15, textvariable = progressbarVar).pack(side='left')
frame1.pack(side='top', fill='both',expand='NO')bkgimg = Image.open(os.path.join(dirs[indexOfDirs], 'dispRainbowBlack.png'))
global dispImg
dispImg = ImageTk.PhotoImage(bkgimg) 
img_show = tk.Label(master=frame2, image = dispImg,height=540,width=960)
img_show.pack()
#canvas = tk.Canvas(frame2, bg='green', height=540, width=960)
#image = canvas.create_image(0, 0, anchor='NW',image=image_file) 
frame2.pack(side='top', fill='both')saveBt = tk.Button(frame3, text = '保存进度',font=('宋体', 17), command=saveBtFun).pack( side='left', anchor='center', expand='YES')
lookLook = tk.Button(frame3, text = 'LookLook',font=('宋体', 17), command=switchPic).pack( side='left', anchor='center', expand='YES')
revocationBt = tk.Button(frame3, text = '撤销一步',font=('宋体', 17), command=revocationBtFun).pack( side='left', anchor='center', expand='YES')
wheelSurfBt = tk.Button(frame3, text = '车轮踏面',font=('宋体', 17), command=wheelSurfBtFun).pack( side='left', anchor='center', expand='YES')
frame3.pack(side='top', fill='both',expand='NO')tooMuchNoiseBt = tk.Button(frame4, text = '不能用',font=('宋体', 17), command=tooMuchNoiseBtFun).pack(side='left', anchor='center', expand='YES')
notVeryWellBt = tk.Button(frame4, text = '还可以',font=('宋体', 17), command=notVeryWellBtFun).pack(side='left', anchor='center', expand='YES')
trainDataBt = tk.Button(frame4, text = '可以用',font=('宋体', 17), command=trainDataBtFun).pack(side='left', anchor='center', expand='YES')
frame4.pack(side='top', fill='both',expand='NO')window.mainloop()