将caltech数据集转换成VOC格式

本文主要是介绍将caltech数据集转换成VOC格式，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

目的：将Caltech行人数据集转换为Pascal VOC格式

参考来源https://www.cnblogs.com/arkenstone/p/7337077.html 但是这里面的代码有一些问题，我在其中修改了一些
操作步骤如下：

将下载好的caltech pedestrian dataset解压，数据集下载地址，并按如下格式存放：（最好是按照下图的格式存放，不然容易报错）

运行程序如下：

import os, glob, argparse
import cv2
from scipy.io import loadmat
from collections import defaultdict
import numpy as np
from lxml import etree, objectify
"""
将caltech数据集格式转换成VOC格式，即.seq转换成.jpg; .vbb转换成xml
"""def vbb_anno2dict(vbb_file, cam_id, person_types=None):"""Parse caltech vbb annotation file to dictArgs:vbb_file: input vbb file pathcam_id: camera idperson_types: list of person type that will be used (total 4 types: person, person-fa, person?, people).If None, all will be used:Return:Annotation info dict with filename as key and anno info as value"""filename = os.path.splitext(os.path.basename(vbb_file))[0]annos = defaultdict(dict)vbb = loadmat(vbb_file)# object info in each frame: id, pos, occlusion, lock, posvobjLists = vbb['A'][0][0][1][0]objLbl = [str(v[0]) for v in vbb['A'][0][0][4][0]]# person indexif not person_types:person_types = ["person", "person-fa", "person?", "people"]person_index_list = [x for x in range(len(objLbl)) if objLbl[x] in person_types]for frame_id, obj in enumerate(objLists):if len(obj) > 0:frame_name = str(cam_id) + "_" + str(filename) + "_" + str(frame_id+1) + ".jpg"annos[frame_name] = defaultdict(list)annos[frame_name]["id"] = frame_namefor fid, pos, occl in zip(obj['id'][0], obj['pos'][0], obj['occl'][0]):fid = int(fid[0][0]) - 1  # for matlab start from 1 not 0if not fid in person_index_list:  # only use bbox whose label is given person typecontinueannos[frame_name]["label"] = objLbl[fid]pos = pos[0].tolist()occl = int(occl[0][0])annos[frame_name]["occlusion"].append(occl)annos[frame_name]["bbox"].append(pos)if not annos[frame_name]["bbox"]:del annos[frame_name]return annosdef seq2img(annos, seq_file, outdir, cam_id):"""Extract frames in seq files to given output directoriesArgs:annos: annos dict returned from parsed vbb fileseq_file: seq file pathoutdir: frame save dircam_id: camera idReturns:camera captured image size"""cap = cv2.VideoCapture(seq_file)index = 1# captured frame listv_id = os.path.splitext(os.path.basename(seq_file))[0]cap_frames_index = np.sort([int(os.path.splitext(id)[0].split("_")[2]) for id in annos.keys()])while True:ret, frame = cap.read()if ret:if not index in cap_frames_index:index += 1continueif not os.path.exists(outdir):os.makedirs(outdir)outname = os.path.join(outdir, str(cam_id)+"_"+v_id+"_"+str(index)+".jpg")print("Current frame: ", v_id, str(index))cv2.imwrite(outname, frame)height, width, _ = frame.shapeelse:breakindex += 1img_size = (width, height)return img_sizedef instance2xml_base(anno, img_size, bbox_type='xyxy'):"""Parse annotation data to VOC XML formatArgs:anno: annotation info returned by vbb_anno2dict functionimg_size: camera captured image sizebbox_type: bbox coordinate record format: xyxy (xmin, ymin, xmax, ymax); xywh (xmin, ymin, width, height)Returns:Annotation xml info tree"""assert bbox_type in ['xyxy', 'xywh']E = objectify.ElementMaker(annotate=False)anno_tree = E.annotation(E.folder('VOC2014_instance/person'),E.filename(anno['id']),E.source(E.database('Caltech pedestrian'),E.annotation('Caltech pedestrian'),E.image('Caltech pedestrian'),E.url('None')),E.size(E.width(img_size[0]),E.height(img_size[1]),E.depth(3)),E.segmented(0),)for index, bbox in enumerate(anno['bbox']):bbox = [float(x) for x in bbox]if bbox_type == 'xyxy':xmin, ymin, w, h = bboxxmax = xmin+wymax = ymin+helse:xmin, ymin, xmax, ymax = bboxxmin = int(xmin)ymin = int(ymin)xmax = int(xmax)ymax = int(ymax)if xmin < 0:xmin = 0if xmax > img_size[0] - 1:xmax = img_size[0] - 1if ymin < 0:ymin = 0if ymax > img_size[1] - 1:ymax = img_size[1] - 1if ymax <= ymin or xmax <= xmin:continueE = objectify.ElementMaker(annotate=False)anno_tree.append(E.object(E.name(anno['label']),E.bndbox(E.xmin(xmin),E.ymin(ymin),E.xmax(xmax),E.ymax(ymax)),E.difficult(0),E.occlusion(anno["occlusion"][index])))return anno_treedef parse_anno_file(vbb_inputdir, seq_inputdir, vbb_outputdir, seq_outputdir, person_types=None):"""Parse Caltech data stored in seq and vbb files to VOC xml formatArgs:vbb_inputdir: vbb file saved pthseq_inputdir: seq file saved pathvbb_outputdir: vbb data converted xml file saved pathseq_outputdir: seq data converted frame image file saved pathperson_types: list of person type that will be used (total 4 types: person, person-fa, person?, people).If None, all will be used:"""# annotation sub-directories in hda annotation input directoryassert os.path.exists(vbb_inputdir)sub_dirs = os.listdir(vbb_inputdir)for sub_dir in sub_dirs:print("Parsing annotations of camera: ", sub_dir)cam_id = sub_dirvbb_files = glob.glob(os.path.join(vbb_inputdir, sub_dir, "*.vbb"))for vbb_file in vbb_files:annos = vbb_anno2dict(vbb_file, cam_id, person_types=person_types)if annos:vbb_outdir = os.path.join(vbb_outputdir, "annotations", sub_dir, "bbox")# extract frames from seqseq_file = os.path.join(seq_inputdir, sub_dir, os.path.splitext(os.path.basename(vbb_file))[0]+".seq")seq_outdir = os.path.join(seq_outputdir, sub_dir, "frame")if not os.path.exists(vbb_outdir):os.makedirs(vbb_outdir)if not os.path.exists(seq_outdir):os.makedirs(seq_outdir)img_size = seq2img(annos, seq_file, seq_outdir, cam_id)for filename, anno in sorted(annos.items(), key=lambda x: x[0]):if "bbox" in anno:anno_tree = instance2xml_base(anno, img_size)outfile = os.path.join(vbb_outdir, os.path.splitext(filename)[0]+".xml")print("Generating annotation xml file of picture: ", filename)etree.ElementTree(anno_tree).write(outfile, pretty_print=True)def visualize_bbox(xml_file, img_file):import cv2tree = etree.parse(xml_file)# load imageimage = cv2.imread(img_file)# get bboxfor bbox in tree.xpath('//bndbox'):coord = []for corner in bbox.getchildren():coord.append(int(float(corner.text)))# draw rectangle# coord = [int(x) for x in coord]image = cv2.rectangle(image, (coord[0], coord[1]), (coord[2], coord[3]), (0, 0, 255), 2)# visualize imagecv2.imshow("test", image)cv2.waitKey(0)#注意！以下输入输出目录地址请参照上文示意图，确保seq文件被正确读入，不然可能会产生空文件夹或报错def main():# parser = argparse.ArgumentParser()# parser.add_argument("seq_dir", default="C:\\workspace\\data\\images", help="Caltech dataset seq data root directory")# parser.add_argument("vbb_dir", default="C:\\workspace\\data\\annotations", help="Caltech dataset vbb data root directory")# parser.add_argument("output_dir",default="C:\\workspace\\data\\VOCdevkit2007\\Caltech", help="Root saving path for frame and annotation files")# # parser.add_argument("person_type", default="person", type=str, help="Person type extracted within 4 options: "# #                                                   "'person', 'person-fa', 'person?', 'people'. If multiple type used,"# #                                                   "separated with comma",# #                     choices=["person", "person-fa", "person?", "people"])# parser.add_argument("person_type", default="person", type=str)# args = parser.parse_args()outdir = "/VOCdevkit2007"#outdir = args.output_dirframe_out = os.path.join(outdir, "frame")anno_out = os.path.join(outdir, "annotation")# person_type = args.person_type.split(",")person_type = "person"seq_dir = "/Caltech"vbb_dir = "/Caltech/annotations"parse_anno_file(vbb_dir, seq_dir, anno_out, frame_out, person_type)print("Generating done!")
#
# def test():
#     seq_inputdir = "/startdt_data/caltech_pedestrian_dataset"
#     vbb_inputdir = "/startdt_data/caltech_pedestrian_dataset/annotations"
#     seq_outputdir = "/startdt_data/caltech_pedestrian_dataset/test"
#     vbb_outputdir = "/startdt_data/caltech_pedestrian_dataset/test"
#     person_types = ["person"]
#     parse_anno_file(vbb_inputdir, seq_inputdir, vbb_outputdir, seq_outputdir, person_types=person_types)
#
#     # xml_file = "/startdt_data/caltech_pedestrian_dataset/annotations/set00/bbox/set00_V013_1511.xml"
#     # img_file = "/startdt_data/caltech_pedestrian_dataset/set00/frame/set00_V013_1511.jpg"
#     # visualize_bbox(xml_file, img_file)if __name__ == "__main__":main()

出现的一个错误：UnboundLocalError: local variable 'width' referenced before assignment

原因：.seq文件没有读到导致的，要检查一下输入目录https://github.com/CasiaFan/Dataset_to_VOC_converter/issues/1

有一个问题值得注意：caltech中与行人有关的类有四种：person_types = ["person", "person-fa", "person?", "people"]

如果不特别设置，直接用一些现成的代码去转换.seq文件为.jpg是把所有frame都截取出来，而一些代码中转换vbb文件为xml文件时却只把person类的标注出来，就会导致images和annotation最后的总数不一致，如：https://blog.csdn.net/qq_33297776/article/details/79869813

我参考的代码设置了.seq和.vbb均只截取person类，所以数量一致

这篇关于将caltech数据集转换成VOC格式的文章就介绍到这儿，希望我们推荐的文章对编程师们有所帮助！