本文主要介绍如何使用 Python(PyMuPDF)将 PDF 文件的每一页转换为图片,并提取 PDF 中内嵌的图片,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
# 安装 PyMuPDF(代码中的导入名为 fitz;注意不要安装 PyPI 上同名的 fitz 包)
pip install pymupdf
脚本如下所示:
import fitz
import re
import os
import time
import sys

arguments = sys.argv

# Echo the raw command-line arguments (debugging aid kept from the original script).
for arg in arguments:
    print(arg)


def _list_pdf_files(base_dir):
    """Return the sorted file names of the PDF files directly inside *base_dir*.

    Only the top level of *base_dir* is inspected: the driver builds each PDF
    path as ``base_dir/<file>``, and the output folders it creates under
    ``base_dir/pic`` must not be picked up on a later run. A missing or
    unreadable directory yields an empty list.
    """
    # os.walk yields the top directory first; it yields nothing at all when
    # base_dir cannot be listed, hence the default triple.
    _, _, files = next(os.walk(base_dir), (base_dir, [], []))
    return sorted(f for f in files if f.lower().endswith('.pdf'))


def file_name_list(base_dir):
    """Return the names (without extension) of the PDFs directly in *base_dir*.

    Args:
        base_dir: Directory to scan; sub-directories are not searched.

    Returns:
        A sorted list of file names with the trailing ``.pdf`` removed.
        Files with any other extension are ignored.
    """
    return [os.path.splitext(f)[0] for f in _list_pdf_files(base_dir)]


def pdfExtractPic(filePath, pic_path):
    """Extract every image embedded in a PDF and save each one as a PNG.

    Images are numbered consecutively across the whole document and written
    to ``<pic_path>/<number>.png``. Errors are printed rather than raised; a
    failure on one image does not stop the extraction of the others.

    Args:
        filePath: Path of the PDF file to read.
        pic_path: Existing directory that receives the PNG files.

    Returns:
        An empty dict (nothing is ever stored in it; kept so the return type
        stays what existing callers receive).
    """
    ret = {}
    try:
        with fitz.open(filePath) as doc:
            image_count = 0
            for page_index in range(doc.page_count):
                for image in doc.get_page_images(page_index):
                    xref = image[0]
                    image_count += 1
                    file_name = os.path.join(pic_path, "%d.png" % image_count)
                    try:
                        pix = fitz.Pixmap(doc, xref)
                        # More than 3 colour components (alpha excluded) means
                        # CMYK, which PNG cannot store: convert to RGB first.
                        if pix.n - pix.alpha > 3:
                            pix = fitz.Pixmap(fitz.csRGB, pix)
                        pix.save(file_name)
                    except Exception as e:
                        # Best effort: report the image that failed, keep going.
                        print(e)
    except Exception as e:
        print(e)
    return ret


def pdf2image(path, pic_path, image_name):
    """Render every page of a PDF to ``<pic_path>/<image_name>_img_<page>.png``.

    Pages are rendered at 2x zoom; if that makes either side larger than
    2000 pixels the page is rendered again at its original size. Errors are
    printed rather than raised.

    Args:
        path: Path of the PDF file to read.
        pic_path: Existing directory that receives the PNG files.
        image_name: Prefix used for the generated file names.
    """
    try:
        with fitz.open(path) as pdf:
            for pg in range(pdf.page_count):
                page = pdf[pg]
                # Same zoom factor on both axes keeps the aspect ratio.
                pm = page.get_pixmap(matrix=fitz.Matrix(2, 2), alpha=False)
                # If width or height > 2000 pixels, don't enlarge the image.
                if pm.width > 2000 or pm.height > 2000:
                    pm = page.get_pixmap(matrix=fitz.Matrix(1, 1), alpha=False)
                page_num = pg + 1  # page numbers start at 1
                new_name = '%s_img_%d.png' % (image_name, page_num)
                pm.save(os.path.join(pic_path, new_name))
    except Exception as error:
        print(error)


def _ensure_dir(path):
    """Create *path* (and its parents) if needed; print the error on failure."""
    try:
        os.makedirs(path, exist_ok=True)
    except OSError as error:
        print(error)


def main():
    """Convert and extract images for every PDF in the directory given on the command line.

    For each ``<name>.pdf`` in the directory, page renderings are written to
    ``pic/<name>/`` and the embedded images to ``pic/<name>/sub/``.
    """
    if len(arguments) < 2:
        print("请输入pdf所在目录的路径.")
        sys.exit(1)
    base_dir = arguments[1]
    start = time.time()
    for pdf_file in _list_pdf_files(base_dir):
        image_name = os.path.splitext(pdf_file)[0]
        file_path = os.path.join(base_dir, pdf_file)  # path of the PDF file
        pic_path = os.path.join(base_dir, 'pic', image_name)  # folder for page images
        _ensure_dir(pic_path)
        pdf2image(file_path, pic_path, image_name)
        sub_path = os.path.join(pic_path, 'sub')  # folder for embedded images
        _ensure_dir(sub_path)
        pdfExtractPic(file_path, sub_path)
    end = time.time()
    print('task is over: %.2f' % (end - start))


if __name__ == '__main__':
    main()
如果执行的时候报错:
RuntimeError: Directory 'static/' does not exist
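说明安装错了包:装的是 PyPI 上同名的 fitz 包,而不是 PyMuPDF。先执行 pip uninstall fitz 卸载,再执行 pip install --force-reinstall pymupdf 重新安装即可。参考 CSDN 博客文章:《PyMuPDF: AttributeError: 模块 'fitz' 没有属性 'open'(attributeerror: module 'fitz' has no attribute 'open')》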
这篇关于python转换并提取pdf文件中的图片的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!