本文主要是介绍Python爬虫案例四:爬取某个博主的所有文章保存成PDF格式,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
引入(将图片保存成PDF格式):
# Warm-up example: save a list of images as a single PDF.
# Test link: https://zq.bookan.com.cn/?t=detail&id=21088&ct=1&is=31042341&rid=4658
# (page images of a library book saved as a PDF).
# Prerequisite: pip install img2pdf
# Steps:
import img2pdf
import requests

url_list = [
    'http://img1-qn.bookan.com.cn/page8/3234/3234-310411164/4214b5ac_big.mg',
    'http://img1-qn.bookan.com.cn/page8/8314/8314-310441286/8522073f_big.mg',
]

# Download the raw bytes of every image. A timeout keeps a stalled server
# from hanging the script, and raise_for_status() stops an HTTP error page
# from being handed to img2pdf as if it were image data.
data_list = []
for url in url_list:
    response = requests.get(url, timeout=10)
    response.raise_for_status()
    data_list.append(response.content)

# 1. Page width and height (300 mm each, converted to PDF points).
width = img2pdf.mm_to_pt(300)
height = img2pdf.mm_to_pt(300)

# 2. Build the layout function that describes the page size; no PDF data
#    exists yet at this point.
pdf_size = img2pdf.get_layout_fun((width, height))

# 3. Convert the image bytes into PDF bytes using that layout.
pdf_data = img2pdf.convert(data_list, layout_fun=pdf_size)

# 4. Save the result.
with open('测试.pdf', 'wb') as f:
    f.write(pdf_data)
print('ok')
# Test result:
案例实战: 抓取CSDN某位博主的文章并将其保存成PDF格式(先抓取一篇,再批量抓取)
源码:
# Crawler section ====> the code is reusable
import os
import re

import parsel
import pdfkit
import requests
from lxml import etree

# Blog home page of the author whose articles are downloaded.
INFO_URL = 'https://blog.csdn.net/2301_80014606'

# Browser-like request headers shared by the list page and the article pages.
HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Sec-Fetch-Dest': 'document',
    'Sec-Fetch-Mode': 'navigate',
    'Sec-Fetch-Site': 'none',
    'Sec-Fetch-User': '?1',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36',
    'sec-ch-ua': '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
    'sec-ch-ua-mobile': '?0',
    'sec-ch-ua-platform': '"Windows"',
}

# Seconds to wait for the server before giving up on a request.
REQUEST_TIMEOUT = 15

# Intermediate HTML file that get_one() writes and To_pdf() converts.
HTML_FILE = 'csdn正文.html'

# Location of the wkhtmltopdf executable; override with the
# WKHTMLTOPDF_PATH environment variable on machines where it lives elsewhere.
WKHTMLTOPDF_PATH = os.environ.get(
    'WKHTMLTOPDF_PATH', r'E:\wkhtmltopdf\bin\wkhtmltopdf.exe'
)

# Minimal HTML page that wraps the extracted article body.
HTML_TEMPLATE = '''
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <title>Title</title>
</head>
<body>
    {}
</body>
</html>
'''

# Characters that cannot appear in a file name on Windows, plus line breaks.
_INVALID_FILENAME_CHARS = re.compile(r'[\\/:*?"<>|\r\n\t]+')


def _build_headers():
    """Return the request headers, adding a Cookie header when configured.

    Session cookies are not stored in the source. If the CSDN_COOKIE
    environment variable is set (the raw ``Cookie`` header value copied from
    the browser), it is sent with every request.
    """
    headers = dict(HEADERS)
    cookie = os.environ.get('CSDN_COOKIE')
    if cookie:
        headers['Cookie'] = cookie
    return headers


def _fetch_html(url, params=None):
    """Download *url* and return the response body as text.

    Raises:
        requests.RequestException: on connection errors, timeouts, or an
            HTTP error status.
    """
    response = requests.get(
        url, params=params, headers=_build_headers(), timeout=REQUEST_TIMEOUT
    )
    response.raise_for_status()
    return response.text


def _safe_filename(title):
    """Turn an article title into a string usable as a file name."""
    cleaned = _INVALID_FILENAME_CHARS.sub('_', title).strip(' .')
    return cleaned or 'untitled'


def get_all():
    """Download every article linked from the blogger's list page.

    The article URLs are read from the ``blog-list-box`` entries of the list
    page and each one is passed to get_one(). An article that fails to
    download or convert is reported and skipped so the rest of the batch
    still runs.
    """
    page = _fetch_html(INFO_URL, params={'type': 'blog'})
    tree = etree.HTML(page)
    if tree is None:
        url_list = []
    else:
        url_list = tree.xpath('//article[@class="blog-list-box"]/a/@href')
    if not url_list:
        print('No article links found on {}'.format(INFO_URL))
        return
    for url in url_list:
        try:
            get_one(url)
        except (requests.RequestException, OSError) as exc:
            # pdfkit reports wkhtmltopdf failures as IOError/OSError.
            print('Skipped {}: {}'.format(url, exc))


def get_one(url):
    """Download a single article and convert it to a PDF.

    The title is taken from the first ``<h1>`` text node and the body from
    the ``#content_views`` element. The body is written to HTML_FILE and then
    converted by To_pdf(). Pages lacking either part are skipped.
    """
    page = _fetch_html(url)

    # ------ title via XPath ------
    tree = etree.HTML(page)
    titles = tree.xpath('//h1/text()') if tree is not None else []

    # ------ article body via CSS selector ------
    content = parsel.Selector(page).css('#content_views').get()

    if not titles or content is None:
        print('Skipped {}: title or article body not found'.format(url))
        return

    with open(HTML_FILE, 'w', encoding='utf-8') as f:
        f.write(HTML_TEMPLATE.format(content))
    To_pdf(titles[0])


def To_pdf(title):
    """Convert HTML_FILE to ``<title>.pdf`` using wkhtmltopdf.

    The title is sanitised before it is used as a file name; the original
    title is what gets printed.
    """
    kit = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
    pdfkit.from_file(HTML_FILE, f'{_safe_filename(title)}.pdf', configuration=kit)
    print('保存OK--{}'.format(title))


def main():
    """Script entry point: download all articles of the configured blogger."""
    get_all()


if __name__ == '__main__':
    main()
运行效果(只列举其中一篇文章):
这篇关于Python爬虫案例四:爬取某个博主的所有文章保存成PDF格式的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!