This article walks through scraping the images from the first five pages of the Doutu (pkdoutu.com) site. It is meant as a practical reference for developers who need to solve a similar problem, so follow along below!
Single-threaded scraping
import requests
from lxml import etree
import os
import time
def tupi_url():
    for tape in range(1, 6):  # pages 1-5
        url = f'https://www.pkdoutu.com/article/list/?page={tape}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'}
        response = requests.get(url, headers=headers)
        tree = etree.HTML(response.text)
        # Collect the real image URLs from the lazy-load attribute
        img_urls = tree.xpath('//a[@class="list-group-item random_list"]//img/@data-original')
        for img_url in img_urls:
            xiazai(img_url)


def xiazai(url):
    folder_path = '斗图'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    content = resp.content
    # Extract the file name from the URL
    file_name = url.split('/')[-1]
    # Build the file path
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'wb') as f:
        f.write(content)
    print(f"Downloaded {file_name} successfully.")


if __name__ == '__main__':
    start_time = time.time()
    tupi_url()
    stop_time = time.time()
    print(f"Total time taken: {stop_time - start_time} seconds")
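A note on robustness: in this single-threaded version, requests.get() is called with no timeout and no error handling, so a single slow or failing image URL can stall or crash the entire run. Below is a minimal sketch of how the download step could be hardened; xiazai_safe is a hypothetical name (not part of the code above) and the shortened User-Agent is just a placeholder.

import os
import requests


def xiazai_safe(url, folder_path='斗图', timeout=10):
    # Hypothetical variant of xiazai(): same download logic, but with a
    # request timeout and basic error handling so one bad URL does not
    # abort the whole crawl.
    os.makedirs(folder_path, exist_ok=True)
    headers = {"User-Agent": "Mozilla/5.0"}  # placeholder UA
    try:
        resp = requests.get(url, headers=headers, timeout=timeout)
        resp.raise_for_status()  # treat HTTP 4xx/5xx responses as failures
    except requests.RequestException as e:
        print(f"Skipped {url}: {e}")
        return
    file_path = os.path.join(folder_path, url.split('/')[-1])
    with open(file_path, 'wb') as f:
        f.write(resp.content)
    print(f"Downloaded {file_path} successfully.")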
Multi-threaded scraping
import requests
from lxml import etree
import os
import time
from concurrent.futures import ThreadPoolExecutor


def tupi_url():
    img_urls = []
    for tape in range(1, 6):  # pages 1-5
        url = f'https://www.pkdoutu.com/article/list/?page={tape}'
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/122.0.0.0 Safari/537.36 Edg/122.0.0.0'}
        response = requests.get(url, headers=headers)
        tree = etree.HTML(response.text)
        # Collect the real image URLs from the lazy-load attribute
        img_urls.extend(tree.xpath('//a[@class="list-group-item random_list"]//img/@data-original'))
    return img_urls


def xiazai(url):
    folder_path = '斗图'
    if not os.path.exists(folder_path):
        os.makedirs(folder_path)
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/100.0.4896.75 Safari/537.36"}
    resp = requests.get(url, headers=headers)
    content = resp.content
    # Extract the file name from the URL
    file_name = url.split('/')[-1]
    # Build the file path
    file_path = os.path.join(folder_path, file_name)
    with open(file_path, 'wb') as f:
        f.write(content)
    print(f"Downloaded {file_name} successfully.")


if __name__ == '__main__':
    start_time = time.time()
    img_urls = tupi_url()
    with ThreadPoolExecutor(max_workers=32) as executor:
        executor.map(xiazai, img_urls)
    stop_time = time.time()
    print(f"Total time taken: {stop_time - start_time} seconds")
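One detail worth knowing about the threaded version: executor.map() only re-raises a worker's exception when its results are iterated, and the code above never consumes them, so a failed download passes silently. The sketch below, which assumes the same tupi_url() and xiazai() functions defined above, uses submit() together with as_completed() so each failure is reported with the URL that caused it.

from concurrent.futures import ThreadPoolExecutor, as_completed

if __name__ == '__main__':
    img_urls = tupi_url()
    with ThreadPoolExecutor(max_workers=32) as executor:
        # Map each future back to its URL so failures can be reported clearly.
        futures = {executor.submit(xiazai, u): u for u in img_urls}
        for future in as_completed(futures):
            try:
                future.result()  # re-raises any exception from the worker thread
            except Exception as e:
                print(f"Failed to download {futures[future]}: {e}")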
That wraps up this article on scraping the images from the first five pages of the Doutu site. We hope it proves helpful to fellow programmers!