本文主要介绍爬虫——美食天下各类菜谱的菜单信息,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们请跟随小编一起学习吧!
文章仅用于学习,请勿肆意爬取网站信息,或将其用于非法用途
文章仅用于学习,请勿肆意爬取网站信息,或将其用于非法用途
文章仅用于学习,请勿肆意爬取网站信息,或将其用于非法用途
效果图
源码
# Scraper for meishichina.com: collects the recipe-category index and, for
# each category, writes a small CSV of recipe name / detail URL / ingredients.
# For learning purposes only — do not crawl aggressively or use unlawfully.
import csv
import os
import random

import requests
from lxml import etree

# Request headers: one entry is picked at random per run.
# NOTE(review): each entry is only a *fragment* of a real User-Agent string —
# presumably intended as one complete UA; confirm against the target site.
head = ['Mozilla/5.0', 'Chrome/78.0.3904.97', 'Safari/537.36']
headers = {'user-agent': head[random.randint(0, 2)]}


def makedir(path):
    """Create directory *path* (including parents) if it is missing.

    Returns True when the directory was created, False when it already existed.
    """
    path = path.strip().rstrip("\\")  # drop stray whitespace / trailing backslash
    if not os.path.exists(path):
        # makedirs, not mkdir: the parent "source" directory may not exist yet,
        # and os.mkdir would raise FileNotFoundError in that case.
        os.makedirs(path)
        return True
    return False


def getHtml(url, timeout=10):
    """GET *url* and return the decoded HTML text.

    On a non-200 response the status code is returned instead; on a network
    error the exception object is returned — callers must check the type.
    *timeout* (seconds) is a new, defaulted parameter so existing callers are
    unaffected; without it a stalled connection would hang forever.
    """
    try:
        response = requests.request("GET", url=url, headers=headers, timeout=timeout)
        if response.status_code == 200:
            response.encoding = 'utf-8'
            return response.text
        return response.status_code
    except Exception as e:
        return e


def htmlToTree(html):
    """Parse an HTML string into an lxml element tree."""
    return etree.HTML(html)


def elementToString(element):
    """Serialize an lxml element back to a unicode HTML string."""
    return etree.tostring(element, pretty_print=True, encoding='utf-8').decode('utf-8')


def parseHtml(html):
    """Extract the category index page.

    Returns a list of [category_title, {sub_category_name: url, ...}] pairs.
    """
    allcategory = []
    tree = htmlToTree(html)
    path = '//div[@class="wrap"]//div[@class="category_box mt20"]/div[@class="category_sub clear"]'
    for item in tree.xpath(path):
        # Re-parse the single <div> so the absolute XPaths below are scoped
        # to this category block only.
        div_tree = htmlToTree(elementToString(item))
        category = div_tree.xpath('//h3/text()')[0]
        category_name = div_tree.xpath('//ul/li/a/text()')
        category_url = div_tree.xpath('//ul/li/a/@href')
        allcategory.append([category, dict(zip(category_name, category_url))])
    return allcategory


def writerCsv(food_list, category_name, name):
    """Append *food_list* (list of dicts) to source/<category>/<name>.csv.

    Returns True on success, False on any failure (including empty input,
    which raises IndexError on the header lookup and is caught below).
    """
    try:
        title = list(food_list[0].keys())  # CSV header from the first row's keys
        # '/' is a path separator and must not appear in directory/file names.
        # (Original had `'/' in name or category_name`, which tested the
        # truthiness of category_name instead of membership — precedence bug.)
        if '/' in name or '/' in category_name:
            name = name.replace('/', '')
            category_name = category_name.replace('/', '')
        # Build the path from components so it works on every OS (the
        # original hard-coded Windows backslashes but opened with '/').
        folder = os.path.join(os.getcwd(), 'source', category_name)
        makedir(folder)
        csv_path = os.path.join(folder, '{0}.csv'.format(name))
        # Write the header only once: in append mode a rerun would otherwise
        # insert a duplicate header row. newline='' per the csv module docs.
        need_header = not os.path.exists(csv_path) or os.path.getsize(csv_path) == 0
        with open(csv_path, 'a', encoding='utf-8', newline='') as csvdata:
            dictwriter = csv.DictWriter(csvdata, fieldnames=title)
            if need_header:
                dictwriter.writeheader()
            dictwriter.writerows(food_list)
        return True
    except Exception:
        return False


def parseRecipe(html, category_name, name):
    """Parse a recipe listing page and persist its first entry via writerCsv.

    Returns the writerCsv() result (True/False).
    """
    # getHtml() may hand back a status code (int) or an exception object
    # instead of HTML; etree.HTML would crash on those, so bail out early.
    if not isinstance(html, str):
        return False
    tree = htmlToTree(html)
    data = tree.xpath('//div[@class="wrap"]//div[@id="J_list"]/ul/li')
    food_list = []
    if data:
        for item in data[0:1]:  # original behavior: keep only the first recipe per page
            li_tree = htmlToTree(elementToString(item))
            food_name = li_tree.xpath('//div[@class="pic"]/a/@title')[0]
            food_detailUrl = li_tree.xpath('//div[@class="pic"]/a/@href')[0]
            # [3:-1] strips the leading "原料:"-style prefix and the trailing char.
            food_content = str(li_tree.xpath('//div[@class="detail"]/p[@class="subcontent"]/text()')[0])[3:-1]
            food_list.append({
                "菜名": food_name,
                "详情链接": food_detailUrl,
                "原料": food_content,
            })
    return writerCsv(food_list, category_name, name)


def getRecipePerCate(data):
    """Fetch and store recipes for the first four categories.

    Returns "写入成功" when the final write succeeded, "写入失败" otherwise
    (the flag reflects only the last parseRecipe call, as in the original).
    """
    res = bool()
    for items in data[:4]:
        category_name = items[0]
        for name, url in items[1].items():
            html = getHtml(url)
            res = parseRecipe(html, category_name, name)
    return "写入成功" if res else "写入失败"


if __name__ == '__main__':
    url = 'https://home.meishichina.com/recipe-type.html'
    data = parseHtml(getHtml(url))
    res = getRecipePerCate(data)
    print(res)
这篇关于爬虫——美食天下各类菜谱的菜单信息的文章就介绍到这儿,希望我们推荐的内容对程序员们有所帮助!