本文主要是介绍【后续更新】python搜集上海二手房数据,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
源码如下:
import asyncio
import aiohttp
from lxml import etree
import logging
import datetime
import openpyxl

# Workbook that accumulates one row per listing; saved in the __main__ block.
wb = openpyxl.Workbook()
sheet = wb.active
# Header row (column titles: listing title, house info, district,
# unit price, follower count / listing date, tags).
sheet.append(['房源', '房子信息', '所在区域', '单价', '关注人数和发布时间', '标签'])

logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s: %(message)s')

# Wall-clock start time, used to report total runtime at the end of the script.
start = datetime.datetime.now()
class Spider(object):
    """Async crawler for Lianjia Shanghai second-hand housing index pages.

    Fetches listing index pages concurrently (bounded by a semaphore),
    parses each listing with XPath, and appends one row per listing to the
    module-level openpyxl ``sheet``.
    """

    def __init__(self):
        # Semaphore caps concurrent requests so we don't hammer the site
        # and trigger its anti-scraping defenses.
        self.semaphore = asyncio.Semaphore(6)
        self.header = {
            "Host": "sh.lianjia.com",
            "Referer": "https://sh.lianjia.com/ershoufang/",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.89 Safari/537.36"
        }

    async def scrape(self, url):
        """Fetch *url* and return the response body as text.

        The session and response are context-managed so they are always
        closed, even when the request raises (the original leaked the
        session on any exception before ``session.close()``).
        """
        async with self.semaphore:
            await asyncio.sleep(3)  # throttle between requests to look less bot-like
            async with aiohttp.ClientSession(headers=self.header) as session:
                async with session.get(url) as response:
                    return await response.text()

    async def scrape_index(self, page):
        """Download and parse one listing index page (1-based *page*)."""
        url = f'https://sh.lianjia.com/ershoufang/pg{page}/'
        text = await self.scrape(url)
        await self.parse(text)

    async def parse(self, text):
        """Extract listing fields from an index page and append them to ``sheet``."""
        html = etree.HTML(text)
        lis = html.xpath('//*[@id="content"]/div[1]/ul/li')
        for li in lis:
            def first(xp, default=''):
                # Guard single-value XPaths: a missing node (e.g. on a
                # captcha/ban page) must not raise IndexError.
                vals = li.xpath(xp)
                return vals[0] if vals else default

            house_data = first('.//div[@class="title"]/a/text()')              # listing title
            house_info = first('.//div[@class="houseInfo"]/text()')            # house details
            address = ' '.join(li.xpath('.//div[@class="positionInfo"]/a/text()'))  # location
            price = first('.//div[@class="priceInfo"]/div[2]/span/text()')     # unit price, yuan/m2
            attention_num = first('.//div[@class="followInfo"]/text()')        # followers & post date
            tag = ' '.join(li.xpath('.//div[@class="tag"]/span/text()'))       # tags
            row = [house_data, house_info, address, price, attention_num, tag]
            sheet.append(row)
            logging.info(row)

    def main(self):
        """Crawl the first 100 index pages concurrently and block until done."""
        scrape_index_tasks = [
            asyncio.ensure_future(self.scrape_index(page))
            for page in range(1, 101)
        ]
        loop = asyncio.get_event_loop()
        loop.run_until_complete(asyncio.gather(*scrape_index_tasks))


if __name__ == '__main__':
    spider = Spider()
    spider.main()
    wb.save('house2.xlsx')
    delta = (datetime.datetime.now() - start).total_seconds()
    print("用时:{:.3f}s".format(delta))
这个代码会触发链家反爬机制,触发后等 3 个小时即可继续使用。代码后续再修改。
这篇关于【后续更新】python搜集上海二手房数据的文章就介绍到这儿,希望我们推荐的文章对编程师们有所帮助!