本文主要介绍【Python爬虫实战】爬取京东商城的商品信息,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
爬取京东商城的商品信息,并将数据以 DataFrame 形式展示
from selenium.webdriver import Chrome, ChromeOptions
import re
import pymysql
from selenium.webdriver.common.by import By
import collections
import pandas as pd

# Separator between a parameter name and its value. Both the ASCII colon and
# the full-width colon are accepted so parsing does not depend on which form
# the page text happens to use.
_PAIR_SEPARATOR = re.compile(r"[::]\s*")
# One "name: value" token: a whitespace-free name, a colon, optional
# whitespace, and a whitespace-free value.
_PAIR_TOKEN = re.compile(r"\S+[::]\s*\S+")


class Spider():
    """Scrape product parameter tables from JD search results with Selenium.

    Every visited product page contributes one ``dict`` (parameter name ->
    value) to ``self.info_diclist``; ``show_as_dataframe`` prints the
    collected rows as a pandas ``DataFrame``.
    """

    def __init__(self):
        # Not assigned anywhere else in this class; kept so existing code
        # reading the attribute keeps working.
        self.browser = None
        # One dict of parsed parameters per scraped product page.
        self.info_diclist = list()

    @staticmethod
    def _split_pair(text):
        """Split ``"name: value"`` into ``(name, value)``.

        Only the first colon separates the two parts, so a value that itself
        contains a colon (for example ``16:9``) is kept whole.

        Returns:
            A ``(name, value)`` tuple, or ``None`` when *text* has no colon
            or the name part is empty.
        """
        parts = _PAIR_SEPARATOR.split(text.strip(), maxsplit=1)
        if len(parts) != 2 or not parts[0]:
            return None
        return parts[0], parts[1]

    @staticmethod
    def _parse_parameters(text):
        """Collect every ``name: value`` token found in *text* into a dict.

        Tokens are whitespace-delimited; text without any such token yields
        an empty dict instead of raising.
        """
        parsed = {}
        for token in _PAIR_TOKEN.findall(text):
            pair = Spider._split_pair(token)
            if pair is not None:
                name, value = pair
                parsed[name] = value
        return parsed

    def onepage_info_by_selenium(self, browser, url):
        """Open a product detail page and record its parameters.

        Args:
            browser: Selenium WebDriver used to load the page.
            url: Address of the product detail page.

        Side effects:
            Appends one dict of parsed parameters to ``self.info_diclist``.
        """
        browser.get(url)
        brand_list = browser.find_element(
            by=By.CSS_SELECTOR,
            value="ul[id='parameter-brand'][class='p-parameter-list']",
        )
        details = dict()
        # The brand lives in its own list, so it is read separately. Its
        # value may contain spaces, hence the plain split instead of the
        # token-based parser.
        brand_pair = self._split_pair(brand_list.text)
        if brand_pair is not None:
            details[brand_pair[0]] = brand_pair[1]

        parameter_list = browser.find_element(
            by=By.CSS_SELECTOR,
            value="ul[class='parameter2 p-parameter-list']",
        )
        details.update(self._parse_parameters(parameter_list.text))
        # TODO: positive/negative review counts are not collected yet.
        self.info_diclist.append(details)

    def getmain_by_selenium(self):
        """Open the JD home page in Chrome and install the stored cookies.

        Returns:
            The Chrome WebDriver with the cookies added. The caller owns it
            and is responsible for calling ``quit()``.
        """
        browser = Chrome()
        try:
            # Cookies are added after the home page has been loaded.
            browser.get('https://www.jd.com')
            for cookie in self.get_cookie():
                browser.add_cookie(cookie)
        except Exception:
            # Do not leave an orphaned Chrome process behind when setup fails.
            browser.quit()
            raise
        return browser

    def get_search_result(self, browser, key) -> list:
        """Search for *key* and return the product links on the result page.

        Args:
            browser: WebDriver currently showing a page with the search bar.
            key: Search keyword typed into the search bar.

        Returns:
            The ``href`` attribute of every matched product image link.
        """
        search_bar = browser.find_element(by=By.ID, value='key')
        search_bar.send_keys(key)
        browser.find_element(
            by=By.CSS_SELECTOR,
            value="[class='button'][aria-label='搜索']",
        ).click()
        # Let the lookup below wait up to 5 seconds for results to appear.
        browser.implicitly_wait(5)
        links = browser.find_elements(
            by=By.CSS_SELECTOR,
            value="div[class='p-img'] a[target='_blank']",
        )
        return [link.get_attribute(name='href') for link in links]

    def get_info(self, key="投影仪", limit=3):
        """Run the whole scrape: search, visit products, print the table.

        Args:
            key: Search keyword; defaults to the original hard-coded one.
            limit: Maximum number of result pages to visit; ``None`` visits
                every result.
        """
        browser = self.getmain_by_selenium()
        try:
            results = self.get_search_result(browser, key)
            for url in results[:limit]:
                self.onepage_info_by_selenium(browser, url=url)
            self.show_as_dataframe()
            # Keep the browser window open until the user confirms.
            input("end: ")
        finally:
            # Always shut the driver down, even when scraping fails midway.
            browser.quit()

    def show_as_dataframe(self):
        """Print the collected parameter dicts as a pandas DataFrame."""
        data = pd.DataFrame(self.info_diclist)
        print(data)

    def get_cookie(self):
        """Load the stored JD cookies from the local MySQL database.

        Returns:
            The Python literal stored in the ``COOKIE`` column of the
            ``jingdong`` row (presumably a list of cookie dicts, since the
            caller passes each item to ``add_cookie`` — confirm against the
            code that writes the table).

        Raises:
            IndexError: No row exists for ``WEB_NAME='jingdong'``.
            ValueError, SyntaxError: The stored text is not a valid literal.
        """
        # Local import so this block does not depend on a file-level import.
        import ast

        # NOTE(review): credentials are hard-coded; consider moving them to
        # configuration.
        db = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                             charset='utf8', database='draft', port=3306)
        try:
            with db.cursor() as cursor:
                cursor.execute(
                    query="SELECT COOKIE FROM COOKIES WHERE WEB_NAME='jingdong'"
                )
                row = cursor.fetchone()
        finally:
            # Close the connection even when the query raises.
            db.close()

        if row is None:
            raise IndexError("no cookie stored for WEB_NAME='jingdong'")
        # SECURITY: this used to be eval(), which executes whatever text the
        # database holds. literal_eval only accepts plain Python literals.
        return ast.literal_eval(row[0])


def main():
    """Scrape with the default keyword and page limit."""
    Spider().get_info()


if __name__ == '__main__':
    main()
这篇关于【Python爬虫实战】爬取京东商城的商品信息的文章就介绍到这儿,希望我们推荐的文章对程序员们有所帮助!