法国亚马逊商品采集Python爬虫

本文主要是介绍法国亚马逊商品采集Python爬虫，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

看着身边做亚马逊铺货的朋友花大量时间收集商品信息，于是学着写个脚本帮忙解决这个问题。他们日常主要是抓取商品价格、商品图片、商品介绍等。

商品图片应该是最难获取到的，不过在页面的js里可以获取到完整的商品大图。

这个文章主要参考二爷记博客的文章：https://blog.csdn.net/minge89/article/details/106417047/

1、商品标题的获取

其实直接取title应该更简单，我这里是取得页面内容的标题。

亚马逊商品页面html标题代码：<title>Echo Dot (3ème génération), Enceinte connectée avec Alexa, Tissu anthracite: Amazon.fr</title>

商品标题的获取：req.xpath('//h1[@id="title"]/span[@id="productTitle"]/text()')

2、商品属性的获取

先把所有轮播图的列表属性给提取出来，class样式内容会根据商品品类不同而有所变化：

req.xpath('//ul[@class="a-unordered-list a-nostyle a-button-list a-vertical a-spacing-top-micro"]/li')

商品颜色属性的获取

</div>

</button></span></span>
</span>
</div>

</span></li>

</ul>

进行了简单的格式化处理

# Collect every text node under the colour item, then merge them into one string.
productColors=req.xpath('//li[@id="color_name_"]//text()')
productColor=''.join(productColors)

商品图片的获取

主要是找到图片链接费了不少力气，写入到js中了，没办法，只能用正则获取到图片链接。

imgs_text=re.findall(r'ImageBlockATF(.+?)return data;',html,re.S)[0]
imgs=re.findall(r'"large":"(.+?)","main":',imgs_text,re.S)

图片有轮播图图片和鼠标划过的大图片

产品详情页面的图片

一个页面大概有3万多行代码，要挖掘出自己需要的数据，需要慢慢分析，最麻烦的应该是图片数据了。

附源码，仅供参考，学习，交流：

#法国亚马逊商品采集
#20200524 by 微信：huguo00289
#https://www.amazon.fr/dp/B07CNJTCBB/ref=twister_B07RVPW2GT?_encoding=UTF8&th=1

# -*- coding=utf-8 -*-
import requests
from fake_useragent import UserAgent
import re,os,time,random
from lxml import etree
def ua():
    """Return request headers carrying a randomly chosen User-Agent string."""
    # Use a distinct local name so the function name ``ua`` is not shadowed.
    agent = UserAgent()
    return {"User-Agent": agent.random}

def get_data(url):
    """Fetch one Amazon product page and save its title, description and images.

    The raw HTML is written to ``<product id>.html`` and a text summary to
    ``<product id>.txt`` in the current working directory. If the page also
    lists variant swatches, their labels are printed.

    Args:
        url: Product URL containing a ``dp/<product id>`` path segment.

    Raises:
        ValueError: If no ``dp/<product id>`` segment is found in ``url``.
    """
    # Stop at '/', '?' or '#' so URLs without a trailing slash also work.
    id_match = re.search(r'dp/([^/?#]+)', url)
    if id_match is None:
        raise ValueError(f'no "dp/<id>" segment found in url: {url!r}')
    product_id = id_match.group(1)
    print(f'>>>您输入的商品链接id为：{product_id},正在采集，请稍后..')

    response = requests.get(url, headers=ua(), timeout=8)
    time.sleep(2)
    if response.status_code != 200:
        # Report the failure instead of returning silently.
        print(f'>>>获取网页数据失败，状态码：{response.status_code}')
        return

    print(">>>恭喜，获取网页数据成功！")
    html = response.content.decode('utf-8')
    with open(f'{product_id}.html', 'w', encoding='utf-8') as f:
        f.write(html)

    req = etree.HTML(html)

    title_nodes = req.xpath('//h1[@id="title"]/span[@id="productTitle"]/text()')
    print(title_nodes)
    # Fall back to an empty title rather than crashing when the node is absent.
    h1 = title_nodes[0].strip() if title_nodes else ''
    print(f'商品标题：{h1}')

    productDescriptions = req.xpath('//div[@id="productDescription"]//text()')
    productDescription = ''.join(productDescriptions)
    print(f'商品描述：{productDescription}')

    # Image URLs live inside an inline script, so they are pulled out with
    # regular expressions rather than XPath.
    image_block = re.search(r'ImageBlockATF(.+?)return data;', html, re.S)
    if image_block is not None:
        imgs = re.findall(r'"large":"(.+?)","main":', image_block.group(1), re.S)
    else:
        imgs = []
    print(imgs)

    text = f'商品标题：{h1}\n商品描述：{productDescription}\n商品图片{imgs}'
    with open(f'{product_id}.txt', 'w', encoding='utf-8') as f:
        f.write(text)
    print(f">>>恭喜，保存商品数据成功，已保存为{product_id}.txt")

    lis = req.xpath('//ul[@class="a-unordered-list a-nostyle a-button-list a-declarative a-button-toggle-group a-horizontal a-spacing-top-micro swatches swatchesSquare"]/li')
    if len(lis) > 1:
        print(f">>>商品存在分类属性，共有{len(lis)}分类！")
        spans = req.xpath('//div[@class="twisterTextDiv text"]/span[@class="a-size-base"]/text()')
        print(spans)

if __name__ == '__main__':
    print("亚马逊采集工具-by 微信公众号：二爷记")
    print("BUG反馈微信：huguo00289")
    print("请输入要采集的网址，按回车运行")
    # Read the product URL announced by the prompt above.
    url = input().strip()

    try:
        get_data(url)
    except Exception as e:
        # An exception object is not a container; test its message text.
        if "port=443" in str(e):
            print("获取网页链接超时，正在重试..")
            get_data(url)
        else:
            # Surface any other failure instead of swallowing it.
            print(f">>>采集失败：{e}")
    print("采集完毕！")
    print("8s后，程序自动关闭，BUG反馈微信：huguo00289")
    time.sleep(8)

下面是美国亚马逊爬虫的参考代码

# -*- coding: utf-8 -*-
"""
File Name： amzone
Description :
Author : meng_zhihao
mail : 312141830@qq.com
date： 2019/5/8
"""
# 美国amazon
import requests,urllib
import datetime
from urllib.parse import quote, unquote
from selenium_operate import ChromeOperate
import re
import time
from crawl_tool_for_py3 import crawlerTool as ct
import os,base64
import xlsxwriter
from PIL import Image
# Base URL of the storefront to crawl; search and product URLs are built from it.
# NOTE(review): the header comment says US Amazon, but this points at the
# German storefront (.de) - confirm which one is intended.
DOMAIN = 'https://www.amazon.de'

# Mobile User-Agent header set. Not referenced by the code visible in this
# section (only by a commented-out line in the main block).
HEADERS = { 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_1_1 like Mac OS X) AppleWebKit/602.2.14 (KHTML, like Gecko) Mobile/14B100 MicroMessenger/6.3.22 NetType/WIFI Language/zh_CN'
}
# Shared requests session; not used by the code visible in this section.
se = requests.session()

def img_resize(infile,outfile):
    """Resize the image at ``infile`` to a fixed 120x160 thumbnail saved at ``outfile``.

    The aspect ratio is not preserved: width and height are both fixed.
    """
    x_s = 120  # fixed thumbnail width in pixels
    y_s = 160  # fixed thumbnail height in pixels
    # Close the source file handle deterministically once the resize is done.
    with Image.open(infile) as im:
        # Image.ANTIALIAS was removed in Pillow 10; LANCZOS is the same filter.
        out = im.resize((x_s, y_s), Image.LANCZOS)
    out.save(outfile)

def gen_xls(item_infos):
    """Write the scraped items to a timestamped ``amazon<timestamp>.xlsx`` workbook.

    Each item becomes one row under a header row. If the item carries an
    ``item_pic_base64`` value (an https URL or base64 data), the picture is
    fetched or decoded, resized with ``img_resize`` and embedded in column C.

    Args:
        item_infos: Sequence of dicts with the keys ``keyword``, ``rank``,
            ``price``, ``cat``, ``descriptions``, ``item_url`` and optionally
            ``item_pic_base64``.
    """
    timestamp = datetime.datetime.now().strftime("%Y%m%d%H%M%S")
    book = xlsxwriter.Workbook('amazon%s.xlsx'%timestamp)
    worksheet = book.add_worksheet('demo')
    worksheet.write_row(0,0, ['关键词','排名','宝贝图片','价格','宝贝类目','宝贝描述','宝贝链接'])
    # Original author's note: one column-width unit is about 8 px and one
    # row-height unit about 1.37 px.
    worksheet.set_column('A:D', 15)
    worksheet.set_column('C:C', 20)
    worksheet.set_column('B:B', 10)
    worksheet.set_column('F:F', 50)

    # Thumbnails are written under img/, so make sure the directory exists.
    os.makedirs('img', exist_ok=True)

    # Index of the last sheet row handled; stays 0 when there are no items so
    # the summary print below never hits an unbound name.
    row_idx = 0
    for i, item_info in enumerate(item_infos):
        row_idx = i + 1  # row 0 holds the header
        try:
            row = [item_info['keyword'],item_info['rank'],'',item_info['price'],item_info['cat'],item_info['descriptions'],item_info['item_url']]
            worksheet.write_row(row_idx,0, row)
            worksheet.set_row(row_idx, 120)
            if 'item_pic_base64' in item_info:
                item_pic_base64 = item_info["item_pic_base64"]
                try:
                    if 'https:' in item_pic_base64:
                        # presumably ct.get returns the raw image bytes - verify
                        data = ct.get(item_pic_base64)
                    else:
                        data = base64.b64decode(item_pic_base64)
                    with open('test.png', 'wb') as f:
                        f.write(data)
                    img_resize('test.png', 'img/tmp%s.png'%i)
                    # Each inserted image needs its own file name.
                    worksheet.insert_image(row_idx,2, 'img/tmp%s.png'%i)
                except Exception as e:
                    # A broken picture should not cost the whole row.
                    print(str(e))
        except Exception as e:
            # Best effort: report the bad item and carry on with the next one.
            print(str(e))
    print('完成结果数,%s'%row_idx)
    book.close()

def extractor_page(page): # parse a product page
    """Pull description, main picture, price and category out of a product page.

    Args:
        page: HTML source of the product page.

    Returns:
        A dict with the keys ``descriptions``, ``item_pic_base64``, ``price``
        and ``cat``.
    """
    item_info = {"descriptions":""}

    # Prefer the plain description block; fall back to the "aplus" block.
    desc_parts = ct.getXpath('//div[@id="productDescription"]/p/text()',page)
    if not desc_parts:
        desc_parts = ct.getXpath( '//div[@id="aplus"]/div//p//text()', page)
    item_info["descriptions"] = ''.join(part.strip() for part in desc_parts)

    # Keep only what follows "base64," in a data URI; other values pass through whole.
    img_src = ct.getXpath1( '//div[@id="imgTagWrapperId"]/img/@src', page)
    item_info["item_pic_base64"] = img_src.split('base64,')[-1]

    item_info["price"] = ct.getXpath1( '//span[@id="priceblock_ourprice"]/text()', page)

    breadcrumb = ct.getXpath( '//div[@id="wayfinding-breadcrumbs_container"]//a/text()', page)
    item_info["cat"] = '/'.join(crumb.strip() for crumb in breadcrumb)

    # Print the collected field names.
    for key in item_info:
        print(key)
    return item_info

if __name__ == '__main__':
    # Script entry point: for every keyword in keywords.txt, open the search
    # results page, visit each result, collect its details and finally export
    # everything to a spreadsheet via gen_xls.
    item_infos = []
    cop = ChromeOperate(executable_path=r'chromedriver.exe')
    try:
        cop.open(DOMAIN)
        with open('keywords.txt','r') as keyword_file:
            for line in keyword_file:
                line = line.strip()
                if not line:
                    continue
                urls = [DOMAIN+'/s?k=%s&ref=nb_sb_noss_2'%quote(line),
                        # 'https://www.amazon.com/s?k=%s&ref=nb_sb_noss_2&page=2 ' % quote(line)
                        ]
                rank = 0
                for url in urls:
                    cop.open(url)
                    # NOTE(review): the product page below is read through
                    # cop.driver.page_source - confirm open_source() is the
                    # intended accessor here.
                    page = cop.open_source()
                    item_urls = ct.getXpath('//div[@class="sg-row"]//div[@class="sg-col-inner"]//h2/a/@href',page)
                    if not item_urls:
                        # Dump the page so an empty result can be diagnosed.
                        print(page)
                    for item_url in item_urls:
                        # rank counts every result link, including skipped ones.
                        rank += 1
                        try:
                            if 'qid' not in item_url:
                                continue
                            item_url = DOMAIN+item_url
                            cop.open(item_url)
                            item_page = cop.driver.page_source
                            # Check for the CAPTCHA page before parsing, so a
                            # parse failure on it cannot bypass this check.
                            if 'Type the characters you see' in item_page:
                                print('IP被封了',url)
                                time.sleep(10)
                                break
                            if 'Kindle Edition' in item_page:
                                continue
                            item_info = extractor_page(item_page)
                            item_info['keyword'] = line
                            item_info['rank'] = rank
                            item_info['item_url'] = item_url.split('?')[0]
                            item_infos.append(item_info)
                        except Exception as e:
                            # Best effort: report the failure and try the next result.
                            print(str(e))
        gen_xls(item_infos)
    finally:
        # Always close the browser, even if the crawl aborted with an error.
        cop.quit()