Python 3.6 + BeautifulSoup4：爬取 360 手机助手 app 应用的信息并存储到数据库、批量下载 apk

本文主要介绍如何使用 Python 3.6 和 BeautifulSoup4 爬取 360 手机助手中 app 应用的信息，将其存储到数据库，并批量下载 apk，希望能为大家解决编程问题提供一定的参考，需要的开发者们跟着小编一起来学习吧！

源码：

#!/usr/bin/python
#encoding:utf-8
'''
Created on 2018年01月12日
@author: xianqingchen
'''
import requests
from bs4 import BeautifulSoup
import os
from urllib.request import urlopen
import pymysql
_ZHUSHOU_BASE_URL = 'http://zhushou.360.cn'
# Seconds to wait on the site before a request is abandoned; without a
# timeout a stalled connection would hang the whole crawl.
_REQUEST_TIMEOUT = 30
# Placeholders are left unquoted so the driver escapes every scraped value
# itself instead of the values being spliced into the statement text.
_INSERT_APP_SQL = (
    "INSERT INTO AppInfo(subclass, appname, score,counts,size,author,update1,"
    "version,supsystem,language1,appurl,loadurl) "
    "VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)


def _fetch_soup(url, head):
    """Download ``url`` with the given request headers and parse it with lxml."""
    response = requests.get(url=url, headers=head, timeout=_REQUEST_TIMEOUT)
    return BeautifulSoup(response.text, 'lxml')


def _after_colon(text):
    """Return the field that follows the first full-width colon in ``text``."""
    return text.split('：')[1]


def _download_url(app_li):
    """Return the APK URL carried by a list entry's download link.

    The download link is an ``<a>`` with exactly three CSS classes whose
    third class contains ``normal``; the APK address is the text following
    ``url=`` in its ``href``. When several links qualify the last one wins.

    Raises:
        LookupError: if no link in the entry yields a download URL.
    """
    found = None
    for link in app_li.find_all('a'):
        classes = link.get('class')
        if classes is None or len(classes) != 3 or 'normal' not in classes[2]:
            continue
        parts = (link.get('href') or '').split('url=')
        if len(parts) > 1:
            found = parts[1]
    if found is None:
        raise LookupError('no download link in list entry')
    return found


def _scrape_app(app_li, subclass_name, head):
    """Build one ``AppInfo`` row from a listing ``<li>``.

    Returns the 12-tuple matching the column order of the insert statement,
    or ``None`` (after printing a message) when the entry or its detail page
    cannot be fetched or does not have the expected layout.
    """
    app_url = None
    try:
        title = app_li.find_all('h3')[0]
        app_name = title.get_text()
        # URL of the app's detail page.
        app_url = _ZHUSHOU_BASE_URL + title.find_all('a')[0].get('href')
        load_url = _download_url(app_li)

        detail = _fetch_soup(app_url, head)
        spans = detail.find_all('div', {'class': 'pf'})[0].find_all('span')
        cells = detail.find_all('div', {'class': 'breif'})[0].find_all('td')

        score = spans[0].get_text()
        counts = _after_colon(spans[3].get_text())
        size = spans[4].get_text()
        author, updated, version, system, language = (
            _after_colon(cells[i].get_text()) for i in range(5)
        )
    except (requests.RequestException, LookupError, TypeError):
        # LookupError covers missing tags / missing colon (IndexError) and a
        # missing download link; TypeError covers an <a> without an href.
        print("appdeinfo  exception", app_url)
        return None
    return (subclass_name, app_name, score, counts, size, author, updated,
            version, system, language, app_url, load_url)


def _crawl_category(connect, cursor, href, subclass_name, page, head):
    """Scrape one category's listing pages and insert the rows that parse."""
    for page_no in range(1, page + 1):
        list_url = _ZHUSHOU_BASE_URL + href + '?page=' + str(page_no)
        try:
            icon_lists = _fetch_soup(list_url, head).find_all(
                'ul', {'class': 'iconList'})
            app_items = icon_lists[0].find_all('li')
        except (requests.RequestException, IndexError):
            print("appsubcalss exception", list_url)
            continue
        for app_li in app_items:
            data = _scrape_app(app_li, subclass_name, head)
            if data is None:
                continue
            try:
                cursor.execute(_INSERT_APP_SQL, data)
                connect.commit()
            except pymysql.MySQLError:
                print('数据库存储异常', data)
                connect.rollback()
            # NOTE(review): the original ended with two consecutive `break`
            # statements whose nesting was lost with the file's indentation.
            # They are assumed to stop after the first stored app and after
            # the first listing page; remove both to crawl everything.
            break
        break


def GetAppinfo(urlhead, page):
    """Crawl 360 app-store category listings and store app details in MySQL.

    Fetches ``urlhead``, takes the category links from its first
    ``ul.select`` element, and for every link whose href contains
    ``/list/index/cid`` (except ``/list/index/cid/1/`` itself) scrapes the
    category's listing pages and inserts one ``AppInfo`` row per app.

    Args:
        urlhead: URL of the page that lists the categories.
        page: Highest listing page number to request per category; pages
            are numbered from 1.

    Returns:
        None. Rows are written to the ``AppInfo`` table; every failure is
        reported with ``print`` and the affected page or app is skipped.
    """
    # Browser-like request header.
    head = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0',
    }
    try:
        selectors = _fetch_soup(urlhead, head).find_all('ul', {"class": "select"})
        category_links = selectors[0].find_all('a')
    except (requests.RequestException, IndexError):
        print("父类标签页面，出现异常，终止")
        # Nothing to crawl without the category list, so stop here rather
        # than continuing with an undefined link list.
        return

    # NOTE(review): connection settings are hard-coded, as in the original.
    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='cecgw',
        db='app',
        charset='utf8',
    )
    try:
        cursor = connect.cursor()
        try:
            for link in category_links:
                href = link.get('href')
                if not href or '/list/index/cid' not in href:
                    continue
                if href == '/list/index/cid/1/':
                    continue
                _crawl_category(connect, cursor, href, link.get_text(),
                                page, head)
        finally:
            cursor.close()
    finally:
        # Always release the connection, even if the crawl raises.
        connect.close()
def appDownload(url):
    """Download ``url`` into the ``pak`` directory next to the working directory.

    The file is saved as ``<parent of cwd>/pak/<last path segment of url>``.
    The ``pak`` directory is created if it does not exist yet.

    Args:
        url: Address of the file to download; its last ``/``-separated
            segment (stripped of surrounding whitespace) is the file name.

    Returns:
        None. A confirmation line is printed after the file is written.
    """
    file_name = url.split('/')[-1].strip()
    target_dir = os.path.join(os.path.abspath(".."), 'pak')
    block_sz = 8192
    # Context managers close the response and the file even when a read or
    # write fails part-way through.
    with urlopen(url) as response:
        os.makedirs(target_dir, exist_ok=True)
        with open(os.path.join(target_dir, file_name), 'wb') as out_file:
            # Copy in fixed-size chunks so large APKs are never held in memory.
            while True:
                buffer = response.read(block_sz)
                if not buffer:
                    break
                out_file.write(buffer)
    print("Sucessful to download" + " " + file_name)
if __name__ == '__main__':
    # Script entry point: start the crawl from the cid/1 listing page and
    # request listing pages up to page 1.
    url, page = 'http://zhushou.360.cn/list/index/cid/1/', 1
    app_dict = GetAppinfo(url, page)