本文主要介绍使用 python3.6 + BeautifulSoup4 爬取360手机助手 app 应用的信息并存储到数据库,以及批量下载 apk,希望对大家解决编程问题提供一定的参考价值,需要的开发者们随着小编来一起学习吧!
源码:
#!/usr/bin/python
# encoding: utf-8
"""
Created on 2018-01-12

Crawl app metadata from the 360 mobile assistant site (zhushou.360.cn),
store each app's details in a MySQL table, and optionally download APKs.

@author: xianqingchen
"""
import os

import requests
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pymysql

# Browser-like User-Agent so the site serves the normal HTML pages.
_HEADERS = {
    'User-Agent': ('Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) '
                   'Gecko/20100101 Firefox/52.0'),
}

_BASE_URL = 'http://zhushou.360.cn'

# Parameterized INSERT: values are bound by the driver instead of being
# interpolated into the SQL text (the original `sql % data` breaks on any
# value containing a quote and is an injection risk).
_INSERT_SQL = (
    "INSERT INTO AppInfo(subclass, appname, score, counts, size, author, "
    "update1, version, supsystem, language1, appurl, loadurl) "
    "VALUES (%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s)"
)


def _fetch_soup(url):
    """GET *url* with the browser headers and return the parsed document."""
    response = requests.get(url=url, headers=_HEADERS)
    return BeautifulSoup(response.text, 'lxml')


def _category_links(urlhead):
    """Return the <a> tags of the category list (<ul class="select">)."""
    soup = _fetch_soup(urlhead)
    return soup.find_all('ul', {'class': 'select'})[0].find_all('a')


def _extract_detail(appurl):
    """Fetch an app detail page and return its metadata fields as a tuple.

    Returns (score, download_count, size, author, date, version,
    supported_system, language).  Raises requests.RequestException or
    IndexError when the page is unreachable or its layout changed.
    """
    soup = _fetch_soup(appurl)
    pf = soup.find_all('div', {'class': 'pf'})[0]
    breif = soup.find_all('div', {'class': 'breif'})[0]
    spans = pf.find_all('span')
    tds = breif.find_all('td')
    # The labels are "key:value" pairs; keep only the value part.
    return (
        spans[0].get_text(),                      # score
        spans[3].get_text().split(':')[1],        # download count
        spans[4].get_text(),                      # size
        tds[0].get_text().split(':')[1],          # author
        tds[1].get_text().split(':')[1],          # update date
        tds[2].get_text().split(':')[1],          # version
        tds[3].get_text().split(':')[1],          # supported system
        tds[4].get_text().split(':')[1],          # language
    )


def GetAppinfo(urlhead, page):
    """Crawl every app category reachable from *urlhead*.

    For each category link (except the aggregate '/list/index/cid/1/' entry)
    the first *page* listing pages are fetched; every app found has its
    detail page scraped and one row inserted into the MySQL table AppInfo.

    :param urlhead: entry URL of the category index page.
    :param page: number of listing pages to crawl per category.
    """
    try:
        category_links = _category_links(urlhead)
    except (requests.RequestException, IndexError) as exc:
        # Without the category page there is nothing to crawl; return
        # instead of falling through with undefined variables (the original
        # bare `except:` printed and then crashed with NameError).
        print("父类标签页面,出现异常,终止", exc)
        return

    connect = pymysql.Connect(
        host='localhost',
        port=3306,
        user='root',
        passwd='cecgw',
        db='app',
        charset='utf8',
    )
    cursor = connect.cursor()
    try:
        for link in category_links:
            href = link.get('href')
            # Only real category links; skip the aggregate cid/1 category.
            if href.find('/list/index/cid') == -1 or href == '/list/index/cid/1/':
                continue
            appsubclassname = link.get_text()
            for page_no in range(1, page + 1):
                listing_url = _BASE_URL + href + '?page=' + str(page_no)
                try:
                    soup = _fetch_soup(listing_url)
                    # dict attrs, not the original set literal {'class','iconList'}
                    app_items = (soup.find_all('ul', {'class': 'iconList'})[0]
                                 .find_all('li'))
                except (requests.RequestException, IndexError):
                    print("appsubcalss exception", listing_url)
                    continue  # skip this page rather than reuse stale results
                for item in app_items:
                    title = item.find_all('h3')[0]
                    app_name = title.get_text()
                    appurl = _BASE_URL + title.find_all('a')[0].get('href')
                    for anchor in item.find_all('a'):
                        classes = anchor.get('class')
                        # The download button is the anchor whose third CSS
                        # class contains 'normal'; its href carries the real
                        # APK URL in a 'url=' query parameter.
                        if (classes is None or len(classes) != 3
                                or classes[2].find('normal') == -1):
                            continue
                        app_loadurl = anchor.get('href').split('url=')[1]
                        try:
                            detail = _extract_detail(appurl)
                        except (requests.RequestException, IndexError):
                            print("appdeinfo exception", appurl)
                            break
                        data = ((appsubclassname, app_name) + detail
                                + (appurl, app_loadurl))
                        try:
                            cursor.execute(_INSERT_SQL, data)
                            connect.commit()
                        except pymysql.MySQLError:
                            print('数据库存储异常', data)
                        break  # one download link per app is enough
    finally:
        # Always release the DB resources, even when the crawl raises.
        cursor.close()
        connect.close()


def appDownload(url):
    """Download one APK from *url* into the sibling directory ../pak/.

    The file name is taken from the last path segment of the URL.  The
    with-blocks guarantee both the connection and the output file are
    closed even if the transfer fails mid-way (the original leaked both
    on error).
    """
    file_name = url.split('/')[-1].strip()
    path = os.path.abspath("..") + '/pak/'
    block_sz = 8192
    with urlopen(url) as response, open(path + file_name, 'wb') as out:
        while True:
            buffer = response.read(block_sz)
            if not buffer:
                break
            out.write(buffer)
    print("Sucessful to download" + " " + file_name)


if __name__ == '__main__':
    url = 'http://zhushou.360.cn/list/index/cid/1/'
    page = 1
    GetAppinfo(url, page)
下载结果:
这篇关于 python3.6 + BeautifulSoup4 爬取360手机助手 app 应用的信息并存储到数据库、批量下载 apk 的文章就介绍到这儿,希望我们推荐的文章对开发者们有所帮助!