本文主要介绍 Python 爬虫——爬取小说,希望能为大家解决编程问题提供一定的参考价值,需要的开发者们跟着小编一起来学习吧!
# 导入BeautifulSoup
from bs4 import BeautifulSoup as bf
from fastapi import FastAPI,Form,File
import time
import random
import requests
import traceback

# FastAPI application object. No routes are registered on it in this script.
app = FastAPI(title='爬虫',description='regex web: https://regexr-cn.com/ \n eg : <a href="https://www.zbytb.com/s-zb-.*?</a> \n eg : <a href="[./].*?</a>',version='1.0.0')

# Pool of request-header dicts; getHeader() picks one at random per request.
headers = [
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"},
    {"User-Agent": "Opera/9.25 (Windows NT 5.1; U; en)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"},
    {"User-Agent": "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"},
    {"User-Agent": "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12"},
    {"User-Agent": "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9"},
    {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"},
    {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "},
]

# Proxy pool read by getProxy(); empty in this script.
proxys = []

# Table-of-contents page whose links are followed by start().
INDEX_URL = 'https://www.xjwxsw.com/xsmulu/27614204/'
# Prefix prepended to the href of every chapter link.
CHAPTER_URL_PREFIX = 'https://www.xjwxsw.com'
# Text file the chapters are appended to.
OUTPUT_PATH = 'C://Users//admin//Desktop//777.txt'
# Seconds passed to requests as the timeout, so a stalled server cannot
# block the run forever.
REQUEST_TIMEOUT = 30


def wait():
    """Sleep for 0.2 seconds."""
    time.sleep(0.2)


def getHeader():
    """Return one randomly chosen header dict from ``headers``."""
    return random.choice(headers)


def getProxy():
    """Return one randomly chosen entry from ``proxys``.

    Raises:
        IndexError: if ``proxys`` is empty (``random.choice`` on an empty
            sequence), which is the case with the list as defined here.
    """
    return random.choice(proxys)


def parseUrl(url):
    """Return ``url`` without a leading ``'./'``.

    Only the prefix is removed; any later ``'./'`` inside the URL is left
    untouched, so ``'./../x'`` becomes ``'../x'``.
    """
    if url.startswith('./'):
        return url[2:]
    return url


def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with ``html.parser``."""
    response = requests.get(url, headers=getHeader(), timeout=REQUEST_TIMEOUT)
    # Decode the body with the encoding detected from its content rather
    # than the one announced in the response headers.
    response.encoding = response.apparent_encoding
    return bf(response.text, 'html.parser')


def _write_paragraphs(out, soup, container_id):
    """Write the text of every <p> in the first matching <div>, one per line.

    Raises:
        IndexError: if the page has no <div> with id ``container_id``.
    """
    for paragraph in soup.find_all('div', id=container_id)[0].find_all('p'):
        out.write(paragraph.text)
        out.write("\n")


def start():
    """Append every chapter linked from INDEX_URL to OUTPUT_PATH.

    For each <a> inside the first <div id="content_1"> of the index page,
    the link text is printed and written as a title line, then two pages
    are fetched: the linked page, and a second URL built by replacing
    everything from the first ``.html`` onward with ``_2.html``. The <p>
    texts inside <div id="booktxt"> of each page are written one per line.

    Any exception stops the run and its traceback is printed; nothing is
    re-raised. The output file is managed by ``with``, so it is closed
    whenever it was opened and a failure before opening it no longer
    triggers a second error during cleanup.
    """
    try:
        index = _fetch_soup(INDEX_URL)
        chapter_links = index.find_all('div', id='content_1')[0].find_all('a')
        with open(OUTPUT_PATH, "a", encoding='utf-8') as out:
            for link in chapter_links:
                title = link.text
                print(title)
                out.write(title)
                out.write("\n")
                first_page = CHAPTER_URL_PREFIX + link.get('href')
                second_page = first_page.split('.html')[0] + '_2.html'
                for page_url in (first_page, second_page):
                    _write_paragraphs(out, _fetch_soup(page_url), 'booktxt')
    except Exception:
        # Top-level boundary of the script: report the failure and stop.
        traceback.print_exc()


if __name__ == '__main__':
    start()
# 导入BeautifulSoup
from bs4 import BeautifulSoup as bf
from fastapi import FastAPI,Form,File
import time
import random
import requests
import traceback

# FastAPI application object. No routes are registered on it in this script.
app = FastAPI(title='爬虫',description='regex web: https://regexr-cn.com/ \n eg : <a href="https://www.zbytb.com/s-zb-.*?</a> \n eg : <a href="[./].*?</a>',version='1.0.0')

# Pool of request-header dicts; getHeader() picks one at random per request.
headers = [
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.3; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/35.0.1916.153 Safari/537.36"},
    {"User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:30.0) Gecko/20100101 Firefox/30.0"},
    {"User-Agent": "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_2) AppleWebKit/537.75.14 (KHTML, like Gecko) Version/7.0.3 Safari/537.75.14"},
    {"User-Agent": "Mozilla/5.0 (compatible; MSIE 10.0; Windows NT 6.2; Win64; x64; Trident/6.0)"},
    {"User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; it; rv:1.8.1.11) Gecko/20071127 Firefox/2.0.0.11"},
    {"User-Agent": "Opera/9.25 (Windows NT 5.1; U; en)"},
    {"User-Agent": "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)"},
    {"User-Agent": "Mozilla/5.0 (compatible; Konqueror/3.5; Linux) KHTML/3.5.5 (like Gecko) (Kubuntu)"},
    {"User-Agent": "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12"},
    {"User-Agent": "Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9"},
    {"User-Agent": "Mozilla/5.0 (X11; Linux i686) AppleWebKit/535.7 (KHTML, like Gecko) Ubuntu/11.04 Chromium/16.0.912.77 Chrome/16.0.912.77 Safari/535.7"},
    {"User-Agent": "Mozilla/5.0 (X11; Ubuntu; Linux i686; rv:10.0) Gecko/20100101 Firefox/10.0 "},
]

# Proxy pool read by getProxy(); empty in this script.
proxys = []

# Table-of-contents page whose links are followed by start().
INDEX_URL = 'https://www.uuks5.com/book/766295/'
# Prefix prepended to the href of every chapter link.
CHAPTER_URL_PREFIX = 'https://www.uuks5.com/'
# Text file the chapters are appended to.
OUTPUT_PATH = 'C://Users//admin//Desktop//123.txt'
# Seconds passed to requests as the timeout, so a stalled server cannot
# block the run forever.
REQUEST_TIMEOUT = 30


def wait():
    """Sleep for 0.2 seconds."""
    time.sleep(0.2)


def getHeader():
    """Return one randomly chosen header dict from ``headers``."""
    return random.choice(headers)


def getProxy():
    """Return one randomly chosen entry from ``proxys``.

    Raises:
        IndexError: if ``proxys`` is empty (``random.choice`` on an empty
            sequence), which is the case with the list as defined here.
    """
    return random.choice(proxys)


def parseUrl(url):
    """Return ``url`` without a leading ``'./'``.

    Only the prefix is removed; any later ``'./'`` inside the URL is left
    untouched, so ``'./../x'`` becomes ``'../x'``.
    """
    if url.startswith('./'):
        return url[2:]
    return url


def _fetch_soup(url):
    """Download ``url`` and return its HTML parsed with ``html.parser``."""
    response = requests.get(url, headers=getHeader(), timeout=REQUEST_TIMEOUT)
    # Decode the body with the encoding detected from its content rather
    # than the one announced in the response headers.
    response.encoding = response.apparent_encoding
    return bf(response.text, 'html.parser')


def _write_paragraphs(out, soup, container_id):
    """Write the text of every <p> in the first matching <div>, one per line.

    Raises:
        IndexError: if the page has no <div> with id ``container_id``.
    """
    for paragraph in soup.find_all('div', id=container_id)[0].find_all('p'):
        out.write(paragraph.text)
        out.write("\n")


def start():
    """Append every chapter linked from INDEX_URL to OUTPUT_PATH.

    For each <a> inside the first <ul id="chapterList"> of the index page,
    the link text is printed and written as a title line, the linked page
    is fetched, and the <p> texts inside its <div id="TextContent"> are
    written one per line.

    Any exception stops the run and its traceback is printed; nothing is
    re-raised. The output file is managed by ``with``, so it is closed
    whenever it was opened and a failure before opening it no longer
    triggers a second error during cleanup.
    """
    try:
        index = _fetch_soup(INDEX_URL)
        chapter_links = index.find_all('ul', id='chapterList')[0].find_all('a')
        with open(OUTPUT_PATH, "a", encoding='utf-8') as out:
            for link in chapter_links:
                title = link.text
                print(title)
                out.write(title)
                out.write("\n")
                chapter_url = CHAPTER_URL_PREFIX + link.get('href')
                _write_paragraphs(out, _fetch_soup(chapter_url), 'TextContent')
    except Exception:
        # Top-level boundary of the script: report the failure and stop.
        traceback.print_exc()


if __name__ == '__main__':
    start()
这篇关于 Python 爬虫——爬取小说的文章就介绍到这儿,希望我们推荐的文章对各位开发者有所帮助!