之前见过别人写的抓取图片的 Python 脚本，自己之前用正则写过，最近看到 BeautifulSoup，所以拿来练练手。

# -*- coding:utf8 -*-
from bs4 import BeautifulSoup
import os, sys, urllib2,time,randompath = os.getcwd()                     
new_path = os.path.join(path,u'sexy')
if not os.path.isdir(new_path):os.mkdir(new_path)def page_loop(page=1):url = 'http://sexy.faceks.com/tag/美女摄影?page=%s' % pageprint urlcontent = urllib2.urlopen(url)soup = BeautifulSoup(content)my_girl = soup.findAll('a',attrs={'class':'img'})#先获取首页每个美女图片的进入链接for girl in my_girl:#link = girl.get('src')girlink = girl.get('href') print girlinkresponse = urllib2.urlopen(girlink)per_soup = BeautifulSoup(response)img_urls = per_soup.findAll('img',attrs={'class':None})#print img_urlsfor img_url in img_urls: #获取单个美女的所有图片链接 girlurl = img_url.get('src') print girlurl content2 = urllib2.urlopen(girlurl).read()with open(u'sexy'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999)),'wb') as code:code.write(content2)
page_loop()


效果图如下:

wKioL1YJDgDDT0EUAALvokfzBTI986.jpg

# -*- coding:utf8 -*-
# __author__ = 'jony'
from bs4 import BeautifulSoup
import os, sys, urllib2,time,random
import re


def GetUrl():
    """Fetch the gallery listing page and return the per-gallery URLs.

    Returns a list of href strings, one for each <a class="MMPic"> entry
    found inside the <div class="MeinvTuPianBox"> containers.
    """
    url = 'http://www.27270.com/ent/meinvtupian/'
    header = {'User-Agent': 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}
    request = urllib2.Request(url, None, header)
    response = urllib2.urlopen(request, None, timeout=10).read()
    # The target markup spans multiple lines, so a single-line regex cannot
    # capture it; parse with BeautifulSoup instead.  gb18030 is a superset
    # of gb2312 and avoids "Some characters could not be decoded" warnings.
    soup = BeautifulSoup(response, "html.parser", from_encoding="gb18030")
    urls = []
    # Outer filter: every gallery thumbnail lives in a MeinvTuPianBox div.
    for box in soup.find_all('div', attrs={'class': 'MeinvTuPianBox'}):
        # Inner filter: the link itself is an <a class="MMPic"> (an <a>,
        # not a div this time).
        for link in box.findAll('a', attrs={'class': 'MMPic'}):
            urls.append(link.get('href'))
    return urls
def GetImage(*urls):header = {'User-Agent' : 'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0)'}pattern = re.compile(r'<img alt=".*" src="(.*)" />')for url in urls:print url#获取初始的页面的图片try:request = urllib2.Request(url,None,header)response = urllib2.urlopen(request).read()girlink = pattern.search(response).group(1)print girlink  req = urllib2.Request(girlink,None,header)res = urllib2.urlopen(req,None,timeout=10).read()with open(u'PICTURE'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999))+u'.jpg','wb') as code:code.write(res)except:continue          #http://www.27270.com/ent/meinvtupian/2016/156239_20.html 第二十张图片的网址orignurl=url.split('.html')[0]for i in range(2,15):picurl = '%s_%s.html' % (orignurl,i)#print picurltry:request = urllib2.Request(picurl,None,header)response = urllib2.urlopen(request).read()girlink = pattern.search(response).group(1)print girlink  req = urllib2.Request(girlink,None,header)res = urllib2.urlopen(req,None,timeout=10).read()with open(u'PICTURE'+'/'+time.strftime('%H%M%S')+str(random.randint(1000,9999))+u'.jpg','wb') as code:code.write(res)except:continue                 
if __name__ == '__main__':
    # Make sure the output directory ./PICTURE exists before downloading.
    working_dir = os.getcwd()
    picture_dir = os.path.join(working_dir, u'PICTURE')
    if not os.path.isdir(picture_dir):
        os.mkdir(picture_dir)
    gallery_links = GetUrl()
    GetImage(*gallery_links)