学了几天 Python 爬虫,做了一个无聊的小爬虫……
# -*- coding: utf-8 -*-
"""Scrape the Douban Movie front page and save titles with ratings as HTML.

The script downloads https://movie.douban.com/, pulls the "now showing"
movie titles and the rating snippets out of the markup with regular
expressions, and writes them as a bullet list to ``douban.html`` in the
current working directory.
"""
import re
import urllib.request

DOUBAN_URL = "https://movie.douban.com/"
OUTPUT_PATH = "douban.html"

# Anchor text of every "now showing" movie link; group 1 is the title.
TITLE_RE = re.compile(
    r'href="https://movie\.douban\.com/subject/\d+?/\?from=showing" class="">(.+?)</a>'
)

# A numeric score (group 1) or the "no rating yet" placeholder (group 2).
RATING_RE = re.compile(
    r'<span class="(?:subject-rate|rating-type-score)">(\d\.\d)</span>'
    r'|<span class="text-tip">(暂无评分)</span>'
)

PAGE_HEAD = """<!DOCTYPE html>
<html>
<head>
<meta charset="utf-8">
<style>
h3{
color:#71c084
}
</style>
</head>
<body>
<h3>豆瓣网热门集锦-Powered by python</h3>
<ul>
"""

PAGE_TAIL = """</ul>
</body>
</html>
"""


def fetch_page(url=DOUBAN_URL, timeout=30):
    """Download *url* and return its body decoded as UTF-8.

    The response is closed as soon as it has been read, and *timeout*
    (seconds) keeps a stalled connection from hanging the script forever.
    """
    with urllib.request.urlopen(url, timeout=timeout) as response:
        return response.read().decode("utf-8")


def parse_titles(markup):
    """Return the titles of all "now showing" movie links found in *markup*."""
    return TITLE_RE.findall(markup)


def parse_ratings(markup):
    """Return every rating string found in *markup*, in document order.

    Each entry is either a score such as ``"8.5"`` or the literal
    placeholder the site shows when a movie has no rating yet.
    """
    # Exactly one of the two groups is set for any match.
    return [m.group(1) or m.group(2) for m in RATING_RE.finditer(markup)]


def render_page(titles, ratings):
    """Build the complete HTML report pairing *titles* with *ratings*.

    Titles and ratings are matched independently, so they are paired purely
    by position; ``zip`` drops the surplus of whichever list is longer.
    The strings are inserted verbatim because they were cut out of HTML
    source and are therefore already valid markup.
    """
    items = "".join(
        "<li>" + title + "\t" + rating + "</li>\n"
        for title, rating in zip(titles, ratings)
    )
    return PAGE_HEAD + items + PAGE_TAIL


def main():
    """Fetch the front page, report the match counts and write the report."""
    markup = fetch_page()
    titles = parse_titles(markup)
    ratings = parse_ratings(markup)

    # Differing counts mean the positional pairing in the report is off.
    print(len(titles), len(ratings))

    with open(OUTPUT_PATH, "w", encoding="utf-8") as out:
        out.write(render_page(titles, ratings))


if __name__ == "__main__":
    main()
效果是这样滴: