爬虫综合大作业---分析《我不是药神》电影短评

本文主要是介绍爬虫综合大作业---分析《我不是药神》电影短评，希望对大家解决编程问题提供一定的参考价值，需要的开发者们随着小编来一起学习吧！

作业要求来源：https://edu.cnblogs.com/campus/gzcc/GZCC-16SE2/homework/3075

《我不是药神》是这几年来我所看的电影里感触最大的，甚至成为豆瓣上多年来仅有的六部9.0评分以上的华语电影之一。

影片讲述了神油店老板程勇从一个交不起房租的男性保健品商贩，一跃成为印度仿制药“格列宁”独家代理商的故事

获取用户名和短评

name=x.xpath('//*[@id="comments"]/div[{}]/div[2]/h3/span[2]/a/text()'.format(i))
content=x.xpath('//*[@id="comments"]/div[{}]/div[2]/p/text()'.format(i))

主要代码

var logMap = {}
var fs = require('fs');
var iconv = require('iconv-lite');
var logger = fs.createWriteStream('./urlLog.log', {flags: 'a' // 'a' means appending (old data will be preserved)
})
function logPageFile(url) {if (!logMap[url]) {logMap[url] = true;logger.write(url + '\r\n');}
}
function postData(post_data, path, cb) {// // Build the post string from an object// var post_data = JSON.stringify({//     'data': data// });// An object of options to indicate where to post tovar post_options = {host: '127.0.0.1',port: '9999',path: '/' + path,method: 'POST',headers: {'Content-Type': 'application/json','Content-Length': Buffer.byteLength(post_data)}};var http = require('http');// Set up the requestvar post_req = http.request(post_options, function (res) {res.setEncoding('utf8');res.on('data', cb);});logger.write('request post data 1\r\n')// post the data
    post_req.write(post_data);logger.write('request post data 2\r\n')post_req.end();
}module.exports = {summary: 'a rule to modify response',* beforeSendResponse(requestDetail, responseDetail) {if (/movie\/1200486/i.test(requestDetail.url)) {logger.write('matched: ' + requestDetail.url + '\r\n');if (responseDetail.response.toString() !== "") {logger.write(responseDetail.response.body.toString());var post_data = JSON.stringify({'url': requestDetail.url,'body': responseDetail.response.body.toString()});logger.write("post comment to server -- ext");postData(post_data, 'douban_comment', function (chunk) {});}}},
};

import requests
from bs4 import BeautifulSoup
import time
import jieba
from wordcloud import WordCloud
from PIL import Image
import numpy as np
import matplotlib.pyplot as plotdef getHtml(url):try:r = requests.get(url,headers={'User-Agent': 'Mozilla/5.0'})r.raise_for_status()r.encoding = "utf-8"return r.textexcept:print("Failed!!!")f = open("E:/movieComment.txt",'wb+')
def getData(html):soup = BeautifulSoup(html,"html.parser")comment_list = soup.find('div',attrs={'class':'mod-bd'})for comment in comment_list.find_all('div',attrs={'class':'comment-item'}):comment_content = comment.find('span',attrs={'class':'short'}).get_text()f.write(comment_content.encode('UTF-8'))def seg_sentence():#创建停用词列表filefath = 'E:/stopwords.txt'stopwords = [line.strip() for line in open(filefath,'r').readlines()]#实现句子的分词final = ''fn1 = open("E:/movieComment.txt", 'r',encoding='utf-8').read() #加载爬取的内容sentence_seged = jieba.cut(fn1,cut_all=False) #结巴分词：精确模式fn2 = open("E:/new.txt", "w", encoding='utf-8')for word in sentence_seged:if word not in stopwords:if word != '\t':final +=wordfinal +=" "fn2.write(final)   #写入去掉停用词的内容def wordcloud():# 加载图片image = Image.open("E:/wc.jpg", 'r')img = np.array(image)# 词云cut_text = open('E:/new.txt', 'r', encoding='utf-8').read()  # 加载去掉停用词的内容wordcloud = WordCloud(mask=img,  # 使用该参数自动忽略height,widthheight=2000,  # 设置图片高度width=4000,  # 设置图片宽度background_color='white',max_words=1000,  # 设置最大词数max_font_size=400,font_path="C:\Windows\Fonts\msyh.ttc",  # 如有口型乱码问题,可进入目录更换字体).generate(cut_text)# 显示图片plot.imshow(wordcloud, interpolation='bilinear')plot.axis('off')  # 去掉坐标轴plot.show()        #直接显示#plot.savefig('E:/wc1.jpg') #存为图片def main():# 翻页处理 : max(start)=200k = 0  #start = ki = 0while k <200:url = 'https://movie.douban.com/subject/26752088/comments?start=' + str(k) + '&limit=20&sort=new_score&status=P'k += 20i += 1print("正在爬取第" + str(i) + "页的数据")#time.sleep(2) # 设置睡眠时间html = getHtml(url)getData(html)seg_sentence()wordcloud()
if __name__ == "__main__":main()