node.js学习笔记--HTTP之小爬虫

本文是一篇 Node.js 学习笔记：先用 http 模块创建一个最简单的 HTTP 服务器，再用 http.get 编写一个抓取网页的小爬虫，最后借助 cheerio 解析页面、提取所需数据，供有需要的开发者参考。

注：此博客是在学习进击Node.js基础（一）这门课程时的学习笔记，感谢Scott老师的课程。

一、创建HTTP服务器

var http = require('http') // load Node's built-in http module

// Create an HTTP server; the callback runs for every incoming request.
// req is the incoming request, res is the response sent back to the client.
http.createServer(function (req, res) {
  res.writeHead(200, { 'Content-Type': 'text/plain' }) // response header: status 200, plain-text content
  res.write('Hello Nodejs') // response body
  res.end() // finish the response
}).listen(2018) // start listening on port 2018

二、写一个小爬虫

var http = require('http')
var url = 'http://www.imooc.com/learn/348'

// Fetch the page at `url` and print its raw HTML once the whole body has arrived.
http.get(url, function (res) {
  var html = ''

  // Decode at the stream level so that a multi-byte character split across
  // two chunks is not corrupted when the chunks are concatenated.
  // Without this, each chunk is a Buffer that is decoded on its own by `+=`.
  res.setEncoding('utf8')

  // 'data' fires every time a chunk is received; keep appending to html.
  res.on('data', function (data) {
    html += data
  })

  // 'end' fires after the last chunk, so html is complete here.
  res.on('end', function () {
    console.log(html)
  })
}).on('error', function (err) {
  // The request returned by http.get emits 'error' when the request fails;
  // log the underlying reason as well instead of discarding it.
  console.log('获取课程数据出错', err.message)
})

不过这里的 html 只是一整段未经解析的 HTML 字符串，而我们需要按标签和选择器定位元素才能提取数据，所以再写一个升级版。

三、写一个升级版小爬虫

爬取慕课网上这门课的各章节标题，以及每章中各个视频的标题和ID

var http = require('http')
var cheerio = require('cheerio')   //一个像JQuery语法一样可以提供快捷检索的库
var url = 'http://www.imooc.com/learn/348'

/**
 * Extract chapter and video information from the course page HTML.
 *
 * Shape of the returned data:
 * [{
 *   chapterTitle: '',
 *   videos: [{ title: '', id: '' }]
 * }]
 *
 * @param {string} html - HTML source of the course page.
 * @returns {Array} One entry per `.mod-chapters` element found in the page.
 */
function filterChapters(html) {
  var $ = cheerio.load(html)
  var chapters = $('.mod-chapters')
  var courseData = []

  // Walk every chapter element.
  chapters.each(function () {
    var chapter = $(this) // the current chapter element
    var chapterTitle = chapter.find('strong').text()
    var videos = chapter.find('.video').children('li')
    var chapterData = {
      chapterTitle: chapterTitle,
      videos: []
    }

    // Walk every video entry of this chapter.
    videos.each(function () {
      var video = $(this).find('.J-media-item')
      var href = video.attr('href')

      // An entry without a `.J-media-item` link has no href; skip it instead
      // of throwing on `undefined.split`. Returning undefined continues the loop.
      if (!href) {
        return
      }

      var videosTitle = video.text() // all text content under the element
      var id = href.split('video/')[1] // the video id is whatever follows "video/" in the link

      chapterData.videos.push({
        title: videosTitle,
        id: id
      })
    })

    courseData.push(chapterData) // store the assembled chapter
  })

  return courseData
}

/**
 * Print each chapter title followed by its videos as "(id)title".
 *
 * @param {Array} courseData - Data in the shape returned by filterChapters.
 */
function printCourseInfo(courseData) {
  courseData.forEach(function (item) {
    var chapterTitle = item.chapterTitle

    console.log(chapterTitle + '\n')

    item.videos.forEach(function (video) {
      console.log('(' + video.id + ')' + video.title + '\n')
    })
  })
}

// Fetch the course page, then parse it and print the result once the whole
// body has arrived.
http.get(url, function (res) {
  var html = ''

  // Decode at the stream level so that a multi-byte character split across
  // two chunks is not corrupted when the chunks are concatenated.
  res.setEncoding('utf8')

  // 'data' fires every time a chunk is received; keep appending to html.
  res.on('data', function (data) {
    html += data
  })

  // 'end' fires after the last chunk, so html is complete here.
  res.on('end', function () {
    var courseData = filterChapters(html)
    printCourseInfo(courseData)
  })
}).on('error', function (err) {
  // The request returned by http.get emits 'error' when the request fails;
  // log the underlying reason as well instead of discarding it.
  console.log('获取课程数据出错', err.message)
})