网络爬虫(又被称为网页蜘蛛、网络机器人,在FOAF社区中更常被称为网页追逐者)是一种按照一定的规则自动抓取万维网信息的程序或者脚本。另外一些不常使用的名字还有蚂蚁、自动索引、模拟程序或者蠕虫。本文使用Node.js的http模块实现一个小爬虫。

HTTP小爬虫

// Minimal crawler: fetch one page over HTTP and print its full body.
var http = require('http');
// Placeholder: set this to the address of the page to crawl.
var url = "";
http.get(url, function(res){
	var html = '';

	// Decode chunks as text so a multi-byte character split across two
	// chunks is not garbled (assumes the page is UTF-8 encoded).
	res.setEncoding('utf8');

	// The body arrives in pieces; accumulate them until the stream ends.
	res.on('data', function(chunk){
		html += chunk;
	});

	res.on('end', function(){
		console.log(html);
	});
}).on('error', function(err){
	// Report what actually went wrong instead of a bare "error".
	console.error('error: ' + err.message);
});

解析html结构

// Crawler with parsing: fetch one page, extract the course outline from its
// HTML with filterHtml, then print it with printCourse.
var http = require('http');
var cheerio = require('cheerio'); // External module; it must be installed before running.
// Placeholder: set this to the address of the course page to crawl.
var url = "";
http.get(url, function(res){
	var html = '';

	// Decode chunks as text so a multi-byte character split across two
	// chunks is not garbled (assumes the page is UTF-8 encoded).
	res.setEncoding('utf8');

	// The body arrives in pieces; accumulate them until the stream ends.
	res.on('data', function(chunk){
		html += chunk;
	});

	// Only parse once the whole document has been received.
	res.on('end', function(){
		var courseData = filterHtml(html);
		printCourse(courseData);
	});
}).on('error', function(err){
	// Report what actually went wrong instead of a bare "error".
	console.error('error: ' + err.message);
});
/**
 * Parse the HTML of a course page into a list of chapters with their videos.
 *
 * Relies on the `cheerio` module being in scope.
 *
 * @param {string} html - Raw HTML of the course page.
 * @returns {Array<{chapterTitle: string, videos: Array<{title: string, id: (string|undefined)}>}>}
 *   One entry per matched chapter element. A video's `id` is `undefined`
 *   when its link has no href or the href contains no "video/" segment.
 */
function filterHtml(html){
	var $ = cheerio.load(html);
	// NOTE(review): the chapter selector is blank in this snippet; it looks
	// like a placeholder for the selector of each chapter container - confirm.
	var chapters = $("");
	var courseData = [];
	chapters.each(function(){
		var chapter = $(this);
		var chapterTitle = chapter.find('strong').text();
		var videos = chapter.find('.video').children('li');
		// A single record per chapter (not an array), so consumers can read
		// chapterTitle and videos directly from each courseData entry.
		var chapterData = {
			chapterTitle: chapterTitle,
			videos: []
		};
		videos.each(function(){
			var video = $(this).find('.studyvideo');
			var videoTitle = video.text();
			// attr() yields undefined when the attribute is missing; guard so
			// one malformed entry does not abort the whole parse.
			var href = video.attr('href');
			var id = href ? href.split('video/')[1] : undefined;
			chapterData.videos.push({
				title: videoTitle,
				id: id
			});
		});
		courseData.push(chapterData);
	});
	// Hand the collected chapters back to the caller.
	return courseData;
}

/**
 * Print a parsed course outline to the console: each chapter title, followed
 * by one "[id]title" line per video in that chapter.
 *
 * @param {Array<{chapterTitle: string, videos: Array<{id: string, title: string}>}>} course
 *   Chapter records to print. A missing value prints nothing.
 * @returns {void}
 */
function printCourse(course){
	// Tolerate a missing course or videos list instead of throwing.
	(course || []).forEach(function(item){
		var cTitle = item.chapterTitle;
		console.log(cTitle + '\n');
		(item.videos || []).forEach(function(video){
			console.log('[' + video.id + ']' + video.title + '\n');
		});
	});
}