在完成了搜索功能后,接下来要尝试实现上次助教所说的“一个网页同时具备爬虫和搜索功能”。对于这一点我作了如下的构想:
我的网页主页是一个搜索页面也就是前面已经做好的页面,然后在此基础上用CSS对页面进行美化同时加上一栏导航栏。导航栏中由主页、新闻爬取、关于这三项组成。
其中,“新闻爬取”会打开一个新的网页,页面上有三个按钮用来触发爬虫(分别对应三个网站);“关于”也会打开一个新的网页,其中以文字形式附上相关代码及说明。
那么首先就是要扩充之前构建的服务器所能响应的请求。当然这其实很简单,只需要再多写几个类似的路由处理函数,然后把爬虫代码放在其中就好。最后得到以下超长代码(个人能力有限,觉得没法把三个网站的爬虫合并,只能一一写,所以合起来特别长):
var mysql = require('mysql');
// MySQL connection used by the keyword search route (/process_get).
// NOTE(review): the credentials below are hard-coded for a local development
// database; move them to configuration/environment variables before this
// script is run anywhere else.
var connection = mysql.createConnection({
    host: 'localhost',
    user: 'root',
    password: 'root',
    port: '3306',
    database: 'crawl'
});
// Supply a callback so a failed connection attempt is logged here instead of
// only surfacing later as an error event on the connection object.
connection.connect(function(err) {
    if (err) {
        console.log('[CONNECT ERROR] - ', err.message);
    }
});
var express = require('express');
var app = express();
// Files under ./public are reachable through the /public URL prefix.
app.use('/public', express.static('public'));
// GET /index returns the search page (4.20.html) from the public directory.
app.get('/index', function(request, response) {
    var searchPage = __dirname + "/public/" + "4.20.html";
    response.sendFile(searchPage);
});
// GET /process_get?keyword=... — keyword search over the crawled news.
//
// Looks up rows of `fetches` whose `keywords` column contains the submitted
// keyword and writes them back as JSON (indented with '<br>' so the browser
// shows one field per line).
app.get('/process_get', function(req, res) {
    // Declared locally (the original leaked a global); a missing parameter is
    // treated as an empty keyword, and repeated parameters are flattened to text.
    var keyword = req.query.keyword == null ? '' : String(req.query.keyword);
    // The user-supplied keyword is bound through a placeholder rather than
    // concatenated into the SQL text, which closes the SQL-injection hole.
    var sql = "select url,title,keywords,publish_date,author from fetches where keywords like ?";
    connection.query(sql, ['%' + keyword + '%'], function(err, result) {
        if (err) {
            console.log('[SELECT ERROR] - ', err.message);
            // Always answer the request; otherwise the browser waits forever.
            res.status(500);
            res.setHeader('Content-Type', 'text/html; charset=utf-8');
            res.end('查询出错');
            return;
        }
        res.setHeader('Content-Type', 'text/html; charset=utf-8');
        console.log(result);
        res.end(JSON.stringify(result, null, '<br>'));
    });
});
// A second `/public` static mount and a second `/index` route (serving
// 4.21.html) used to be registered here. Express dispatches handlers in
// registration order and the first `/index` handler always ends the response,
// so this route could never run; the extra static mount only repeated the
// first one. 4.21.html is a file under ./public and therefore remains
// reachable through the existing static mount as /public/4.21.html.
// GET /process_get2 — start a one-shot crawl of 中国新闻网 (chinanews.com).
//
// The seed page is downloaded, every <a> whose resolved URL matches url_reg
// and is not yet stored in the `fetches` table is downloaded in turn, parsed
// and inserted. The HTTP response is written immediately; the crawl keeps
// running asynchronously afterwards.
app.get('/process_get2', function(req, res) {
    var myRequest = require('request');
    var myCheerio = require('cheerio');
    var myIconv = require('iconv-lite');
    require('date-utils'); // Date#toFormat used below is expected to come from this module
    var mysql = require('./mysql.js'); // project query helper; shadows the outer `mysql` inside this handler

    var source_name = "中国新闻网";
    var myEncoding = "utf-8";
    var seedURL = 'http://www.chinanews.com/';
    // Only URLs matching this pattern are treated as news articles.
    var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7}).shtml/;
    // Finds the date inside the extracted publish-time text.
    var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/;

    // Field extractors for an article page. Each takes the cheerio handle of
    // the page. Plain functions replace the selector strings that were
    // previously run through eval().
    var extract = {
        keywords: function($) { return $('meta[name="keywords"]').eq(0).attr("content"); },
        title: function($) { return $('title').text(); },
        date: function($) { return $('#pubtime_baidu').text(); },
        author: function($) { return $('#editor_baidu').text(); },
        content: function($) { return $('.left_zw').text(); },
        desc: function($) { return $('meta[name="description"]').eq(0).attr("content"); },
        source: function($) { return $('#source_baidu').text(); }
    };

    // Browser-like User-Agent so the site is less likely to block the crawler.
    var headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };

    // Download `url`; `encoding: null` makes the body a raw Buffer so that it
    // can be decoded with iconv afterwards.
    function request(url, callback) {
        myRequest({
            url: url,
            encoding: null,
            headers: headers,
            timeout: 10000
        }, callback);
    }

    // Turn an href found on the seed page into an absolute URL.
    function resolveURL(href) {
        var link = href.trim();
        // Already absolute. https:// is accepted too; it used to fall into the
        // "relative" branch and produce a broken URL.
        if (/^https?:\/\//i.test(link)) return link;
        // Protocol-relative link ("//host/path").
        if (link.startsWith('//')) return 'http:' + link;
        // Anything else is appended to the seed URL's directory.
        return seedURL.slice(0, seedURL.lastIndexOf('/') + 1) + link;
    }

    // Reduce extracted date text to "YYYY-MM-DD". When no date can be
    // recognised, today's date is used instead of throwing.
    function normalizeDate(text) {
        var today = (new Date()).toFormat("YYYY-MM-DD");
        var match = regExp.exec(text || "");
        if (!match) return today;
        var parsed = new Date(match[0].replace('年', '-').replace('月', '-').replace('日', ''));
        if (isNaN(parsed.getTime())) return today;
        return parsed.toFormat("YYYY-MM-DD");
    }

    // Read the seed page and schedule every not-yet-stored article URL.
    function seedget() {
        request(seedURL, function(err, response, body) {
            if (err || !body) {
                console.log('读种子页面并转码出错:' + err);
                return;
            }
            var $ = myCheerio.load(myIconv.decode(body, myEncoding), { decodeEntities: true });
            $('a').each(function(i, e) {
                var href = $(e).attr("href");
                if (href == undefined) return;
                var myURL = resolveURL(href);
                if (!url_reg.test(myURL)) return;
                mysql.query('select url from fetches where url=?', [myURL], function(qerr, vals) {
                    if (qerr) {
                        console.log(qerr);
                        return;
                    }
                    if (vals.length > 0) {
                        console.log('URL duplicate!');
                    } else {
                        newsGet(myURL);
                    }
                });
            });
        });
    }

    // Download one article, extract its fields and insert a row into `fetches`.
    function newsGet(myURL) {
        request(myURL, function(err, response, body) {
            if (err || !body) {
                console.log('读新闻页面并转码出错:' + err);
                return;
            }
            var news = {};
            var crawltime = new Date();
            try {
                var $ = myCheerio.load(myIconv.decode(body, myEncoding), { decodeEntities: true });
                console.log("转码读取成功:" + myURL);
                news.keywords = extract.keywords($);
                news.title = extract.title($);
                var rawDate = extract.date($);
                console.log('date: ' + rawDate);
                news.publish_date = normalizeDate(rawDate);
                news.author = extract.author($);
                // Strip the trailing author line from the body text.
                news.content = (extract.content($) || "").replace("\r\n" + news.author, "");
                // source/desc are extracted but not part of the INSERT below.
                news.source = (extract.source($) || "").replace("\r\n", "");
                news.desc = (extract.desc($) || "").replace("\r\n", "");
            } catch (e) {
                // One malformed page must not take the whole server down.
                console.log('读新闻页面并转码出错:' + e);
                return;
            }
            var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
                'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
            var fetchAddSql_Params = [myURL, source_name, myEncoding,
                news.title, news.keywords, news.author, news.publish_date,
                crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), news.content
            ];
            mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        });
    }

    seedget();
    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.end('爬取代码成功');
});
// Another `/public` static mount and `/index` route (serving 4.21.html) used
// to be registered here. Express dispatches handlers in registration order and
// the first `/index` handler always ends the response, so this route could
// never run; the extra static mount only repeated the first one. 4.21.html
// remains reachable through the existing static mount as /public/4.21.html.
// GET /process_get3 — start a one-shot crawl of 网易新闻 (news.163.com).
//
// The seed page is downloaded, every <a> whose resolved URL matches url_reg
// and is not yet stored in the `fetches` table is downloaded in turn, parsed
// and inserted. The HTTP response is written immediately; the crawl keeps
// running asynchronously afterwards.
app.get('/process_get3', function(req, res) {
    var mysql = require('./mysql.js'); // project query helper; shadows the outer `mysql` inside this handler
    var myRequest = require('request');
    var myCheerio = require('cheerio');
    var myIconv = require('iconv-lite');
    require('date-utils'); // Date#toFormat used below is expected to come from this module

    var source_name = "网易新闻";
    var myEncoding = "GBK";
    var seedURL = 'https://news.163.com/';
    // Only URLs matching this pattern are treated as news articles.
    var url_reg = /\/(\d{2})\/(\d{4})\/(\d{2})\/(\w{10,30}).html/;
    // Finds the date inside the extracted publish-time text.
    var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)|(\d{4}\-\d{2}\-\d{2})/;

    // Field extractors for an article page. Each takes the cheerio handle of
    // the page. Plain functions replace the selector strings that were
    // previously run through eval().
    var extract = {
        keywords: function($) { return $('meta[name="keywords"]').eq(0).attr("content"); },
        title: function($) { return $('meta[property="og:title"]').eq(0).attr("content"); },
        // Two page layouts: try .post_time_source first, then #ptime. The
        // original joined the two selector strings with a bitwise `|`, which
        // evaluated to 0 and silently disabled date extraction altogether.
        date: function($) { return $('.post_time_source').text() || $('#ptime').text(); },
        author: function($) { return $('meta[name="author"]').eq(0).attr("content"); },
        content: function($) { return $('#endText').text(); },
        desc: function($) { return $('meta[name="description"]').eq(0).attr("content"); },
        source: function($) { return $('#ne_article_source').text(); }
    };

    // Browser-like User-Agent so the site is less likely to block the crawler.
    var headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };

    // Download `url`; `encoding: null` makes the body a raw Buffer so that it
    // can be decoded with iconv afterwards.
    function request(url, callback) {
        myRequest({
            url: url,
            encoding: null,
            headers: headers,
            timeout: 10000
        }, callback);
    }

    // Turn an href found on the seed page into an absolute URL.
    function resolveURL(href) {
        var link = href.trim();
        // Already absolute (http:// or https://).
        if (/^https?:\/\//i.test(link)) return link;
        // Protocol-relative link ("//host/path").
        if (link.startsWith('//')) return 'https:' + link;
        // Anything else is appended to the seed URL's directory.
        return seedURL.slice(0, seedURL.lastIndexOf('/') + 1) + link;
    }

    // Reduce extracted date text to "YYYY-MM-DD". When no date can be
    // recognised, today's date is used instead of throwing.
    function normalizeDate(text) {
        var today = (new Date()).toFormat("YYYY-MM-DD");
        var match = regExp.exec(text || "");
        if (!match) return today;
        var parsed = new Date(match[0].replace('年', '-').replace('月', '-').replace('日', ''));
        if (isNaN(parsed.getTime())) return today;
        return parsed.toFormat("YYYY-MM-DD");
    }

    // Read the seed page and schedule every not-yet-stored article URL.
    function seedget() {
        request(seedURL, function(err, response, body) {
            if (err || !body) {
                console.log('读种子页面并转码出错:' + err);
                return;
            }
            var $ = myCheerio.load(myIconv.decode(body, myEncoding), { decodeEntities: true });
            $('a').each(function(i, e) {
                var href = $(e).attr("href");
                if (href == undefined) return;
                var myURL = resolveURL(href);
                if (!url_reg.test(myURL)) return;
                mysql.query('select url from fetches where url=?', [myURL], function(qerr, vals) {
                    if (qerr) {
                        console.log(qerr);
                        return;
                    }
                    if (vals.length > 0) {
                        console.log('URL duplicate!');
                    } else {
                        newsGet(myURL);
                    }
                });
            });
        });
    }

    // Download one article, extract its fields and insert a row into `fetches`.
    function newsGet(myURL) {
        request(myURL, function(err, response, body) {
            if (err || !body) {
                console.log('读新闻页面并转码出错:' + err);
                return;
            }
            var news = {};
            var crawltime = new Date();
            try {
                var $ = myCheerio.load(myIconv.decode(body, myEncoding), { decodeEntities: true });
                console.log("转码读取成功:" + myURL);
                news.keywords = extract.keywords($);
                news.title = extract.title($);
                var rawDate = extract.date($);
                console.log('date: ' + rawDate);
                news.publish_date = normalizeDate(rawDate);
                news.author = extract.author($);
                // Strip the trailing author line from the body text.
                news.content = (extract.content($) || "").replace("\r\n" + news.author, "");
                // source/desc are extracted but not part of the INSERT below.
                news.source = (extract.source($) || "").replace("\r\n", "");
                news.desc = (extract.desc($) || "").replace("\r\n", "");
            } catch (e) {
                // One malformed page must not take the whole server down.
                console.log('读新闻页面并转码出错:' + e);
                return;
            }
            var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
                'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
            var fetchAddSql_Params = [myURL, source_name, myEncoding,
                news.title, news.keywords, news.author, news.publish_date,
                crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), news.content
            ];
            mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        });
    }

    seedget();
    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.end('爬取代码成功');
});
// Another `/public` static mount and `/index` route (serving 4.21.html) used
// to be registered here. Express dispatches handlers in registration order and
// the first `/index` handler always ends the response, so this route could
// never run; the extra static mount only repeated the first one. 4.21.html
// remains reachable through the existing static mount as /public/4.21.html.
// GET /process_get4 — start a one-shot crawl of 人民网 (people.com.cn).
//
// The seed page is downloaded, every <a> whose resolved URL matches url_reg
// and is not yet stored in the `fetches` table is downloaded in turn, parsed
// and inserted. The HTTP response is written immediately; the crawl keeps
// running asynchronously afterwards.
app.get('/process_get4', function(req, res) {
    var mysql = require('./mysql.js'); // project query helper; shadows the outer `mysql` inside this handler
    var myRequest = require('request');
    var myCheerio = require('cheerio');
    var myIconv = require('iconv-lite');
    require('date-utils'); // Date#toFormat used below is expected to come from this module

    // Was "网易新闻", copied from the 163.com crawler; this route crawls people.com.cn.
    var source_name = "人民网";
    var myEncoding = "GBK";
    var seedURL = 'http://www.people.com.cn/';
    // Only URLs matching this pattern are treated as news articles.
    var url_reg = /\/(\w{2})\/(\d{4})\/(\d{4})\/(\w{5,10}\-\w{8,15}).html/;
    // Finds the date inside the extracted publish-time text.
    var regExp = /(\d{4}\-\d{2}\-\d{2})/;

    // Field extractors for an article page. Each takes the cheerio handle of
    // the page. Plain functions replace the selector strings that were
    // previously run through eval().
    var extract = {
        keywords: function($) { return $('meta[name="keywords"]').eq(0).attr("content"); },
        title: function($) { return $('meta[property="og:title"]').eq(0).attr("content"); },
        date: function($) { return $('meta[name="publishdate"]').eq(0).attr("content"); },
        author: function($) { return $('meta[name="author"]').eq(0).attr("content"); },
        content: function($) { return $('.box_con').text(); },
        desc: function($) { return $('meta[name="description"]').eq(0).attr("content"); },
        source: function($) { return $('meta[name="source"]').eq(0).attr("content"); }
    };

    // Browser-like User-Agent so the site is less likely to block the crawler.
    var headers = {
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
    };

    // Download `url`; `encoding: null` makes the body a raw Buffer so that it
    // can be decoded with iconv afterwards.
    function request(url, callback) {
        myRequest({
            url: url,
            encoding: null,
            headers: headers,
            timeout: 10000
        }, callback);
    }

    // Turn an href found on the seed page into an absolute URL.
    function resolveURL(href) {
        var link = href.trim();
        // Already absolute (http:// or https://).
        if (/^https?:\/\//i.test(link)) return link;
        // Protocol-relative link ("//host/path").
        if (link.startsWith('//')) return 'https:' + link;
        // Anything else is appended to the seed URL's directory.
        return seedURL.slice(0, seedURL.lastIndexOf('/') + 1) + link;
    }

    // Reduce extracted date text to "YYYY-MM-DD". When no date can be
    // recognised, today's date is used instead of throwing.
    function normalizeDate(text) {
        var today = (new Date()).toFormat("YYYY-MM-DD");
        var match = regExp.exec(text || "");
        if (!match) return today;
        var parsed = new Date(match[0].replace('年', '-').replace('月', '-').replace('日', ''));
        if (isNaN(parsed.getTime())) return today;
        return parsed.toFormat("YYYY-MM-DD");
    }

    // Read the seed page and schedule every not-yet-stored article URL.
    function seedget() {
        request(seedURL, function(err, response, body) {
            if (err || !body) {
                console.log('读种子页面并转码出错:' + err);
                return;
            }
            var $ = myCheerio.load(myIconv.decode(body, myEncoding), { decodeEntities: true });
            $('a').each(function(i, e) {
                var href = $(e).attr("href");
                if (href == undefined) return;
                var myURL = resolveURL(href);
                if (!url_reg.test(myURL)) return;
                mysql.query('select url from fetches where url=?', [myURL], function(qerr, vals) {
                    if (qerr) {
                        console.log(qerr);
                        return;
                    }
                    if (vals.length > 0) {
                        console.log('URL duplicate!');
                    } else {
                        newsGet(myURL);
                    }
                });
            });
        });
    }

    // Download one article, extract its fields and insert a row into `fetches`.
    function newsGet(myURL) {
        request(myURL, function(err, response, body) {
            if (err || !body) {
                console.log('读新闻页面并转码出错:' + err);
                return;
            }
            var news = {};
            var crawltime = new Date();
            try {
                var $ = myCheerio.load(myIconv.decode(body, myEncoding), { decodeEntities: true });
                console.log("转码读取成功:" + myURL);
                news.keywords = extract.keywords($);
                news.title = extract.title($);
                var rawDate = extract.date($);
                console.log('date: ' + rawDate);
                news.publish_date = normalizeDate(rawDate);
                news.author = extract.author($);
                // Strip the trailing author line from the body text.
                news.content = (extract.content($) || "").replace("\r\n" + news.author, "");
                // source/desc are extracted but not part of the INSERT below.
                // The `|| ""` guards cover pages that lack the meta tag.
                news.source = (extract.source($) || "").replace("\r\n", "");
                news.desc = (extract.desc($) || "").replace("\r\n", "");
            } catch (e) {
                // One malformed page must not take the whole server down.
                console.log('读新闻页面并转码出错:' + e);
                return;
            }
            var fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
                'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
            var fetchAddSql_Params = [myURL, source_name, myEncoding,
                news.title, news.keywords, news.author, news.publish_date,
                crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"), news.content
            ];
            mysql.query(fetchAddSql, fetchAddSql_Params, function(qerr) {
                if (qerr) {
                    console.log(qerr);
                }
            });
        });
    }

    seedget();
    res.setHeader('Content-Type', 'text/html; charset=utf-8');
    res.end('爬取代码成功');
});
// Listen on port 8081; once listening, log the address and port reported by
// the server itself.
var server = app.listen(8081, function() {
    var bound = server.address();
    console.log("访问地址为 http://%s:%s", bound.address, bound.port);
});
在把服务器更新完毕后,我们先对4.21.html(也就是“新闻爬取”所打开的新网页)进行大概的测试:先把它简单地做成之前4.20.html的内容复制三遍,也就是:
三个按钮分别对应三个请求。
然后我们用命令行打开服务器:
再对html中的三个按钮一一测试。这里以第一个为例(对应中国新闻网):
点击后在服务器端我们可以看到其实就是相当于运行了我们之前的爬虫代码:
由于在服务器代码中我们用res.end返回了“爬取代码成功”,所以在浏览器上会出现:
这样一来我们测试就完成了。然后就是对4.20.html,4.21.html进行页面上的优化:
首先是4.20.html,也就是主页面(搜索页面):采用了一些CSS样式,并且放上了一张个人觉得挺符合这个网页气质的背景图片,得到如下代码:
<!DOCTYPE html>
<!-- Search home page: navigation bar plus a keyword form that sends a GET
     request to /process_get on the local Node server. -->
<html lang="zh-CN">
<head>
    <!-- The charset is declared before the title so the title is decoded correctly. -->
    <meta charset="utf-8">
    <title>新闻内容爬取关键词查询</title>
    <style type="text/css">
        /* "fixed" is a background-attachment value; as a background-repeat
           value it was invalid and ignored by the browser. */
        body{background-image: url(./back.jpg);background-attachment: fixed;background-repeat: no-repeat;background-size: 150%;}
        /* Keyword text box */
        input.keyword{
            position: relative;
            top: 60px;
            left: 450px;
            z-index: -1;
            padding: 14px 250px;
        }
        /* Search button */
        input.button{
            position: relative;
            top:150px;
            left: 650px;
            z-index: -1;
            padding: 10px 100px;
        }
        /* Horizontal navigation bar */
        ul{
            list-style-type: none;
            margin:0;
            padding: 0;
            overflow: hidden;
            background-color: grey;
        }
        li{float: left;}
        li a{
            display: block;
            color:blue;
            text-align: center;
            padding: 14px 232px;
            text-decoration: none;
        }
        li a:hover{
            background-color: #555;
            color: red;
        }
        section{
            position: relative;
            top: 60px;
            left: 500px;
            z-index: -1;
        }
        section a{
            color: white;
            font-size: 20px;
            font-weight: bold;
        }
        section a:hover{
            color: red;
        }
    </style>
</head>
<body>
    <ul>
        <li>
            <a href="4.20.html">主页</a>
        </li>
        <li><a href="4.21.html">新闻网站</a></li>
        <li><a href="关于.html">关于</a></li>
    </ul>
    <br>
    <form action="http://127.0.0.1:8081/process_get" method="GET">
        <br> <input type="text" name="keyword" class="keyword">
        <br>
        <input type="submit" value="搜索" class="button">
    </form>
</body>
</html>
然后得到优化后的网页页面如下:
然后再对4.21.html也就是新闻爬虫界面进行编写,这里其实就是简单的三个按钮而已:
<!DOCTYPE html>
<!-- Crawl page: three buttons, each sending a GET request to one crawl route
     (/process_get2, /process_get3, /process_get4) on the local Node server. -->
<html lang="zh-CN">
<head>
    <!-- The charset is declared before the title so the title is decoded correctly. -->
    <meta charset="utf-8">
    <title>新闻内容爬取关键词查询</title>
    <style type="text/css">
        /* "fixed" is a background-attachment value; as a background-repeat
           value it was invalid and ignored by the browser. */
        body{background-image: url(./back.jpg);background-attachment: fixed;background-repeat: no-repeat;background-size: 150%;}
        input{
            position: relative;
            left: 600px;
            padding: 20px 100px;
        }
    </style>
</head>
<body>
    <br><br><br>
    <form action="http://127.0.0.1:8081/process_get2" method="GET">
        <input type="submit" value="爬取中国新闻网">
    </form>
    <br>
    <br>
    <form action="http://127.0.0.1:8081/process_get3" method="GET">
        <br>
        <br>
        <input type="submit" value="爬取网易新闻 ">
    </form>
    <br>
    <br>
    <form action="http://127.0.0.1:8081/process_get4" method="GET">
        <br>
        <br>
        <input type="submit" value="爬取人民网 ">
    </form>
</body>
</html>
然后呈现页面如下:
分别点击按钮后的效果在上面测试中已经呈现。
然后关于.html中给出爬虫代码,呈现如下页面:
综上这个项目我个人觉得就算完成的差不多了吧。