There are plenty of web crawler implementations floating around online, but many of them are written in C#, so here is a small contribution for Java.
In its simplest form, a web crawler takes a URL and downloads the content of that page to the back end. The download step boils down to opening an HttpURLConnection and reading the response stream:
public List<String> getUrl(String url) throws IOException {
    System.out.println("Requesting: " + url);
    URL requestUrl = new URL(url);
    // Open the connection and pretend to be a regular browser.
    HttpURLConnection connection = (HttpURLConnection) requestUrl.openConnection();
    connection.setRequestProperty("User-Agent",
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
    connection.connect();
    // Read the response body as UTF-8 text.
    BufferedReader reader = new BufferedReader(
            new InputStreamReader(connection.getInputStream(), "UTF-8"));
    StringBuilder strBuffer = new StringBuilder();
    String line;
    while ((line = reader.readLine()) != null) {
        strBuffer.append(line).append("\n");
    }
    reader.close();
    String content = strBuffer.toString();
    // ...the rest of the method (URL extraction) is shown below
Once the page content has been downloaded by this method, it contains fragments like the following:
<td width="306" rowspan="6"><p align="left">
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=92">蔡梅琳</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=35">陈琼</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=31">董守斌</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=88">董守玲</a>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=38">胡劲松</a><span style=""> </span><br />
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=33">李家春</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=75">李拥军</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=34">彭宏</a>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=26">齐德昱</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=6">王家兵</a><span style=""> </span><br >
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=45">许勇</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=30">张凌</a>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=36">张齐</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=39">赵跃龙</a><span style=""> </span>
<a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=32">周杰</a></p></td>
</tr>
The URLs we care about are then pulled out of that content with a regular expression:
// Extract the teacher-detail URLs from the downloaded content.
Pattern pattern = Pattern.compile(
        "http://202.38.194.240:8080/CSWeb/showTeacher.html\\?id=[0-9]{1,3}");
Matcher matcher = pattern.matcher(content);
while (matcher.find()) {
    int start = matcher.start();
    int end = matcher.end();
    String href = content.substring(start, end);
    System.out.println("url: " + href);
    urlList.add(href);
}
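To verify the pattern in isolation, here is a minimal self-contained check run against one of the anchor tags shown above; only java.util.regex from the JDK is used, and the class name UrlRegexDemo is just for this example.

public class UrlRegexDemo {
    public static void main(String[] args) {
        // One anchor tag copied from the downloaded fragment above.
        String content = "<a title=\"点击导师姓名查看该导师详细简介\" target=\"_blank\" "
                + "href=\"http://202.38.194.240:8080/CSWeb/showTeacher.html?id=92\">蔡梅琳</a>";
        java.util.regex.Pattern pattern = java.util.regex.Pattern.compile(
                "http://202.38.194.240:8080/CSWeb/showTeacher.html\\?id=[0-9]{1,3}");
        java.util.regex.Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            // group() returns the matched text, equivalent to substring(start, end).
            System.out.println("url: " + matcher.group());
        }
        // Prints: url: http://202.38.194.240:8080/CSWeb/showTeacher.html?id=92
    }
}

Matcher.group() returns exactly the text between start() and end(), so it can stand in for the substring call used in the loop above.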
Each extracted URL is then requested in turn, and the information you need is pulled out of the returned page, again with a regular expression:
Pattern pattern = Pattern.compile("研究方向.*。?");
Matcher matcher = pattern.matcher(content);
String info = "";
while (matcher.find()) {
count++;
int start =matcher.start() ;
int end = matcher.end();
info = content.substring(start, end);
System.out.println("info"+info);
infoList.add(info);
}
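Because `.` does not cross line breaks by default, this pattern grabs everything from "研究方向" to the end of that line, including any trailing markup. A quick self-contained illustration (the sample string here is hypothetical, not taken from the real site):

import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class InfoRegexDemo {
    public static void main(String[] args) {
        // Hypothetical fragment of a teacher page; the real pages contain similar text.
        String content = "<p>研究方向：信息检索、数据挖掘。</p>\n<p>其他内容</p>";
        Pattern pattern = Pattern.compile("研究方向.*。?");
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            System.out.println("info: " + matcher.group());
        }
        // Prints: info: 研究方向：信息检索、数据挖掘。</p>
        // Note the trailing </p>: the greedy .* runs to the end of the line.
    }
}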
By repeating these two steps you can keep following URLs deeper into the site, level by level, extracting the pieces of information you need with a regular expression at each level; a sketch of how that loop can be generalized appears after the complete class below. Here is the full NetSpider class:
package com;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class NetSpider {

    private List<String> urlList = new ArrayList<String>(); // collected URLs

    public static void main(String[] args) throws IOException {
        NetSpider netSpider = new NetSpider();
        netSpider.urlList = netSpider.getUrl("http://202.38.194.240:8080/CSWeb/tmaster.html");
        List<String> list = netSpider.getInfo(netSpider.urlList);
        System.out.println("size: " + list.size());
    }

    // Download the list page and collect the teacher-detail URLs it contains.
    public List<String> getUrl(String url) throws IOException {
        System.out.println("Requesting: " + url);
        URL requestUrl = new URL(url);
        // Open the connection and pretend to be a regular browser.
        HttpURLConnection connection = (HttpURLConnection) requestUrl.openConnection();
        connection.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        connection.connect();
        // Read the response body as UTF-8 text.
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"));
        StringBuilder strBuffer = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            strBuffer.append(line).append("\n");
        }
        reader.close();
        String content = strBuffer.toString();
        System.out.println("content: " + content);

        // Extract the URLs that match the teacher-detail pattern.
        Pattern pattern = Pattern.compile(
                "http://202.38.194.240:8080/CSWeb/showTeacher.html\\?id=[0-9]{1,3}");
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            int start = matcher.start();
            int end = matcher.end();
            String href = content.substring(start, end);
            System.out.println("url: " + href);
            urlList.add(href);
        }
        return urlList;
    }

    // Visit each collected URL and extract the information we want.
    public List<String> getInfo(List<String> urlList) throws IOException {
        List<String> infoList = new ArrayList<String>();
        int count = 0;
        for (int i = 0; i < urlList.size(); i++) {
            String url = urlList.get(i);
            System.out.println("Requesting: " + url);
            URL requestUrl = new URL(url);
            // Open the connection and pretend to be a regular browser.
            HttpURLConnection connection = (HttpURLConnection) requestUrl.openConnection();
            connection.setRequestProperty("User-Agent",
                    "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
            connection.connect();
            // Read the response body as UTF-8 text.
            BufferedReader reader = new BufferedReader(
                    new InputStreamReader(connection.getInputStream(), "UTF-8"));
            StringBuilder strBuffer = new StringBuilder();
            String line;
            while ((line = reader.readLine()) != null) {
                strBuffer.append(line).append("\n");
            }
            reader.close();
            String content = strBuffer.toString();

            // Grab the "研究方向" (research interests) line of each page.
            Pattern pattern = Pattern.compile("研究方向.*。?");
            Matcher matcher = pattern.matcher(content);
            String info = "";
            while (matcher.find()) {
                count++;
                int start = matcher.start();
                int end = matcher.end();
                info = content.substring(start, end);
                System.out.println("info: " + info);
                infoList.add(info);
            }
        }
        return infoList;
    }
}
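To make the "level by level" idea concrete, here is a rough sketch of how the same two building blocks, downloading a page and regexing out its links, could drive an iterative crawl with a queue of pending URLs and a visited set. The class name SimpleCrawler and the broad LINK_PATTERN are assumptions made for this sketch, not part of the original code; for the teacher pages you would reuse the showTeacher.html pattern from NetSpider.

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.HttpURLConnection;
import java.net.URL;
import java.util.ArrayDeque;
import java.util.Deque;
import java.util.HashSet;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

public class SimpleCrawler {

    // Hypothetical link pattern: any CSWeb URL quoted in the page source.
    private static final Pattern LINK_PATTERN =
            Pattern.compile("http://202\\.38\\.194\\.240:8080/CSWeb/[^\"<>\\s]+");

    public static void main(String[] args) throws IOException {
        Deque<String> queue = new ArrayDeque<String>(); // URLs waiting to be visited
        Set<String> visited = new HashSet<String>();    // URLs already fetched
        queue.add("http://202.38.194.240:8080/CSWeb/tmaster.html");

        int maxPages = 50; // safety limit so the crawl always terminates
        while (!queue.isEmpty() && visited.size() < maxPages) {
            String url = queue.poll();
            if (!visited.add(url)) {
                continue; // already crawled this one
            }
            String content = fetch(url);
            // Queue every link found on this page; together they form the next level.
            Matcher matcher = LINK_PATTERN.matcher(content);
            while (matcher.find()) {
                String next = matcher.group();
                if (!visited.contains(next)) {
                    queue.add(next);
                }
            }
            // ...extract whatever information you need from `content` here,
            // e.g. with the 研究方向 pattern used in NetSpider.getInfo...
        }
        System.out.println("Crawled " + visited.size() + " pages.");
    }

    // Same download logic as in NetSpider, packaged as a helper.
    private static String fetch(String url) throws IOException {
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        connection.connect();
        BufferedReader reader = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), "UTF-8"));
        StringBuilder sb = new StringBuilder();
        String line;
        while ((line = reader.readLine()) != null) {
            sb.append(line).append("\n");
        }
        reader.close();
        return sb.toString();
    }
}

Using the Deque as a FIFO queue gives a breadth-first crawl; in practice you would also add per-page error handling and a short delay between requests so the target server is not hammered.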