网络爬虫,网上有很多实现方法。但是其中很多都是用C#写的,我就为Java贡献点资源吧。

  网络爬虫最简单的理解就是根据url,把url页面的内容下载到后台。

  

public List getUrl(String url) throws IOException{
        System.out.println("请求地址为:" + url);
        URL requestUrl = new URL(url);
        // 打开链接
        HttpURLConnection connection = (HttpURLConnection) requestUrl
                .openConnection();
        connection.setRequestProperty("User-Agent",
                "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        connection.connect();
        
        InputStream is = connection.getInputStream();
        String content;

通过该方法把页面上的内容下载之后,得到的HTML片段如下:

<td width="306" rowspan="6"><p align="left">  
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=92">蔡梅琳</a><span style="">  </span>
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=35">陈琼</a><span style="">  </span>
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=31">董守斌</a><span style="">  </span>
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=88">董守玲</a>
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=38">胡劲松</a><span style="">  </span><br /> 
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=33">李家春</a><span style="">  </span>
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=75">李拥军</a><span style="">  </span>
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=34">彭宏</a>
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=26">齐德昱</a><span style="">  </span>
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=6">王家兵</a><span style="">  </span><br > 
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=45">许勇</a><span style="">  </span>
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=30">张凌</a>
    <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=36">张齐</a><span style="">  </span>
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=39">赵跃龙</a><span style="">  </span>
  <a title="点击导师姓名查看该导师详细简介" target="_blank" href="http://202.38.194.240:8080/CSWeb/showTeacher.html?id=32">周杰</a></p></td>
    </tr>

再通过正则表达式把需要的url提取出来。

java爬虫工具抓取视频教程 java爬虫代码示例_html

java爬虫工具抓取视频教程 java爬虫代码示例_System_02

View Code

//从中抓取符合条件的url
        Pattern pattern = Pattern.compile("http://202.38.194.240:8080/CSWeb/showTeacher.html\\?id=[0-9]{1,3}");
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            int start =matcher.start() ;
            int end = matcher.end();
            String href = content.substring(start, end);
            System.out.println("url:"+href);
            u

之后再一次访问你所要的页面,再把页面上你需要的信息通过正则表达式提取出来。

java爬虫工具抓取视频教程 java爬虫代码示例_html

java爬虫工具抓取视频教程 java爬虫代码示例_System_02

View Code

// Excerpt: pull every "research direction" line out of a downloaded page.
// NOTE(review): `content`, `count` and `infoList` are declared outside this
// excerpt; presumably the page text, a match counter and the result list.
// The pattern matches the literal "研究方向" followed by the rest of that line
// ('.' does not match line terminators by default). The trailing "。?" is
// optional and, because ".*" is greedy, never changes what is matched.
Pattern pattern = Pattern.compile("研究方向.*。?");
            Matcher matcher = pattern.matcher(content);
            String info = "";
            while (matcher.find()) {
                count++;
                // start/end delimit the current match inside content.
                int start =matcher.start() ;
                int end = matcher.end();
                info = content.substring(start, end);
                System.out.println("info"+info);
                infoList.add(info);
            }

这样的话就可以层层通过正则表达式不断地在该网站上提取url,之后把需要的信息通过正则表达式取下来。

java爬虫工具抓取视频教程 java爬虫代码示例_html

java爬虫工具抓取视频教程 java爬虫代码示例_System_02

View Code

package com;

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Minimal two-level crawler: downloads an index page, extracts the teacher
 * detail-page URLs from it with a regular expression, then downloads each
 * detail page and extracts the "research direction" lines.
 *
 * <p>Instances are not thread-safe: {@link #getUrl(String)} mutates the
 * instance's URL list.
 */
public class NetSpider {
    /** Browser-like User-Agent header sent with every request. */
    private static final String USER_AGENT =
            "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)";

    /** Matches the absolute URL of a teacher detail page (id of 1-3 digits). */
    private static final Pattern TEACHER_URL_PATTERN = Pattern.compile(
            "http://202.38.194.240:8080/CSWeb/showTeacher.html\\?id=[0-9]{1,3}");

    /** Matches "研究方向" and the remainder of the line it appears on. */
    private static final Pattern RESEARCH_INFO_PATTERN = Pattern.compile("研究方向.*。?");

    /** URLs collected so far by {@link #getUrl(String)}, in discovery order. */
    private final List<String> urlList = new ArrayList<String>();

    public static void main(String args[]) throws IOException {
        NetSpider netSpider = new NetSpider();
        List<String> urls = netSpider.getUrl("http://202.38.194.240:8080/CSWeb/tmaster.html");
        List<String> list = netSpider.getInfo(urls);
        System.out.println("size:" + list.size());
    }

    /**
     * Downloads {@code url} and collects every teacher detail-page URL found
     * in the page body.
     *
     * @param url address of the index page to download
     * @return this instance's URL list, with the newly found URLs appended;
     *         repeated calls accumulate into the same list
     * @throws IOException if the page cannot be fetched or read
     */
    public List<String> getUrl(String url) throws IOException {
        String content = download(url);
        System.out.println("content" + content);
        // Extract the URLs that match the teacher detail-page pattern.
        for (String href : findAll(TEACHER_URL_PATTERN, content)) {
            System.out.println("url:" + href);
            urlList.add(href);
        }
        return urlList;
    }

    /**
     * Downloads each page in {@code urlList} and collects the "research
     * direction" lines found in them.
     *
     * @param urlList addresses of the pages to download, visited in order
     * @return the matched lines from all pages, in the order encountered
     * @throws IOException if any page cannot be fetched or read
     */
    public List<String> getInfo(List<String> urlList) throws IOException {
        List<String> infoList = new ArrayList<String>();
        for (int i = 0; i < urlList.size(); i++) {
            String content = download(urlList.get(i));
            for (String info : findAll(RESEARCH_INFO_PATTERN, content)) {
                System.out.println("info" + info);
                infoList.add(info);
            }
        }
        return infoList;
    }

    /** Fetches {@code url} over HTTP and returns the response body decoded as UTF-8. */
    private static String download(String url) throws IOException {
        System.out.println("请求地址为:" + url);
        HttpURLConnection connection = (HttpURLConnection) new URL(url).openConnection();
        connection.setRequestProperty("User-Agent", USER_AGENT);
        connection.connect();

        InputStream is = connection.getInputStream();
        try {
            return readAll(is);
        } finally {
            // Close even when reading fails so the connection is not leaked.
            is.close();
        }
    }

    /** Reads {@code in} to end-of-stream and decodes all of the bytes as UTF-8. */
    private static String readAll(InputStream in) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        byte[] chunk = new byte[8192];
        int read;
        // read(byte[]) reports how many bytes were actually delivered; copy only those.
        while ((read = in.read(chunk)) != -1) {
            bytes.write(chunk, 0, read);
        }
        // Decode once at the end so multi-byte characters spanning two chunks stay intact.
        return bytes.toString("UTF-8");
    }

    /** Returns every substring of {@code content} matched by {@code pattern}, in order. */
    private static List<String> findAll(Pattern pattern, String content) {
        List<String> matches = new ArrayList<String>();
        Matcher matcher = pattern.matcher(content);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}