Java 爬取网页图片并保存到本地

原创

wx645606081424e 2023-05-06 18:15:40 ©著作权

文章标签 java json 开发语言 List html 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者wx645606081424e的原创作品，请联系作者获取转载授权，否则将追究法律责任

一、前言

如何用 Java 爬取网页上的图片并保存到本地呢？

二、看代码

package com.expt.ares.web;

import com.alibaba.fastjson2.JSON;
import com.expt.ares.vo.GetImgVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;

import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

@RestController
@RequestMapping("/img")
@Slf4j
public class ImgController {

    // 获取img标签正则
    private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
    // 获取src路径的正则
//    private static final String IMGSRC_REG = "[a-zA-z]+://[^\\s]*";
    private static final String IMGSRC_REG = "/uploadfile[^\\\\s]*.jpg";


    /**
     * 单条抓取
     * @param getImgVO
     * @throws Exception
     *
     * eg：
     *
     * {
     *     "url":"https://www.xiurenb.vip/XiaoYu/11486#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/1/"
     * }
     */
    @PostMapping("/getImg")
    public void getImg(@RequestBody GetImgVO getImgVO) throws Exception {
        String url = getImgVO.getUrl();
        String path = getImgVO.getDownloadPath();
        String mUrl = new String();
        int count = 1;
        for (int i = 0; i < 30; i++) {
            mUrl = url;
            if (i == 0){
                mUrl = mUrl.replaceAll("#x#","");
            }else {
                mUrl = mUrl.replaceAll("#x#","_"+i);
            }
            log.info(mUrl);
            String html = getHtml(mUrl);
            log.info(html);
            List<String> imageUrl = getImageUrl(html);
            log.info(JSON.toJSONString(imageUrl));

            List<String> imageSrc = getImageSrc(imageUrl);
            log.info(JSON.toJSONString(imageSrc));

            count = download(imageSrc,path,count);
            mUrl = new String();
        }
    }


    /**
     * 批量抓取
     * @param getImgVOList
     * @throws Exception
     *
     * eg:
     *
     * [
     *     {
     *     "url":"https://www.xiurenb.vip/XiaoYu/11526#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/7/"
     *     },
     *     {
     *     "url":"https://www.xiurenb.vip/XiuRen/11808#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/4/"
     *     },
     *     {
     *     "url":"https://www.xiurenb.vip/XiaoYu/11775#x#.html",
     *     "downloadPath":"/Users/wanglei/Documents/imgs/ycc/6/"
     *     },
     * ]
     *
     *
     */
    @PostMapping("/getImgs")
    public void getImgs(@RequestBody List<GetImgVO> getImgVOList) throws Exception {
        for (GetImgVO vo : getImgVOList){
            getImg(vo);
        }
    }


    //获取HTML内容
    private String getHtml(String url) throws Exception {
        URL url1 = new URL(url);//使用java.net.URL
        URLConnection connection = url1.openConnection();//打开链接
        InputStream in = connection.getInputStream();//获取输入流
        InputStreamReader isr = new InputStreamReader(in);//流的包装
        BufferedReader br = new BufferedReader(isr);

        String line;
        StringBuffer sb = new StringBuffer();
        while ((line = br.readLine()) != null) {//整行读取
            sb.append(line, 0, line.length());//添加到StringBuffer中
            sb.append('\n');//添加换行符
        }
        //关闭各种流，先声明的后关闭
        br.close();
        isr.close();
        in.close();
        return sb.toString();
    }

    //获取ImageUrl地址
    private List<String> getImageUrl(String html) {
        Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html);
        List<String> listimgurl = new ArrayList<String>();
        while (matcher.find()) {
            listimgurl.add(matcher.group());
        }
        return listimgurl;
    }

    //获取ImageSrc地址
    private List<String> getImageSrc(List<String> listimageurl) {
        List<String> listImageSrc = new ArrayList<String>();
        for (String image : listimageurl) {
            Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
            while (matcher.find()) {
                listImageSrc.add("https://p.xiurenb.top/" + matcher.group().substring(0, matcher.group().length()));
            }
        }
        return listImageSrc;
    }

    //下载图片
    private int download(List<String> listImgSrc,String path,int count) {
        try {
            //开始时间
            Date begindate = new Date();
            for (String url : listImgSrc) {
                //开始时间
                Date begindate2 = new Date();
                String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
                URL uri = new URL(url);
                InputStream in = uri.openStream();
                File file = new File(path);
                if (!file.exists() && !file.isDirectory()){
                    file.mkdirs();
                }
                FileOutputStream fo = new FileOutputStream(new File(path + count + ".jpg"));//文件输出流
                byte[] buf = new byte[1024];
                int length = 0;
                log.info("开始下载:" + url);
                while ((length = in.read(buf, 0, buf.length)) != -1) {
                    fo.write(buf, 0, length);
                }
                //关闭流
                in.close();
                fo.close();
                log.info(imageName +"____"+ count + "_____"+ "下载完成");
                count = count + 1;
                //结束时间
                Date overdate2 = new Date();
                double time = overdate2.getTime() - begindate2.getTime();
                log.info("耗时：" + time / 1000 + "s");
            }
            Date overdate = new Date();
            double time = overdate.getTime() - begindate.getTime();
            log.info("总耗时：" + time / 1000 + "s");

        } catch (Exception e) {
            log.error("下载失败",e);
        }
        return count;
    }
}