一、前言
如何用 Java 实现抓取网页中的图片呢？
二、看代码
package com.expt.ares.web;
import com.alibaba.fastjson2.JSON;
import com.expt.ares.vo.GetImgVO;
import lombok.extern.slf4j.Slf4j;
import org.springframework.web.bind.annotation.PostMapping;
import org.springframework.web.bind.annotation.RequestBody;
import org.springframework.web.bind.annotation.RequestMapping;
import org.springframework.web.bind.annotation.RestController;
import java.io.*;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
@RestController
@RequestMapping("/img")
@Slf4j
public class ImgController {
// 获取img标签正则
private static final String IMGURL_REG = "<img.*src=(.*?)[^>]*?>";
// 获取src路径的正则
// private static final String IMGSRC_REG = "[a-zA-z]+://[^\\s]*";
private static final String IMGSRC_REG = "/uploadfile[^\\\\s]*.jpg";
/**
* 单条抓取
* @param getImgVO
* @throws Exception
*
* eg:
*
* {
* "url":"https://www.xiurenb.vip/XiaoYu/11486#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/1/"
* }
*/
@PostMapping("/getImg")
public void getImg(@RequestBody GetImgVO getImgVO) throws Exception {
String url = getImgVO.getUrl();
String path = getImgVO.getDownloadPath();
String mUrl = new String();
int count = 1;
for (int i = 0; i < 30; i++) {
mUrl = url;
if (i == 0){
mUrl = mUrl.replaceAll("#x#","");
}else {
mUrl = mUrl.replaceAll("#x#","_"+i);
}
log.info(mUrl);
String html = getHtml(mUrl);
log.info(html);
List<String> imageUrl = getImageUrl(html);
log.info(JSON.toJSONString(imageUrl));
List<String> imageSrc = getImageSrc(imageUrl);
log.info(JSON.toJSONString(imageSrc));
count = download(imageSrc,path,count);
mUrl = new String();
}
}
/**
* 批量抓取
* @param getImgVOList
* @throws Exception
*
* eg:
*
* [
* {
* "url":"https://www.xiurenb.vip/XiaoYu/11526#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/7/"
* },
* {
* "url":"https://www.xiurenb.vip/XiuRen/11808#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/4/"
* },
* {
* "url":"https://www.xiurenb.vip/XiaoYu/11775#x#.html",
* "downloadPath":"/Users/wanglei/Documents/imgs/ycc/6/"
* },
* ]
*
*
*/
@PostMapping("/getImgs")
public void getImgs(@RequestBody List<GetImgVO> getImgVOList) throws Exception {
for (GetImgVO vo : getImgVOList){
getImg(vo);
}
}
//获取HTML内容
private String getHtml(String url) throws Exception {
URL url1 = new URL(url);//使用java.net.URL
URLConnection connection = url1.openConnection();//打开链接
InputStream in = connection.getInputStream();//获取输入流
InputStreamReader isr = new InputStreamReader(in);//流的包装
BufferedReader br = new BufferedReader(isr);
String line;
StringBuffer sb = new StringBuffer();
while ((line = br.readLine()) != null) {//整行读取
sb.append(line, 0, line.length());//添加到StringBuffer中
sb.append('\n');//添加换行符
}
//关闭各种流,先声明的后关闭
br.close();
isr.close();
in.close();
return sb.toString();
}
//获取ImageUrl地址
private List<String> getImageUrl(String html) {
Matcher matcher = Pattern.compile(IMGURL_REG).matcher(html);
List<String> listimgurl = new ArrayList<String>();
while (matcher.find()) {
listimgurl.add(matcher.group());
}
return listimgurl;
}
//获取ImageSrc地址
private List<String> getImageSrc(List<String> listimageurl) {
List<String> listImageSrc = new ArrayList<String>();
for (String image : listimageurl) {
Matcher matcher = Pattern.compile(IMGSRC_REG).matcher(image);
while (matcher.find()) {
listImageSrc.add("https://p.xiurenb.top/" + matcher.group().substring(0, matcher.group().length()));
}
}
return listImageSrc;
}
//下载图片
private int download(List<String> listImgSrc,String path,int count) {
try {
//开始时间
Date begindate = new Date();
for (String url : listImgSrc) {
//开始时间
Date begindate2 = new Date();
String imageName = url.substring(url.lastIndexOf("/") + 1, url.length());
URL uri = new URL(url);
InputStream in = uri.openStream();
File file = new File(path);
if (!file.exists() && !file.isDirectory()){
file.mkdirs();
}
FileOutputStream fo = new FileOutputStream(new File(path + count + ".jpg"));//文件输出流
byte[] buf = new byte[1024];
int length = 0;
log.info("开始下载:" + url);
while ((length = in.read(buf, 0, buf.length)) != -1) {
fo.write(buf, 0, length);
}
//关闭流
in.close();
fo.close();
log.info(imageName +"____"+ count + "_____"+ "下载完成");
count = count + 1;
//结束时间
Date overdate2 = new Date();
double time = overdate2.getTime() - begindate2.getTime();
log.info("耗时:" + time / 1000 + "s");
}
Date overdate = new Date();
double time = overdate.getTime() - begindate.getTime();
log.info("总耗时:" + time / 1000 + "s");
} catch (Exception e) {
log.error("下载失败",e);
}
return count;
}
}
三、看结果