package com.test.pic.crawler;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import java.util.concurrent.BlockingQueue;
import java.util.concurrent.LinkedBlockingQueue;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.ScheduledThreadPoolExecutor;
import java.util.concurrent.TimeUnit;

import org.apache.commons.lang3.concurrent.BasicThreadFactory;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * @Title: PicCrawler.java
 *
 * @Package com.test.pic.crawler
 *
 * @Description: Crawls the images under the specified tags of a target site,
 *               or under all tags if no tag filter is configured
 *
 * @author CoderZZ
 *
 * @date 2018-01-12 23:22:41
 *
 * @version V1.0
 */
public class PicCrawler implements Runnable {
    private static String pathString = "G:/test/pic/"; // root folder for downloaded images
    // queue of the detail-page URLs that actually get crawled
    static BlockingQueue<String> urlBlockingQueue = new LinkedBlockingQueue<String>(1000);
    static int threadNum = 10;

    /**
     * @Title: main
     *
     * @Description: Collects the tag pages, enqueues every image detail page,
     *               then starts the download worker threads
     *
     * @param args command-line arguments (unused)
     *
     * @return void
     */
    public static void main(String[] args) {
        String homeurlString = "https://www.xxxx.com"; // base address of the site to crawl
        String tagPageUrl = "https://www.xxxx.com/tag/"; // tag index page
        // full URLs of the tag pages
        Set<String> tagFullHrefSet = new HashSet<String>(16);
        // tags to crawl; if the array is empty, every tag is crawled
        String[] crawlerTagArray = {"风景"}; // "风景" = "landscape"
        List<String> crawlerTagList = Arrays.asList(crawlerTagArray);
        try {
            // 1. Resolve the full URL of every tag we want to crawl
            Document tagListDocument = Jsoup.connect(tagPageUrl).get();
            Elements tagsListDivElements = tagListDocument.getElementsByClass("tags_list");
            for (Element element : tagsListDivElements) {
                Elements aElements = element.getElementsByTag("a");
                for (Element a : aElements) {
                    if (crawlerTagList.isEmpty() || crawlerTagList.contains(a.text())) {
                        String tagUrlString = homeurlString + a.attr("href");
                        // e.g. https://www.xxxx.com/tag/fengjing.html -> https://www.xxxx.com/tag/fengjing/1.html
                        tagUrlString = tagUrlString.substring(0, tagUrlString.lastIndexOf(".")) + "/1.html";
                        tagFullHrefSet.add(tagUrlString);
                    }
                }
            }
            // 2. Walk each tag's listing pages and enqueue every image detail page
            for (String tagUrl : tagFullHrefSet) {
                String tempTagUrlString = tagUrl;
                int currentPageNum = 1;
                while (true) {
                    try {
                        Document imagePageDocument = Jsoup.connect(tempTagUrlString).get();
                        Elements imageListElements = imagePageDocument.getElementsByClass("Pli-litpic");
                        if (imageListElements.isEmpty()) {
                            break; // no thumbnails on this page: past the last page
                        }
                        for (Element image : imageListElements) {
                            urlBlockingQueue.offer(homeurlString + image.attr("href"));
                        }
                        // advance to the next page, e.g. https://www.xxxx.com/tag/fengjing/2.html
                        tempTagUrlString = tempTagUrlString.substring(0, tempTagUrlString.lastIndexOf("/") + 1) + (++currentPageNum) + ".html";
                    } catch (Exception e) {
                        break; // request failed or page missing: stop paging this tag
                    }
                }
            }
            ScheduledExecutorService executor = new ScheduledThreadPoolExecutor(threadNum,
                    new BasicThreadFactory.Builder().namingPattern("my-crawler-thread-%d").daemon(false).build());
            for (int i = 0; i < threadNum; i++) {
                executor.submit(new PicCrawler());
            }
            // let the workers drain the queue, then allow the pool threads to exit
            executor.shutdown();
        } catch (IOException e) {
            e.printStackTrace();
        }
    }
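
    /**
     * A minimal alternative sketch (not part of the original crawler): Jsoup can
     * fetch binary bodies itself, which would replace the raw URLConnection
     * handling in run(). The method name downloadWithJsoup is illustrative only.
     */
    private static void downloadWithJsoup(String imgSrc, File targetFile) throws IOException {
        // ignoreContentType(true) lets Jsoup return non-HTML bodies such as images;
        // maxBodySize(0) lifts the default cap on response size
        byte[] bytes = Jsoup.connect(imgSrc).ignoreContentType(true).maxBodySize(0).execute().bodyAsBytes();
        try (OutputStream os = new FileOutputStream(targetFile)) {
            os.write(bytes);
        }
    }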
    @Override
    public void run() {
        while (true) {
            try {
                long begin = System.currentTimeMillis();
                // wait up to 5s for work; a null result means the queue stayed empty
                String url = urlBlockingQueue.poll(5, TimeUnit.SECONDS);
                if (null != url) {
                    Document doc = Jsoup.connect(url).get();
                    Elements titleElements = doc.select("#photos > h1");
                    if (!titleElements.isEmpty()) {
                        Set<String> imgSrcSet = new HashSet<String>(16);
                        Element titleElement = titleElements.get(0);
                        // the gallery title looks like "name(currentPage/totalPages)"
                        String foldNameString = titleElement.text();
                        String[] nameArray = foldNameString.split("\\(");
                        foldNameString = nameArray[0];
                        nameArray = nameArray[1].split("/");
                        int totalPages = Integer.parseInt(nameArray[1].replace(")", ""));
                        for (int i = 1; i <= totalPages; i++) {
                            // sub-pages of a gallery: xxx.html -> xxx_1.html, xxx_2.html, ...
                            String urlTemp = url.replace(".html", "_" + i + ".html");
                            Document docTemp = Jsoup.connect(urlTemp).get();
                            Element element = docTemp.getElementById("big-pic");
                            if (null == element) {
                                continue; // layout changed or sub-page missing
                            }
                            Elements imgElements = element.getElementsByTag("img");
                            for (Element imgElement : imgElements) {
                                imgSrcSet.add(imgElement.attr("src"));
                            }
                        }
                        if (!imgSrcSet.isEmpty()) {
                            // create the gallery folder once, using File(parent, child)
                            // so path separators stay consistent
                            File sf = new File(pathString, foldNameString);
                            if (!sf.exists()) {
                                sf.mkdirs();
                            }
                            for (String imgSrc : imgSrcSet) {
                                URL imgurl = new URL(imgSrc);
                                URLConnection con = imgurl.openConnection();
                                // 10s connect and read timeouts
                                con.setConnectTimeout(10 * 1000);
                                con.setReadTimeout(10 * 1000);
                                // file name = last path segment of the image URL
                                String filename = imgSrc.substring(imgSrc.lastIndexOf("/") + 1);
                                // copy through a 500 KB buffer; try-with-resources closes
                                // both streams even when the copy fails midway
                                try (InputStream is = con.getInputStream();
                                     OutputStream os = new FileOutputStream(new File(sf, filename))) {
                                    byte[] bs = new byte[1024 * 500];
                                    int len;
                                    while ((len = is.read(bs)) != -1) {
                                        os.write(bs, 0, len);
                                    }
                                }
                                System.out.println(imgSrc + " downloaded!");
                            }
                        }
                        long end = System.currentTimeMillis();
                        System.out.println("================================================================");
                        System.out.println(Thread.currentThread().getName() + " ****************** all images downloaded, took " + ((end - begin) / 1000) + "s");
                    }
                } else {
                    System.out.println("======================== BlockingQueue is empty, crawl finished! =======================");
                    break; // queue drained: let this worker thread exit
                }
            } catch (Exception e) {
                System.out.println("======================== crawl error =======================");
                e.printStackTrace();
            }
        }
    }
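
    /**
     * A sturdier way to read the page count out of a gallery title such as
     * "xxxx(1/20)" (a sketch; the regex encodes the same "(current/total)"
     * title format that the split() logic in run() assumes).
     */
    private static int parseTotalPages(String title) {
        java.util.regex.Matcher m = java.util.regex.Pattern.compile("\\((\\d+)/(\\d+)\\)").matcher(title);
        return m.find() ? Integer.parseInt(m.group(2)) : 1;
    }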
}
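
/*
 * Usage sketch (jar names are assumptions; any recent jsoup and commons-lang3 work):
 *   javac -cp jsoup-1.11.3.jar:commons-lang3-3.7.jar com/test/pic/crawler/PicCrawler.java
 *   java  -cp .:jsoup-1.11.3.jar:commons-lang3-3.7.jar com.test.pic.crawler.PicCrawler
 */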