【起点阅读】java小说爬虫
写一个可以在起点网站爬小说的爬虫
缺点就是 VIP 章节无法完整地爬取。
废话不多说,上代码了。pom.xml 完整的各种包引用:
<parent>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-parent</artifactId>
<version>2.0.5.RELEASE</version>
<relativePath/> <!-- lookup parent from repository -->
</parent>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
<project.reporting.outputEncoding>UTF-8</project.reporting.outputEncoding>
<java.version>1.8</java.version>
</properties>
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
<dependency>
<groupId>org.springframework.boot</groupId>
<artifactId>spring-boot-starter-test</artifactId>
<scope>test</scope>
</dependency>
</dependencies>
主要用到的就是下面这些 jar。
因为我准备写成 web 应用,所以加了 Spring Boot;
不加也能用:
<dependencies>
<dependency>
<groupId>org.apache.httpcomponents</groupId>
<artifactId>httpclient</artifactId>
<version>4.5.2</version>
</dependency>
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
<dependency>
<groupId>org.projectlombok</groupId>
<artifactId>lombok</artifactId>
<optional>true</optional>
</dependency>
</dependencies>
java代码
import lombok.Data;
import org.jsoup.nodes.Document;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.ReentrantReadWriteLock;
/*
 * Shared crawl context: one instance carries the start URL, the shared
 * queues and locks, the completion latch and the CSS-selector rules that
 * all crawler worker threads operate on.
 * @author VAIE
 * @date: 2018/11/2-21:47
 * @version v1.0
 */
@Data
public class NovelAssist {
String url;//start URL of the crawl
Queue<Map<String,List<Document>>> directoryQueue;//queue of chapter documents keyed by novel title
Queue<List<Map<String,String>>> NovelList;//queue of novel lists (each entry maps "title"/"url"); NOTE(review): name breaks lowerCamelCase, but renaming would change the Lombok-generated accessors used by callers
ReentrantReadWriteLock listRwl;//read-write lock guarding NovelList
ReentrantReadWriteLock directoryRwl;//read-write lock guarding directoryQueue
CountDownLatch latch;//latch the service awaits until every worker thread finishes
String titleRule;//CSS selector for a chapter title
String contentsRule;//CSS selector for chapter content
String listRule;//CSS selector for entries of the novel list page
String nextRule;//CSS selector for the "next page" link
String directoryRule;//CSS selector for the chapter directory links
}
import com.outdd.toolbox.reptile.pojo.NovelAssist;
import com.outdd.toolbox.reptile.service.NovelService;
import com.outdd.toolbox.reptile.thread.GetDirectoryToIoThread;
import com.outdd.toolbox.reptile.thread.GetDirectoryToQueueThread;
import com.outdd.toolbox.reptile.thread.GetNovelListThread;
import com.outdd.toolbox.reptile.util.ReptileUtil;
import org.jsoup.nodes.Document;
import org.springframework.stereotype.Service;
import java.io.UnsupportedEncodingException;
import java.net.URLEncoder;
import java.util.*;
import java.util.concurrent.ConcurrentLinkedQueue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.Executors;
import java.util.concurrent.locks.ReentrantReadWriteLock;
/**
 * Crawler service implementation: wires up the shared {@link NovelAssist}
 * context and hands it to the producer/consumer worker threads.
 */
@Service
public class NovelServiceImpl implements NovelService {
    ExecutorService es = Executors.newCachedThreadPool();// worker thread pool
    // Shared producer/consumer queues. ConcurrentLinkedQueue replaces the
    // original "volatile LinkedList": volatile only made the reference
    // visible, it did not make offer()/poll() safe for concurrent access
    // from the four worker threads.
    Queue<Map<String, List<Document>>> queue = new ConcurrentLinkedQueue<Map<String, List<Document>>>();
    Queue<List<Map<String, String>>> NovelList = new ConcurrentLinkedQueue<List<Map<String, String>>>();

    /**
     * Crawls every free novel reachable from the given list page and blocks
     * until all worker threads have counted the latch down.
     *
     * @param url entry URL of the (free) novel list page
     */
    @Override
    public void getNovelByAll(String url) {
        int xiet = 4;// one latch slot per submitted worker thread below
        CountDownLatch latch = new CountDownLatch(xiet);
        ReentrantReadWriteLock listRwl = new ReentrantReadWriteLock();
        ReentrantReadWriteLock directoryRwl = new ReentrantReadWriteLock();
        NovelAssist novelAssist = new NovelAssist();
        novelAssist.setUrl(url);
        novelAssist.setLatch(latch);
        novelAssist.setDirectoryQueue(queue);
        novelAssist.setNovelList(NovelList);
        novelAssist.setListRwl(listRwl);
        novelAssist.setDirectoryRwl(directoryRwl);
        // Qidian-specific CSS selectors; swap these to crawl another site.
        novelAssist.setListRule(".cf li .book-mid-info h4 a");
        novelAssist.setNextRule(".lbf-pagination-next");
        novelAssist.setDirectoryRule(".volume-wrap ul li a");
        novelAssist.setTitleRule(".j_chapterName");
        novelAssist.setContentsRule(".j_readContent");
        es.submit(new GetNovelListThread(novelAssist));
        es.submit(new GetDirectoryToQueueThread(novelAssist));
        es.submit(new GetDirectoryToQueueThread(novelAssist));
        es.submit(new GetDirectoryToIoThread(novelAssist));
        try {
            latch.await();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();// restore interrupt status
            e.printStackTrace();
        }
        System.out.println("结束了");
    }

    /**
     * Searches qidian.com for a novel by name.
     *
     * @param NovelName search keyword (URL-encoded before being sent)
     * @return list of {title,url} maps, one per search hit
     */
    @Override
    public List<Map<String, String>> getNovelByName(String NovelName) {
        String kw;
        try {
            // Encode the keyword so names containing spaces or CJK characters
            // survive the query string (the original sent them raw).
            kw = URLEncoder.encode(NovelName, "UTF-8");
        } catch (UnsupportedEncodingException e) {
            kw = NovelName;// UTF-8 is always supported; defensive fallback
        }
        String url = "https://www.qidian.com/search?kw=" + kw;
        String rule = ".book-mid-info h4 a";
        return ReptileUtil.getNovelByUrlToList(url, rule);
    }
}
import com.outdd.toolbox.common.util.HttpUtils;
import com.outdd.toolbox.reptile.pojo.NovelAssist;
import com.outdd.toolbox.reptile.util.ReptileUtil;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Document;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;
/**
* 获取小说列表线程
*/
@Slf4j
public class GetNovelListThread implements Runnable {
public GetNovelListThread(NovelAssist na) {
this.NovelList = na.getNovelList();
this.w = na.getListRwl().writeLock();
this.latch = na.getLatch();
this.url = na.getUrl();
this.listRule = na.getListRule();
this.nextRule = na.getNextRule();
}
Lock w;//写锁
CountDownLatch latch;//信号量
String url;//网络地址
String listRule;
String nextRule;
HttpUtils httpUtils = HttpUtils.getInstance();
Queue<List<Map<String, String>>> NovelList;
@Override
public void run() {
getAllNovelList(url, listRule, nextRule);
}
/**
* TODO: 获取全部小说列表
*
* @param url
* @param listRule 列表规则
* @param nextRule 下一章规则
* @return: java.util.Queue<java.util.Map < java.lang.String , java.lang.String>>
* @auther: vaie
* @date: 2018/11/1 21:05
*/
public void getAllNovelList(String url, String listRule, String nextRule) {
boolean filag = true;
try {
log.info(Thread.currentThread().getName()+"开启");
while (filag) {
Document doc = ReptileUtil.getDocumentOfHttps(url);
w.lock();//上了局部写锁
NovelList.offer(ReptileUtil.getNovelByUrlToList(doc,listRule));
System.out.println("cg");
w.unlock();
Map<String, Object> map = ReptileUtil.isNext(doc, nextRule);
filag = (Boolean) map.get("filag");
url = (String) map.get("url");
while (NovelList.size() > 500){
log.info("NovelList过大。获取小说列表休眠10分钟");
Thread.sleep(1000*60*30);
}
}
log.info("获取列表完成");
} catch (Exception e) {
log.error("获取列表异常");
e.printStackTrace();
} finally {
w.unlock();
log.info("释放列表写锁");
latch.countDown();
log.info("信号量-1");
}
}
}
import com.outdd.toolbox.common.util.CommomUtil;
import com.outdd.toolbox.reptile.pojo.NovelAssist;
import com.outdd.toolbox.reptile.util.ReptileUtil;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.util.*;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;
/*
* TODO: 获取目录线程
* @author VAIE
* @date: 2018/11/2-22:43
* @version v1.0
*/
@Slf4j
public class GetDirectoryToQueueThread implements Runnable{
public GetDirectoryToQueueThread(NovelAssist na){
this.r = na.getListRwl().readLock();
this.w = na.getDirectoryRwl().writeLock();
this.directoryRule = na.getDirectoryRule();
this.directoryQueue = na.getDirectoryQueue();
this.NovelList = na.getNovelList();
this.latch = na.getLatch();
}
String directoryRule;
Queue<List<Map<String,String>>> NovelList;
Queue<Map<String,List<Document>>> directoryQueue;
Lock w;//写锁
Lock r;//读锁
CountDownLatch latch;//信号量
@Override
public void run() {
getNovelDirectory(directoryRule);
}
/**
* TODO: 获取全部目录
* @param directoryRule 目录的规则
* @return: void
* @auther: vaie
* @date: 2018/11/1 21:12
*/
public void getNovelDirectory(String directoryRule){
log.info(Thread.currentThread().getName()+"目录开启");
boolean filag = true;
try {
while (filag) {
if (NovelList.size() > 0) {
r.lock();//获取局部读锁
List<Map<String,String>> novelList=NovelList.poll();
r.unlock();
for(Map<String,String> map:novelList){
String url=map.get("url");
String title=map.get("title");
Document doc = ReptileUtil.getDocumentOfHttps(url);
if(CommomUtil.isNotNull(doc)){
Elements titleUrls = doc.select(directoryRule);//标题
Map<String,List<Document>> fdmap = new HashMap<String,List<Document>>();
List<Document> fdList=new ArrayList<Document>();
for (Element titleUrl : titleUrls) {
Document document = ReptileUtil.getDocumentOfHttps("https:"+titleUrl.attr("href"));
fdList.add(document);
}
fdmap.put(title,fdList);
w.lock();
directoryQueue.offer(fdmap);
w.unlock();
}
}
} else {
synchronized (NovelList) {
while(NovelList.size() == 0) {
try {
log.info("列表队列空,等待数据");
Thread.sleep(10000);
log.info("唤醒队列检测");
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}
log.info("读取目录执行完毕");
}catch (Exception e){
log.error("读目录出错");
e.printStackTrace();
}finally {
r.unlock();//释放局部读锁
log.info("释放目录读锁");
latch.countDown();
log.info("信号量-1");
}
}
}
import com.outdd.toolbox.common.util.io.NovelIo;
import com.outdd.toolbox.reptile.pojo.NovelAssist;
import com.outdd.toolbox.reptile.util.ReptileUtil;
import lombok.extern.slf4j.Slf4j;
import org.jsoup.nodes.Document;
import java.util.List;
import java.util.Map;
import java.util.Queue;
import java.util.concurrent.CountDownLatch;
import java.util.concurrent.locks.Lock;
/**
* 获取目录向硬盘写入数据线程
*/
@Slf4j
public class GetDirectoryToIoThread implements Runnable {
public GetDirectoryToIoThread(NovelAssist na) {
this.directoryQueue = na.getDirectoryQueue();
this.r = na.getDirectoryRwl().readLock();
this.latch = na.getLatch();
this.titleRule=na.getTitleRule();
this.contentsRule=na.getContentsRule();
}
Lock r; //读取锁
CountDownLatch latch; //信号量
Queue<Map<String,List<Document>>> directoryQueue;//目录队列
String titleRule;//标题规则
String contentsRule;//内容规则
NovelIo no = new NovelIo();//小说io类
@Override
public void run() {
read(titleRule,contentsRule);
}
/**
* TODO: 从队列中获取网页文件
*
* @param titleRule 标题规则
* @param contentsRule 内容规则
* @return: void
* @auther: bjxdts
* @date: 2018/10/31 15:13
*/
public void read(String titleRule,String contentsRule) {
log.info(Thread.currentThread().getName()+"开启");
boolean filag = true;
try {
while (filag) {
if (directoryQueue.size() > 0) {
r.lock();
Map<String,List<Document>> map = directoryQueue.poll();
r.unlock();
for (String fileName:map.keySet()){
StringBuilder Details=new StringBuilder();
for (Document doc:map.get(fileName)){
Details.append(ReptileUtil.getDetails(doc,titleRule,contentsRule));
}
no.write(Details.toString(), fileName);
}
} else {
synchronized (directoryQueue) {
while(directoryQueue.size() == 0) {
try {
log.info("目录队列空,等待数据");
Thread.sleep(30000);
log.info("唤醒队列检测");
} catch (InterruptedException e) {
e.printStackTrace();
}
}
}
}
}
log.info("获取目录进硬盘执行完毕");
} catch (Exception e) {
log.error("读目录进硬盘出错");
} finally {
r.unlock();
log.info("释放目录读锁");
latch.countDown();
log.info("信号量-1");
}
}
}
import com.outdd.toolbox.common.util.CommomUtil;
import com.outdd.toolbox.common.util.HttpUtils;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import java.net.MalformedURLException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.concurrent.locks.Lock;
/*
 * Crawler utility helpers: fetch and parse pages, detect "next page" links
 * and extract titles/contents/URL lists with CSS selectors.
 * @author VAIE
 * @date: 2018/11/2-22:22
 * @version v1.0
 */
public class ReptileUtil {
    static HttpUtils httpUtils = HttpUtils.getInstance();

    /**
     * Fetches and parses a page over HTTPS while holding the given lock.
     *
     * @param url  page URL
     * @param lock lock serialising access for the caller
     * @return parsed page, or null when the request failed
     */
    public static Document getDocumentOfHttps(String url, Lock lock) {
        Document document = null;
        lock.lock();
        try {
            document = httpUtils.executeGetWithSSLAsDocument(url);
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            lock.unlock();// always released, even on failure
        }
        return document;
    }

    /**
     * Fetches and parses a page over HTTPS.
     *
     * @param url page URL
     * @return parsed page, or null when the request failed
     */
    public static Document getDocumentOfHttps(String url) {
        Document document = null;
        try {
            document = httpUtils.executeGetWithSSLAsDocument(url);
        } catch (Exception e) {
            e.printStackTrace();
        }
        return document;
    }

    /**
     * Extracts the host of a URL, always prefixed with "https://".
     * (Made static for consistency with the rest of this utility class;
     * instance-style call sites still compile.)
     *
     * @param url full URL
     * @return "https://" + host
     * @throws MalformedURLException if the URL cannot be parsed
     */
    public static String getHost(String url) throws MalformedURLException {
        java.net.URL parsed = new java.net.URL(url);
        return "https://" + parsed.getHost();
    }

    /**
     * Checks whether the page has a "next" link matching the selector.
     *
     * @param doc      parsed page
     * @param nextName CSS selector (id/class) of the next-page link
     * @return map with key "filag" (Boolean: link found) and, when found,
     *         key "url" (String: absolute next-page URL); callers must cast
     * @throws MalformedURLException declared for caller compatibility
     */
    public static Map<String, Object> isNext(Document doc, String nextName) throws MalformedURLException {
        Map<String, Object> map = new HashMap<String, Object>();
        boolean filag = true;
        Elements next = doc.select(nextName);
        if (next.size() > 0) {
            map.put("url", "https:" + next.get(0).attr("href"));
        } else {
            filag = false;// no "url" key in this case; callers get null
        }
        map.put("filag", filag);
        return map;
    }

    /**
     * Extracts chapter title(s) and content from a chapter page.
     *
     * @param doc          parsed chapter page
     * @param titleRule    CSS selector for the title
     * @param contentsRule CSS selector for the content
     * @return title and content as plain text, CRLF-separated
     */
    public static String getDetails(Document doc, String titleRule, String contentsRule) {
        // StringBuilder instead of StringBuffer: no cross-thread sharing here.
        StringBuilder resultContent = new StringBuilder();
        Elements titles = doc.select(titleRule);
        Elements contents = doc.select(contentsRule);
        for (Element title : titles) {
            resultContent.append(title.text()).append("\r\n");
        }
        for (Element content : contents) {
            resultContent.append(content.text().replaceAll("    ", "\r\n")).append("\r\n");
        }
        return resultContent.toString();
    }

    /**
     * Fetches a page and extracts its title/URL entries.
     *
     * @param url  page URL
     * @param rule CSS selector for the entries
     * @return list of maps with keys "title" and "url" (absolute)
     */
    public static List<Map<String, String>> getNovelByUrlToList(String url, String rule) {
        // Delegate to the Document overload; the original duplicated its body verbatim.
        return getNovelByUrlToList(getDocumentOfHttps(url), rule);
    }

    /**
     * Extracts title/URL entries from an already-parsed page.
     *
     * @param doc  parsed page (null-safe: returns an empty list)
     * @param rule CSS selector for the entries
     * @return list of maps with keys "title" and "url" (absolute)
     */
    public static List<Map<String, String>> getNovelByUrlToList(Document doc, String rule) {
        List<Map<String, String>> list = new ArrayList<Map<String, String>>();
        if (CommomUtil.isNotNull(doc)) {
            Elements titleUrls = doc.select(rule);
            for (Element titleUrl : titleUrls) {
                Map<String, String> map = new HashMap<String, String>();
                map.put("title", titleUrl.text());
                map.put("url", "https:" + titleUrl.attr("href"));
                list.add(map);
            }
        }
        return list;
    }
}
IO 类。想修改小说保存的位置,就在这里改:
import java.io.*;
import java.util.Date;
/*
 * IO helper that appends novel text to a per-novel .txt file.
 */
public class NovelIo {
    /**
     * Appends the given text to D://xiaoshuo/&lt;fileName&gt;.txt, creating
     * the output directory on first use.
     *
     * @param details  text to append
     * @param fileName novel title used as the file name (no extension)
     */
    public void write(String details, String fileName) {
        String path = "D://xiaoshuo/" + fileName + ".txt";// change this to relocate the output
        File file = new File(path);
        // Create the parent directory if missing (null-safe for pathological paths).
        File fileParent = file.getParentFile();
        if (fileParent != null && !fileParent.exists()) {
            fileParent.mkdirs();
        }
        // try-with-resources closes the writer even when println fails;
        // the original leaked the FileWriter on exception.
        try (PrintWriter bw = new PrintWriter(new FileWriter(file, true))) {
            bw.println(details);
        } catch (IOException e) {
            System.out.println("写入失败");// fixed typo: was "写人失败"
            e.printStackTrace();
        }
    }
}
ps:这个http类不是我写的。是从网上拷的。
import org.apache.http.HttpEntity;
import org.apache.http.NameValuePair;
import org.apache.http.client.CookieStore;
import org.apache.http.client.config.RequestConfig;
import org.apache.http.client.entity.UrlEncodedFormEntity;
import org.apache.http.client.methods.CloseableHttpResponse;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.client.methods.HttpPost;
import org.apache.http.client.protocol.HttpClientContext;
import org.apache.http.conn.ssl.SSLConnectionSocketFactory;
import org.apache.http.cookie.Cookie;
import org.apache.http.entity.ContentType;
import org.apache.http.entity.StringEntity;
import org.apache.http.impl.client.CloseableHttpClient;
import org.apache.http.impl.client.HttpClientBuilder;
import org.apache.http.impl.client.HttpClients;
import org.apache.http.message.BasicNameValuePair;
import org.apache.http.ssl.SSLContextBuilder;
import org.apache.http.util.EntityUtils;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import javax.net.ssl.*;
import java.io.IOException;
import java.security.GeneralSecurityException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
/**
 * <pre>
 * HTTP utility containing:
 * plain HTTP request helpers (uses HttpClient to send http and https requests)
 * </pre>
 * Shares one cookie table across all requests made through the singleton.
 * Created by xuyh at 2017/7/17 19:08.
 */
public class HttpUtils {
/**
 * Request timeout (socket and connect), default 20000ms.
 */
private int timeout = 20000;
/**
 * Cookie table; updated from each response and replayed on each request.
 */
private Map<String, String> cookieMap = new HashMap<>();
/**
 * Charset used to decode response bodies, default UTF-8.
 */
private String charset = "UTF-8";
private static HttpUtils httpUtils;
private HttpUtils() {
}
/**
 * Returns the singleton instance.
 * NOTE(review): lazy init without synchronization — two threads may race and
 * each create an instance; confirm whether callers require a single shared
 * cookie table.
 *
 * @return
 */
public static HttpUtils getInstance() {
if (httpUtils == null)
httpUtils = new HttpUtils();
return httpUtils;
}
/**
 * Clears the cookie table.
 */
public void invalidCookieMap() {
cookieMap.clear();
}
public int getTimeout() {
return timeout;
}
/**
 * Sets the request timeout in milliseconds.
 *
 * @param timeout
 */
public void setTimeout(int timeout) {
this.timeout = timeout;
}
public String getCharset() {
return charset;
}
/**
 * Sets the charset used to decode response bodies.
 *
 * @param charset
 */
public void setCharset(String charset) {
this.charset = charset;
}
/**
 * Parses raw HTML into a jsoup Document (after space stripping below).
 *
 * @param html
 * @return
 * @throws Exception
 */
public static Document parseHtmlToDoc(String html) throws Exception {
return removeHtmlSpace(html);
}
// Renders the parsed HTML, removes the space-like character below
// (presumably a non-breaking space — confirm encoding), and re-parses.
private static Document removeHtmlSpace(String str) {
Document doc = Jsoup.parse(str);
String result = doc.html().replace(" ", "");
return Jsoup.parse(result);
}
/**
 * Executes a GET request and returns the parsed Document.
 *
 * @param url
 * @return
 * @throws Exception
 */
public Document executeGetAsDocument(String url) throws Exception {
return parseHtmlToDoc(executeGet(url));
}
/**
 * Executes a GET request, replaying stored cookies and capturing new ones.
 *
 * @param url
 * @return response body as a string ("" on 404 with empty entity)
 * @throws Exception
 */
public String executeGet(String url) throws Exception {
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpClient httpClient = null;
String str = "";
try {
httpClient = HttpClientBuilder.create().build();
HttpClientContext context = HttpClientContext.create();
CloseableHttpResponse response = httpClient.execute(httpGet, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
int state = response.getStatusLine().getStatusCode();
if (state == 404) {
str = "";
}
try {
// NOTE(review): even on 404 the entity (if any) is read and overwrites str.
HttpEntity entity = response.getEntity();
if (entity != null) {
str = EntityUtils.toString(entity, charset);
}
} finally {
response.close();
}
} catch (IOException e) {
throw e;
} finally {
try {
if (httpClient != null)
httpClient.close();
} catch (IOException e) {
throw e;
}
}
return str;
}
/**
 * Executes a GET request over https and returns the parsed Document.
 *
 * @param url
 * @return
 * @throws Exception
 */
public Document executeGetWithSSLAsDocument(String url) throws Exception {
return parseHtmlToDoc(executeGetWithSSL(url));
}
// GETs a URL via jsoup with the given Cookie header, dumps cookie/header
// diagnostics to stdout, and returns the value of the requested header.
public static String httpGetHeader(String url,String cook,String header) throws IOException{
//open the connection
Connection con = Jsoup.connect(url);
//request headers, in particular the cookie
con.header("Accept", "text/html, application/xhtml+xml, */*");
con.header("Content-Type", "application/x-www-form-urlencoded");
con.header("User-Agent", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0))");
con.header("Cookie", cook);
//send the request
Connection.Response resp=con.method(Connection.Method.GET).execute();
//read the cookie named __bsi
String cookieValue = resp.cookie("__bsi");
System.out.println("cookie __bsi值: "+cookieValue);
//read all returned cookies
Map<String,String> cookies = resp.cookies();
System.out.println("所有cookie值: "+cookies);
//read the requested response header
String headerValue = resp.header(header);
System.out.println("头文件"+header+"的值:"+headerValue);
//read all response headers
Map<String,String> headersOne =resp.headers();
System.out.println("所有头文件值:"+headersOne);
return headerValue;
}
/**
 * Executes a GET request over https (trust-all SSL), replaying stored
 * cookies and capturing new ones.
 *
 * @param url
 * @return response body as a string
 * @throws Exception
 */
public String executeGetWithSSL(String url) throws Exception {
HttpGet httpGet = new HttpGet(url);
httpGet.setHeader("Cookie", convertCookieMapToString(cookieMap));
httpGet.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpClient httpClient = null;
String str = "";
try {
httpClient = createSSLInsecureClient();
HttpClientContext context = HttpClientContext.create();
CloseableHttpResponse response = httpClient.execute(httpGet, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
int state = response.getStatusLine().getStatusCode();
if (state == 404) {
str = "";
}
try {
HttpEntity entity = response.getEntity();
if (entity != null) {
str = EntityUtils.toString(entity, charset);
}
} finally {
response.close();
}
} catch (IOException e) {
throw e;
} catch (GeneralSecurityException ex) {
throw ex;
} finally {
try {
if (httpClient != null)
httpClient.close();
} catch (IOException e) {
throw e;
}
}
return str;
}
/**
 * Executes a form POST and returns the parsed Document.
 *
 * @param url
 * @param params
 * @return
 * @throws Exception
 */
public Document executePostAsDocument(String url, Map<String, String> params) throws Exception {
return parseHtmlToDoc(executePost(url, params));
}
/**
 * Executes a URL-encoded form POST, replaying stored cookies and capturing
 * new ones.
 *
 * @param url
 * @param params form fields
 * @return response body as a string
 * @throws Exception
 */
public String executePost(String url, Map<String, String> params) throws Exception {
String reStr = "";
HttpPost httpPost = new HttpPost(url);
httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
List<NameValuePair> paramsRe = new ArrayList<>();
for (String key : params.keySet()) {
paramsRe.add(new BasicNameValuePair(key, params.get(key)));
}
CloseableHttpClient httpclient = HttpClientBuilder.create().build();
CloseableHttpResponse response;
try {
httpPost.setEntity(new UrlEncodedFormEntity(paramsRe));
HttpClientContext context = HttpClientContext.create();
response = httpclient.execute(httpPost, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
HttpEntity entity = response.getEntity();
reStr = EntityUtils.toString(entity, charset);
} catch (IOException e) {
throw e;
} finally {
httpPost.releaseConnection();
}
return reStr;
}
/**
 * Executes a form POST over https and returns the parsed Document.
 *
 * @param url
 * @param params
 * @return
 * @throws Exception
 */
public Document executePostWithSSLAsDocument(String url, Map<String, String> params) throws Exception {
return parseHtmlToDoc(executePostWithSSL(url, params));
}
/**
 * Executes a URL-encoded form POST over https (trust-all SSL).
 * NOTE(review): unlike the GET variants, the client and response are not
 * closed here — confirm whether this leak is acceptable to callers.
 *
 * @param url
 * @param params form fields
 * @return response body as a string
 * @throws Exception
 */
public String executePostWithSSL(String url, Map<String, String> params) throws Exception {
String re = "";
HttpPost post = new HttpPost(url);
List<NameValuePair> paramsRe = new ArrayList<>();
for (String key : params.keySet()) {
paramsRe.add(new BasicNameValuePair(key, params.get(key)));
}
post.setHeader("Cookie", convertCookieMapToString(cookieMap));
post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpResponse response;
try {
CloseableHttpClient httpClientRe = createSSLInsecureClient();
HttpClientContext contextRe = HttpClientContext.create();
post.setEntity(new UrlEncodedFormEntity(paramsRe));
response = httpClientRe.execute(post, contextRe);
HttpEntity entity = response.getEntity();
if (entity != null) {
re = EntityUtils.toString(entity, charset);
}
getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
} catch (Exception e) {
throw e;
}
return re;
}
/**
 * Sends a POST request with a JSON body.
 *
 * @param url 地址
 * @param jsonBody json body
 * @return response body as a string
 * @throws Exception
 */
public String executePostWithJson(String url, String jsonBody) throws Exception {
String reStr = "";
HttpPost httpPost = new HttpPost(url);
httpPost.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
httpPost.setHeader("Cookie", convertCookieMapToString(cookieMap));
CloseableHttpClient httpclient = HttpClientBuilder.create().build();
CloseableHttpResponse response;
try {
httpPost.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
HttpClientContext context = HttpClientContext.create();
response = httpclient.execute(httpPost, context);
getCookiesFromCookieStore(context.getCookieStore(), cookieMap);
HttpEntity entity = response.getEntity();
reStr = EntityUtils.toString(entity, charset);
} catch (IOException e) {
throw e;
} finally {
httpPost.releaseConnection();
}
return reStr;
}
/**
 * Sends a POST request with a JSON body over https (trust-all SSL).
 *
 * @param url 地址
 * @param jsonBody json body
 * @return response body as a string
 * @throws Exception
 */
public String executePostWithJsonAndSSL(String url, String jsonBody) throws Exception {
String re = "";
HttpPost post = new HttpPost(url);
post.setHeader("Cookie", convertCookieMapToString(cookieMap));
post.setConfig(RequestConfig.custom().setSocketTimeout(timeout).setConnectTimeout(timeout).build());
CloseableHttpResponse response;
try {
CloseableHttpClient httpClientRe = createSSLInsecureClient();
HttpClientContext contextRe = HttpClientContext.create();
post.setEntity(new StringEntity(jsonBody, ContentType.APPLICATION_JSON));
response = httpClientRe.execute(post, contextRe);
HttpEntity entity = response.getEntity();
if (entity != null) {
re = EntityUtils.toString(entity, charset);
}
getCookiesFromCookieStore(contextRe.getCookieStore(), cookieMap);
} catch (Exception e) {
throw e;
}
return re;
}
// Copies every cookie from the response's cookie store into the shared map.
private void getCookiesFromCookieStore(CookieStore cookieStore, Map<String, String> cookieMap) {
List<Cookie> cookies = cookieStore.getCookies();
for (Cookie cookie : cookies) {
cookieMap.put(cookie.getName(), cookie.getValue());
}
}
// Renders the cookie map as a "k=v; k=v" Cookie header value.
private String convertCookieMapToString(Map<String, String> map) {
String cookie = "";
for (String key : map.keySet()) {
cookie += (key + "=" + map.get(key) + "; ");
}
if (map.size() > 0) {
cookie = cookie.substring(0, cookie.length() - 2);
}
return cookie;
}
/**
 * Creates an HttpClient that trusts every certificate and host name.
 * NOTE(review): disables all TLS verification — acceptable for a scraper,
 * never for anything handling sensitive data.
 *
 * @return
 * @throws GeneralSecurityException
 */
private static CloseableHttpClient createSSLInsecureClient() throws GeneralSecurityException {
try {
SSLContext sslContext = new SSLContextBuilder().loadTrustMaterial(null, (chain, authType) -> true).build();
SSLConnectionSocketFactory sslConnectionSocketFactory = new SSLConnectionSocketFactory(sslContext,
(s, sslContextL) -> true);
return HttpClients.custom().setSSLSocketFactory(sslConnectionSocketFactory).build();
} catch (GeneralSecurityException e) {
throw e;
}
}
}
下面是测试用例的说明:
import com.outdd.toolbox.reptile.service.NovelService;
import com.outdd.toolbox.reptile.service.impl.NovelServiceImpl;
import org.junit.Test;
public class Test1 {
//Entry URL of Qidian's free finished-novel list
String url="https://www.qidian.com/finish?action=hidden&orderId=&page=1&vip=0&style=1&pageSize=20&siteid=1&pubflag=0&hiddenField=2";
@Test
public void test1(){
NovelService novelService = new NovelServiceImpl();
novelService.getNovelByAll(url);
//This call blocks until every free novel has been written to local disk.
//Output directory is D:/xiaoshuo/
//If you'd rather not use the D: drive, change the path in the IO class (NovelIo).
}
}
大致流程-这里可以匹配其他网站的,只需要将NovelServiceImpl类的
novelAssist.setListRule(".cf li .book-mid-info h4 a");
novelAssist.setNextRule(".lbf-pagination-next");
novelAssist.setDirectoryRule(".volume-wrap ul li a");
novelAssist.setTitleRule(".j_chapterName");
novelAssist.setContentsRule(".j_readContent");
里面的规则改一下就行了
各个规则匹配什么在NovelAssist类里已经写上注释了,看看
代码写得不是很好,将就看吧,好在能用。
完结撒花!