Python《使用selenium解决动态加载的问题》

原创

DreamSeaQainXun 2022-12-14 16:32:43 博主文章分类：Python ©著作权

文章标签 selenium python 爬虫 chrome html 文章分类 运维

©著作权归作者所有：来自51CTO博客作者DreamSeaQainXun的原创作品，请联系作者获取转载授权，否则将追究法律责任

爬取的网址呢，还是上一篇博文涉足的 https://www.dmzj.com/ 但是这一次，我们使用selenium来获得每个章节的所有图片，因为动态网页比如常见的js动态生成，用静态方法访问网站并且获取html时，js动态生成的这一部分还没有生成，所以静态方法获取不了这一部分信息。

关于selenium的webdriver，我个人的理解是：webdriver模拟用户使用浏览器访问网络，只不过不是用户亲自用鼠标点击，而是通过代码来操作。在这里本人使用的是Chrome浏览器（需要配合chromedriver）。

这部分代码如下：

# Headless mode: Chrome runs without opening a visible browser window.
options = Options()
options.add_argument('--headless')

# chromeExeLoc is the path to the chromedriver executable; it is defined in
# the full listing further down, not in this excerpt.
browser = webdriver.Chrome(chromeExeLoc, options=options)
browser.maximize_window()
# Implicit wait applied to element lookups (seconds, per the Selenium docs).
browser.implicitly_wait(5)

对于每个页面都去获得那个有所有img的元素。

做个测试

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
import time
from selenium.webdriver.chrome.options import  Options

def getAllIms(opts):
    """Split page-selector options into image URLs and page labels.

    Each element of ``opts`` must offer ``get_attribute('value')`` and a
    ``text`` attribute. The label is the part of ``text`` that sits between
    the characters '第' and '页' (e.g. '第3页' -> '3').

    Returns:
        A ``(urls, labels)`` tuple of two lists, in the order of ``opts``.
    """
    urls = []
    labels = []
    for option in opts:
        urls.append(option.get_attribute('value'))
        caption = option.text
        # str.find returns -1 for a missing marker; the slice bounds below
        # are intentionally the same as in the original expression.
        start = caption.find('第') + 1
        stop = caption.find('页')
        labels.append(caption[start:stop])
    return urls, labels

def test6():
    """Smoke test: list every image URL and page label of one chapter.

    Opens a fixed chapter page in headless Chrome, reads the ``<option>``
    elements of the page selector and prints the two lists returned by
    ``getAllIms``.

    NOTE: ``chromeExeLoc`` (path to the chromedriver executable) is not
    defined in this listing; it has to be set before running the test.
    """
    # Headless mode: Chrome runs without opening a visible browser window.
    options = Options()
    options.add_argument('--headless')

    browser = webdriver.Chrome(chromeExeLoc, options=options)
    try:
        browser.maximize_window()
        browser.implicitly_wait(5)
        browser.get('https://www.dmzj.com/view/chuanlingwuyu/43878.html#@page=1')
        # Short pause after navigation, kept from the original; presumably it
        # gives the page's JavaScript time to build the selector.
        time.sleep(1)

        opts = browser.find_elements_by_xpath("//div[@class='btmBtnBox']/select/option")
        imglist, taglist = getAllIms(opts)
        print(imglist)
        print(taglist)
    finally:
        # Always end the session, otherwise the headless Chrome and
        # chromedriver processes stay alive after the test.
        browser.quit()

# Run the smoke test only when this file is executed as a script.
if __name__ == "__main__":
    test6()

输出结果如下：
D:\software\Anaconda3\install\envs\pytorch\python.exe D:/software/PyCharm/code/spider9.py
['https://images.dmzj.com/img/chapterpic/1247/25304/14492330112801.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/1449233011435.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330119295.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330124847.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330130246.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330140432.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330152702.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14494576363317.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330159679.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330165486.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330174622.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330180602.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330190377.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330192663.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330204047.jpg', 'https://images.dmzj.com/img/chapterpic/1247/25304/14492330205342.jpg']
['1', '2', '3', '4', '5', '6', '7', '8', '9', '10', '11', '12', '13', '14', '15', '16']

Process finished with exit code 0
效果是可以的，现在我们修改之前的代码，当然了，只是修改部分的代码了，大框架可以不用动，只是修改了下载单个章节的逻辑罢了。

完整代码如下：

# coding: utf-8
from concurrent.futures import ThreadPoolExecutor
import time
import os
import requests
from bs4 import BeautifulSoup
from selenium import webdriver

from selenium.webdriver.chrome.options import  Options

# Site root; also sent as the Referer of the series-page request.
rootrurl = 'https://www.dmzj.com'
# Root folder for downloads; series and chapter folders are created below it.
save_dir = 'D:/estimages/'
# Path to the chromedriver executable handed to webdriver.Chrome.
chromeExeLoc = 'D:/software/chrome/chromedriver_win32/chromedriver.exe'






# Request headers that make the HTTP client look like a regular browser.
headers = {
    "Referer": rootrurl,
    'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
    'Accept-Language': 'en-US,en;q=0.8',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive'
}  # sent with the requests.get call that fetches a series page



def saveOneImg(dir, img_url, idx, timeout=30):
    """Download one image and store it as ``<dir>/<idx>.jpg``.

    Args:
        dir: Directory that receives the file (must already exist).
        img_url: URL of the image; it is also sent as the ``Referer``.
        idx: File name stem; the caller passes the page label.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot block the worker forever.

    Returns:
        True when the server answered with status 200 and the file was
        written; False for any other status, a request error or a file
        error.
    """
    # Fresh headers per image with the Referer set to the image URL itself;
    # the original author did this to avoid HTTP 403 from hotlink protection.
    new_headers = {
        "Referer": img_url,
        'User-Agent': "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        'Accept-Language': 'en-US,en;q=0.8',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive'
    }

    try:
        img = requests.get(img_url, headers=new_headers, timeout=timeout)
    except requests.RequestException as e:
        # format() instead of '+' so a non-string URL cannot raise here.
        print('exception occurs: {}'.format(img_url))
        print(e)
        return False

    # Check the status code directly rather than searching the repr text.
    if img.status_code != 200:
        return False

    try:
        # The with-block closes the file; no explicit close() is needed.
        with open('{}/{}.jpg'.format(dir, idx), 'wb') as jpg:
            jpg.write(img.content)
    except OSError as e:
        print('exception occurs: {}'.format(img_url))
        print(e)
        return False

    print(img_url)
    return True


def getAllIms(opts):
    """Return ``(imglist, taglist)`` built from page-selector options.

    ``imglist`` holds each option's ``value`` attribute and ``taglist`` the
    text found between '第' and '页' in the option's ``text``; both lists
    follow the order of ``opts``.
    """
    def describe(option):
        # One (value, label) pair per option; the slice bounds mirror
        # str.find's results exactly, including -1 for a missing marker.
        value = option.get_attribute('value')
        caption = option.text
        return value, caption[caption.find('第') + 1:caption.find('页')]

    pairs = [describe(option) for option in opts]
    imglist = [value for value, _ in pairs]
    taglist = [label for _, label in pairs]
    return imglist, taglist


def saveOneCap(dir, href, browser):
    """Download every image of one chapter into ``dir``.

    Args:
        dir: Target directory for the chapter's images.
        href: URL of the chapter page to open.
        browser: Selenium WebDriver used to load the page.
    """
    print(href)
    browser.get(href)
    # Short pause after navigation; presumably lets the page's JavaScript
    # finish building the page selector before it is queried.
    time.sleep(1)

    option_xpath = "//div[@class='btmBtnBox']/select/option"
    imglist, taglist = getAllIms(browser.find_elements_by_xpath(option_xpath))

    # getAllIms returns two lists of equal length, one entry per option.
    for img_url, page_label in zip(imglist, taglist):
        saveOneImg(dir, img_url, page_label)




def saveOnePageFunc(dir, capters, browser):
    """Download all chapters linked from one series page.

    Args:
        dir: Series name; used as the folder name under ``save_dir``.
        capters: Chapter link elements offering ``get_text()`` and
            ``get('href')``.
        browser: Selenium WebDriver shared by all chapters of the series.
    """
    for chapter in capters:
        # One folder per chapter: <save_dir><series>/<chapter title>
        chapter_dir = '{}{}/{}'.format(save_dir, dir, chapter.get_text())
        if not os.path.exists(chapter_dir):
            os.makedirs(chapter_dir)

        chapter_url = chapter.get('href')
        saveOneCap(chapter_dir, chapter_url, browser)
        # Pause between chapters before loading the next one.
        time.sleep(2)

# 单个spider thread
# Worker for a single spider thread
def tagSpider(tag, url):
    """Download every chapter of one series.

    Args:
        tag: Series name; becomes the folder name under ``save_dir``.
        url: URL of the series page that lists the chapter links.

    The headless browser is always shut down, even when fetching or
    downloading fails part-way through.
    """
    # Headless mode: Chrome runs without opening a visible browser window.
    options = Options()
    options.add_argument('--headless')

    browser = webdriver.Chrome(chromeExeLoc, options=options)
    try:
        browser.maximize_window()
        browser.implicitly_wait(5)

        # Fetch and parse the series page; the timeout keeps a stalled
        # connection from blocking this worker forever.
        response = requests.get(url, headers=headers, timeout=30)
        html = BeautifulSoup(response.text, features="html.parser")

        chapter_list = html.find('ul', {'class': 'list_con_li autoHeight'})
        if chapter_list is None:
            # find() returns None when the list element is absent (layout
            # change, error page, ...); report it instead of crashing.
            print('no chapter list found: {}'.format(url))
            return

        # Download every chapter linked from the list.
        saveOnePageFunc(tag, chapter_list.find_all('a'), browser)

        # the job is over.
        print("thread work over. ")
    finally:
        # quit() closes all windows and ends the driver session.
        browser.quit()


if __name__ == '__main__':

    # Series to download: folder name -> series page URL
    taglist = {'川灵物语': 'https://www.dmzj.com/info/chuanlingwuyu.html',
               '魔王与勇者与圣剑神殿': 'https://www.dmzj.com/info/mwyyzysjsd.html',
               '真励之徒弟': 'https://www.dmzj.com/info/zhenlizhitu.html',
               '妖神记': 'https://www.dmzj.com/info/yaoshenji.html'}

    # One worker thread per series, at most 10 running at the same time.
    futures = {}
    with ThreadPoolExecutor(max_workers=10) as t:
        for tag, url in taglist.items():
            futures[tag] = t.submit(tagSpider, tag, url)
    # Leaving the with-block waits for every submitted task, so no extra
    # polling loop is needed and the script can exit once they finish.

    # submit() stores a worker's exception in its future; report it here so
    # a failed series does not go unnoticed.
    for tag, future in futures.items():
        error = future.exception()
        if error is not None:
            print('spider for {} failed: {!r}'.format(tag, error))

    # For a quick single-series check, call the worker directly instead:
    # tagSpider('川灵物语', 'https://www.dmzj.com/info/chuanlingwuyu.html')