playwright获取B站搜索信息

精选原创

进击no猪排 2024-01-13 14:55:01 博主文章分类：Python项目 ©著作权

文章标签 python 爬虫 playwright 文章分类 代码人生

©著作权归作者所有：来自51CTO博客作者进击no猪排的原创作品，请联系作者获取转载授权，否则将追究法律责任

简介

使用自动化工具playwright，根据输入的关键词获取B站全部搜索结果的视频标题和视频链接。

playwright获取B站搜索信息_python

playwright获取B站搜索信息_爬虫_02

一、css

观察网页结构，右键进入检查页获取css定位。先在搜索框中填入关键词，再点击搜索按钮。

playwright获取B站搜索信息_playwright_03

 # Type the keyword into the navigation-bar search input, then click the search button.
 page.locator(".nav-search-input").fill(msg)
 page.locator(".nav-search-btn").click()

进入到搜索界面后，同样进行分析

playwright获取B站搜索信息_playwright_04

 # Collect every <a> that is a direct child of a video card's right-hand info block.
 res = page1.locator(".bili-video-card__info--right > a").all()
 for i in res:
   # The scheme is prepended to the href; presumably the href is
   # protocol-relative ("//...") -- TODO confirm against the live page.
   link = "https:" + i.get_attribute("href")
   # The video title is the text of the <h3> inside the anchor.
   title = i.locator("h3").inner_text()

每获取完一页的结果后，点击“下一页”按钮继续；点击失败（没有下一页）时结束循环。

playwright获取B站搜索信息_playwright_05

# Advance to the next results page. This excerpt belongs inside the paging
# loop of the full script (the loop itself is not shown here).
try:
	next_button.click()
	# Fixed 1 s pause after the click before the next page is read.
	page1.wait_for_timeout(1000)
except Exception:
	# Any failure of the click is treated as "no more pages" and ends the loop.
	break

完整代码如下：

import csv
from playwright.sync_api import sync_playwright

def main(msg: str, executable_path: "str | None" = None) -> None:
    """Search Bilibili for *msg* and save every result's title and link to ``test.csv``.

    The search is submitted from the Bilibili home page, which opens the
    results in a new tab. Each results page is scraped with CSS selectors and
    the "下一页" (next page) button is followed until it is missing, disabled,
    or cannot be clicked.

    Args:
        msg: Keyword typed into the search box.
        executable_path: Optional path to a Chromium binary. When ``None``
            (the default) Playwright launches its own managed browser, so the
            script is not tied to one machine's install location.

    Raises:
        playwright.sync_api.Error: If navigation or the search popup fails.
        OSError: If ``test.csv`` cannot be opened for writing.
    """
    # Function-scope imports keep this script self-contained.
    from urllib.parse import urljoin

    from playwright.sync_api import Error as PlaywrightError

    with sync_playwright() as p:
        browser = p.chromium.launch(executable_path=executable_path)
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto("https://www.bilibili.com/")

            # Submitting the search opens the results in a popup (new tab).
            with page.expect_popup() as page1_info:
                page.locator(".nav-search-input").fill(msg)
                page.locator(".nav-search-btn").click()
            page1 = page1_info.value
            # Fixed pause before reading the results (presumably to let the
            # page render -- Locator.all() does not wait for elements).
            page1.wait_for_timeout(1000)

            with open('test.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=['标题', '链接'])
                writer.writeheader()

                while True:
                    cards = page1.locator(".bili-video-card__info--right > a").all()
                    for card in cards:
                        href = card.get_attribute("href")
                        heading = card.locator("h3")
                        # Skip malformed cards instead of crashing on a
                        # missing href or waiting 30 s for an absent <h3>.
                        if not href or heading.count() == 0:
                            continue
                        # urljoin resolves protocol-relative hrefs ("//...")
                        # and leaves already-absolute URLs untouched.
                        link = urljoin(page1.url, href)
                        title = heading.first.inner_text()
                        print(f"标题: {title}, 链接: {link}")
                        writer.writerow({'标题': title, '链接': link})

                    next_button = page1.get_by_role("button", name="下一页")
                    # Detect the last page up front rather than letting
                    # click() run into its full actionability timeout.
                    if next_button.count() == 0 or next_button.first.is_disabled():
                        break
                    try:
                        next_button.first.click()
                    except PlaywrightError:
                        # Button present but not clickable: treat as the end.
                        break
                    page1.wait_for_timeout(1000)
        finally:
            # Close the browser even when scraping fails part-way.
            browser.close()

if __name__ == "__main__":
    # Prompt for the search keyword and hand it straight to the scraper.
    main(input("请输入查询信息："))

二、xpath

playwright也支持使用xpath表达式来定位元素。

playwright获取B站搜索信息_python_06

 # Same search-box fill as the CSS version, but located with an XPath
 # expression (the "xpath=" prefix selects Playwright's XPath engine).
 page.locator('xpath=//*[@id="nav-searchform"]/div[1]/input').fill(msg)

观察xpath路径结构

playwright获取B站搜索信息_爬虫_07

只需修改中间的div[i]即可依次定位每个视频；但要注意搜索结果的第1页和后续各页的外层结构不一样，需要分别使用不同的xpath。

完整代码：

import csv
from playwright.sync_api import sync_playwright

def main(msg: str, executable_path: "str | None" = None) -> None:
    """Search Bilibili for *msg* and save every result's title and link to ``test2.csv``.

    XPath variant of the scraper. The search is submitted from the Bilibili
    home page, which opens the results in a new tab. The result grid is
    located with one XPath on the first results page and a different one on
    every later page; inside the grid, cards are addressed as ``div[1]`` ..
    ``div[30]``. Paging stops when the "下一页" (next page) button is missing,
    disabled, or cannot be clicked.

    Args:
        msg: Keyword typed into the search box.
        executable_path: Optional path to a Chromium binary. When ``None``
            (the default) Playwright launches its own managed browser, so the
            script is not tied to one machine's install location.

    Raises:
        playwright.sync_api.Error: If navigation or the search popup fails.
        OSError: If ``test2.csv`` cannot be opened for writing.
    """
    # Function-scope imports keep this script self-contained.
    from urllib.parse import urljoin

    from playwright.sync_api import Error as PlaywrightError

    # The grid that holds the video cards lives at a different path on the
    # first results page than on all following pages.
    first_page_grid = 'xpath=//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[2]/div'
    later_page_grid = 'xpath=//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]'
    max_cards_per_page = 30

    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False, executable_path=executable_path)
        try:
            context = browser.new_context()
            page = context.new_page()
            page.goto("https://www.bilibili.com/")

            # Submitting the search opens the results in a popup (new tab).
            with page.expect_popup() as page1_info:
                page.locator('xpath=//*[@id="nav-searchform"]/div[1]/input').fill(msg)
                page.locator('xpath=//*[@id="nav-searchform"]/div[2]').click()
            page1 = page1_info.value
            # Fixed pause before reading the results (presumably to let the
            # page render -- Locator.count() does not wait for elements).
            page1.wait_for_timeout(1000)

            with open('test2.csv', 'w', newline='', encoding='utf-8') as f:
                writer = csv.DictWriter(f, fieldnames=['标题', '链接'])
                writer.writeheader()

                # Separate counter; must not rebind the Playwright ``page``.
                page_no = 1
                while True:
                    grid = page1.locator(first_page_grid if page_no == 1 else later_page_grid)
                    for i in range(1, max_cards_per_page + 1):
                        anchor_xpath = f'xpath=/div[{i}]/div/div[2]/div/div/a'
                        anchor = grid.locator(anchor_xpath)
                        # count() returns immediately, so a page with fewer
                        # cards ends here instead of waiting out a timeout.
                        if anchor.count() == 0:
                            break
                        href = anchor.first.get_attribute("href")
                        if not href:
                            # Card without a link: skip it, keep the rest.
                            continue
                        # urljoin resolves protocol-relative hrefs ("//...")
                        # and leaves already-absolute URLs untouched.
                        link = urljoin(page1.url, href)
                        heading = grid.locator(anchor_xpath + '/h3')
                        title = heading.first.get_attribute("title") if heading.count() else ""
                        print(f"第{page_no}页！！！！！{i}！！！！！标题: {title}, 链接: {link}")
                        writer.writerow({'标题': title, '链接': link})

                    next_button = page1.get_by_role("button", name="下一页")
                    # Detect the last page up front rather than letting
                    # click() run into its full actionability timeout.
                    if next_button.count() == 0 or next_button.first.is_disabled():
                        break
                    try:
                        next_button.first.click()
                    except PlaywrightError:
                        # Button present but not clickable: treat as the end.
                        break
                    page1.wait_for_timeout(1000)
                    page_no += 1
        finally:
            # Close the browser exactly once, even when scraping fails.
            browser.close()


if __name__ == "__main__":
    # Prompt for the search keyword and hand it straight to the scraper.
    main(input("请输入查询信息："))