简介

使用自动化工具playwright根据输入关键词,获取b站所有的搜索结果的视频标题,视频链接。

playwright获取B站搜索信息_playwright

playwright获取B站搜索信息_爬虫_02

一、css

观察网页结构,右键进入检查页获取css定位。先填充查找信息后点击搜索。

playwright获取B站搜索信息_playwright_03

 page.locator(".nav-search-input").fill(msg)
 page.locator(".nav-search-btn").click()

进入到搜索界面后,同样进行分析

playwright获取B站搜索信息_python_04

 res = page1.locator(".bili-video-card__info--right > a").all()
 for i in res:
   link = "https:" + i.get_attribute("href")
   title = i.locator("h3").inner_text()

每获取一页进行点击下一页

playwright获取B站搜索信息_python_05

try:
	next_button.click()
	page1.wait_for_timeout(1000)
except Exception:
	break

完整代码如下:

import csv
from playwright.sync_api import sync_playwright

def main(msg):
    with sync_playwright() as p:
        browser = p.chromium.launch(executable_path=r"C:\Users\PC\AppData\Local\ms-playwright\chromium-1091\chrome-win\chrome.exe")
        context = browser.new_context()
        page = context.new_page()
        page.goto("https://www.bilibili.com/")
        with page.expect_popup() as page1_info:
            page.locator(".nav-search-input").fill(msg)
            page.locator(".nav-search-btn").click()
        page1 = page1_info.value
        page1.wait_for_timeout(1000)
        with open('test.csv', 'w', newline='', encoding='utf-8') as f:
            col_names = ['标题', '链接']
            writer = csv.DictWriter(f, fieldnames=col_names)
            writer.writeheader()
            while True:
                res = page1.locator(".bili-video-card__info--right > a").all()
                for i in res:
                    link = "https:" + i.get_attribute("href")
                    title = i.locator("h3").inner_text()
                    print(f"标题: {title}, 链接: {link}")
                    writer.writerow({'标题': title, '链接': link})
                next_button = page1.get_by_role("button", name="下一页")
                try:
                    next_button.click()
                    page1.wait_for_timeout(1000)
                except Exception:
                    break
        browser.close()

if __name__ == "__main__":
    msg = input("请输入查询信息:")
    main(msg)

二、xpath

也支持使用xpath表达式

playwright获取B站搜索信息_爬虫_06

 page.locator('xpath=//*[@id="nav-searchform"]/div[1]/input').fill(msg)

观察xpath路径结构

playwright获取B站搜索信息_playwright_07

只需修改中间的div[i]即可,但要注意首页和其他页的结构不一样。

完整代码:

import csv
from playwright.sync_api import sync_playwright

def main(msg):
    with sync_playwright() as p:
        browser = p.chromium.launch(headless=False,executable_path=r"C:\Users\PC\AppData\Local\ms-playwright\chromium-1091\chrome-win\chrome.exe")
        context = browser.new_context()
        page = context.new_page()
        page.goto("https://www.bilibili.com/")
        with page.expect_popup() as page1_info:
            page.locator('xpath=//*[@id="nav-searchform"]/div[1]/input').fill(msg)
            page.locator('xpath=//*[@id="nav-searchform"]/div[2]').click()
        page1 = page1_info.value
        page1.wait_for_timeout(1000)

        with open('test2.csv', 'w', newline='', encoding='utf-8') as f:
            col_names = ['标题', '链接']
            writer = csv.DictWriter(f, fieldnames=col_names)
            writer.writeheader()

            res = page1.locator('xpath=//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div/div[2]/div')
            for i in range(1, 31):
                try:
                    link = "https:" + res.locator(f'xpath=/div[{i}]/div/div[2]/div/div/a').get_attribute("href")
                    title = res.locator(f'xpath=/div[{i}]/div/div[2]/div/div/a/h3').get_attribute("title")
                    print(f"第1页!!!!!{i}!!!!!标题: {title}, 链接: {link}")
                    writer.writerow({'标题': title, '链接': link})
                except:
                    break
            next_button = page1.get_by_role("button", name="下一页")

            try:
                next_button.click()
                page1.wait_for_timeout(1000)
            except Exception:
                browser.close()
            page = 1
            while True:
                page+=1
                res = page1.locator('xpath=//*[@id="i_cecream"]/div/div[2]/div[2]/div/div/div[1]')
                for i in range(1, 31):
                    try:
                        link = "https:" + res.locator(f'xpath=/div[{i}]/div/div[2]/div/div/a').get_attribute("href")
                        title = res.locator(f'xpath=/div[{i}]/div/div[2]/div/div/a/h3').get_attribute("title")
                        print(f"第{page}页!!!!!{i}!!!!!标题: {title}, 链接: {link}")
                        writer.writerow({'标题': title, '链接': link})
                    except:
                        break

                next_button = page1.get_by_role("button", name="下一页")

                try:
                    next_button.click()
                    page1.wait_for_timeout(1000)

                except Exception:
                    break
        browser.close()


if __name__ == "__main__":
    msg = input("请输入查询信息:")
    main(msg)