from time import sleep

from lxml import etree
from selenium import webdriver

options = webdriver.ChromeOptions()
# options.add_argument('--headless')
options.add_argument(
    "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36"
    " (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
# Chrome ignores the two arguments below: request headers such as Referer and
# Cookie cannot be set through ChromeOptions (see the add_cookie() sketch at
# the end of this file for one way to supply cookies instead).
# options.add_argument("Referer=https://s.weibo.com/")
# options.add_argument('Cookie: ')
options.add_argument('--no-sandbox')
options.add_argument('--disable-dev-shm-usage')
options.add_argument('blink-settings=imagesEnabled=false')  # skip image loading to speed up crawling
options.add_argument('--disable-gpu')
options.add_argument('--hide-scrollbars')  # hide scrollbars; helps with some unusual pages


class Qidian:
    def __init__(self, url, driver):
        self.url = url
        self.driver = driver
        content = self.get_content(url)
        self.file_name = self.parse_file_name(content)

    def crawl_start(self):
        content = self.get_content(self.url)
        self.parse_detail(content)

    def get_content(self, url):
        self.driver.get(url)
        content = self.driver.page_source
        return content

    def parse_file_name(self, content):
        # The book title sits in <div id="info"><h1>...</h1>
        html = etree.HTML(content)
        file_info = html.xpath('//*[@id="info"]/h1/text()')
        file_name = file_info[0] + ".txt"
        return file_name

    def parse_detail(self, content):
        html = etree.HTML(content)
        # Each <dd> under <div id="list"> holds one chapter link
        dd_list = html.xpath('//div[@id="list"]/dl//dd')
        # Truncate any previous output before appending chapters
        open(self.file_name, 'w', encoding='utf-8').close()
        for dd in dd_list:
            item = {}
            title = dd.xpath('./a/text()')
            href = dd.xpath('./a/@href')
            item['title'] = title[0]
            item['href'] = "http://www.biquge.info/0_273/" + href[0]
            print(item)
            self.driver.get(item['href'])
            chapter_html = etree.HTML(self.driver.page_source)
            details = chapter_html.xpath('//*[@id="content"]//text()')
            detail = ''.join(details)
            self.save_to_file(self.file_name, item['title'], detail)
            sleep(3)  # pause between chapter requests to avoid hammering the site

    def save_to_file(self, file_name, title, content):
        with open(file_name, 'a+', encoding='utf-8') as f:
            f.write(title + '\n')
            f.write(content)
            f.write('\n')


if __name__ == "__main__":
    url = "http://www.biquge.info/0_273/"
    driver = webdriver.Chrome(options=options)
    try:
        qidian = Qidian(url, driver)
        qidian.crawl_start()
    except Exception as e:
        print(str(e))
    finally:
        driver.quit()
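
# Optional: supplying cookies to the browser session.
# A minimal sketch, assuming the site ever requires a login cookie; the cookie
# name and value below are placeholders, not values from this script.
# ChromeOptions cannot inject a Cookie header, but Selenium's add_cookie() can,
# provided the browser has already visited the cookie's domain:
#
#   driver = webdriver.Chrome(options=options)
#   driver.get("http://www.biquge.info/0_273/")        # visit the domain first
#   driver.add_cookie({'name': 'placeholder_name',     # hypothetical cookie
#                      'value': 'placeholder_value'})
#   driver.get("http://www.biquge.info/0_273/")        # reload so the cookie is sent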