def crawl(url):
    """Download a CSDN article and save it as HTML, plain text and Markdown.

    Parameters
    ----------
    url : str
        URL of the article page to fetch.

    Prints progress messages; prints "failed!" and returns early when the
    request is not HTTP 200 or the expected page elements are missing.
    """
    # save() reads `title` at module scope to build its output filenames,
    # so publish the value there instead of keeping it local.
    global title
    # A browser-like User-Agent to get past the site's basic anti-bot check.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/95.0.4638.54 Safari/537.36",
    }
    print("crawl...")
    response = requests.get(url, headers=headers)
    # Only HTTP 200 counts as success.
    if response.status_code != 200:
        print("failed!")
        return
    html = response.content.decode("utf8")
    tree = etree.HTML(html)
    print("look for text...")
    # Title and article body live in fixed-id elements on the page.
    title_hits = tree.xpath('//*[@id="articleContentId"]/text()')
    body_hits = tree.xpath('//*[@id="content_views"]')
    if not title_hits or not body_hits:
        # Layout changed or this is not an article page: report failure
        # instead of crashing with IndexError on the empty xpath result.
        print("failed!")
        return
    title = title_hits[0]
    # Raw HTML of the article body, with entities unescaped.
    ohtml = unescape(etree.tostring(body_hits[0]).decode("utf8"))
    # Plain-text rendering of the same element.
    text = body_hits[0].xpath('string(.)').strip()
    print("title:", title)
    save(ohtml, text)
    print("finish!")


def save(html, text, title=None):
    """Write the article to output/{html,text,markdown}/<title>.<ext>.

    Parameters
    ----------
    html : str
        Raw article HTML; also the source for the Markdown conversion.
    text : str
        Plain-text version of the article.
    title : str, optional
        Base name for the three output files.  Defaults to the module-level
        ``title`` published by crawl(); falls back to "article" so a missing
        global no longer raises NameError.
    """
    if title is None:
        # The original code read `title` from module scope; keep that
        # working, but don't crash when it was never set there.
        title = globals().get("title", "article")
    # makedirs(exist_ok=True) also repairs a partially created tree, which
    # the old `"output" not in os.listdir()` guard could not (it skipped
    # creating the subdirectories whenever output/ already existed).
    for sub in ("html", "text", "markdown"):
        os.makedirs(f"output/{sub}", exist_ok=True)
    with open(f"output/html/{title}.html", 'w', encoding='utf8') as html_file:
        # Save the raw HTML.
        print("write html...")
        html_file.write(html)
    with open(f"output/text/{title}.txt", 'w', encoding='utf8') as txt_file:
        # Save the plain text.
        print("write text...")
        txt_file.write(text)
    with open(f"output/markdown/{title}.md", 'w', encoding='utf8') as md_file:
        # Convert the HTML to Markdown and save it.
        print("write markdown...")
        text_maker = HTML2Text()
        md_text = text_maker.handle(html)
        md_file.write(md_text)


if __name__ == '__main__':
    # Replace the placeholder with the URL of the article you want to crawl.
    article_url = "url"
    crawl(article_url)

# 感谢大佬帮助
#
# 搞定
#
# 快乐
#
# \( ̄︶ ̄*\))
#
# 睡觉。
#
# ( ̄o ̄) . z Z