# -*- coding: utf-8 -*-
"""Crawl a paginated article index and persist (Url, Title, Author) rows
into a local SQLite database, skipping URLs that are already stored.

Fixes over the original:
- parameterized SQL (``?`` placeholders) instead of ``%`` string interpolation,
  which was vulnerable to SQL injection and broke on quotes in titles;
- removed the misuse of ``re.escape`` (it escapes regex metacharacters, not
  SQL, and corrupted stored titles) and the manual ``"`` doubling — bound
  parameters make both unnecessary;
- no longer shadows the builtin ``len``;
- added a request timeout so a dead connection cannot hang the crawl.
"""
import sqlite3

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    conn = sqlite3.connect('Python.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS Python (
                     Url VARCHAR,
                     Title VARCHAR,
                     Author VARCHAR
                 )''')
    conn.commit()

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
    }

    for i in range(1, 1046):
        url = "http://xxx/index_%s.html" % str(i)
        # timeout keeps one dead connection from stalling the whole crawl
        req = requests.get(url=url, headers=headers, timeout=30)
        req.encoding = "utf-8"
        html = BeautifulSoup(req.text, "lxml")

        for div in html.find_all('div', class_='loop'):
            content_body = div.select('h2 > a')[0]
            content_infor = div.select('.content_infor > span:nth-child(3)')[0]
            page_url = "http://xxx" + content_body.get('href')

            # Parameterized query: safe against injection and quote breakage.
            row_count = c.execute(
                "SELECT COUNT(*) FROM Python WHERE Url = ?",
                (page_url,)).fetchone()[0]
            if row_count > 0:
                continue  # already stored — skip duplicates

            # Bound parameters store the title verbatim; no escaping needed.
            c.execute(
                "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                (page_url,
                 content_body.get('title'),
                 content_infor.text.replace('xxx: ', '')))

        conn.commit()
        print("第%s页" % str(i))

    conn.close()
Python-网页爬虫与Sqlite3
本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

提问和评论都可以,用心的回复会被更多人看到
评论
发布评论
相关文章
-
python 与sqlite3
有一个解密 Chrome cookie 的需求，从 Google 搜到了现成代码，却不能正常执行。
sqlite ci sql -
python sqlite3 operation
python sqlite3 类重写操作
python sqlite3