# -*- coding: utf-8 -*-
"""Crawl a paginated article index and persist (Url, Title, Author) rows
into a local SQLite database, skipping URLs that are already stored.

Fixes over the original:
- parameterized SQL (``?`` placeholders) instead of ``%`` string interpolation,
  which was vulnerable to SQL injection and broke on quotes in titles;
- removed the misuse of ``re.escape`` (it escapes regex metacharacters, not
  SQL, and corrupted stored titles) and the manual ``"`` doubling — bound
  parameters make both unnecessary;
- no longer shadows the builtin ``len``;
- added a request timeout so a dead connection cannot hang the crawl.
"""
import sqlite3

import requests
from bs4 import BeautifulSoup

if __name__ == '__main__':
    conn = sqlite3.connect('Python.db')
    c = conn.cursor()
    c.execute('''CREATE TABLE IF NOT EXISTS Python (
                     Url VARCHAR,
                     Title VARCHAR,
                     Author VARCHAR
                 )''')
    conn.commit()

    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/50.0.2661.87 Safari/537.36"
    }

    for i in range(1, 1046):
        url = "http://xxx/index_%s.html" % str(i)
        # timeout keeps one dead connection from stalling the whole crawl
        req = requests.get(url=url, headers=headers, timeout=30)
        req.encoding = "utf-8"
        html = BeautifulSoup(req.text, "lxml")

        for div in html.find_all('div', class_='loop'):
            content_body = div.select('h2 > a')[0]
            content_infor = div.select('.content_infor > span:nth-child(3)')[0]
            page_url = "http://xxx" + content_body.get('href')

            # Parameterized query: safe against injection and quote breakage.
            row_count = c.execute(
                "SELECT COUNT(*) FROM Python WHERE Url = ?",
                (page_url,)).fetchone()[0]
            if row_count > 0:
                continue  # already stored — skip duplicates

            # Bound parameters store the title verbatim; no escaping needed.
            c.execute(
                "INSERT INTO Python (Url, Title, Author) VALUES (?, ?, ?)",
                (page_url,
                 content_body.get('title'),
                 content_infor.text.replace('xxx: ', '')))

        conn.commit()
        print("第%s页" % str(i))

    conn.close()
Python-网页爬虫与Sqlite3
本文章为转载内容，我们尊重原作者对文章享有的著作权。如有内容错误或侵权问题，欢迎原作者联系我们进行内容更正或删除文章。

提问和评论都可以,用心的回复会被更多人看到
评论
发布评论
相关文章
-
python 与sqlite3
有一个解密 Chrome cookie 的需求，从 Google 搜到了现成代码，却不能正常执行。
sqlite ci sql -
python sqlite3 operation
python sqlite3 类重写操作
python sqlite3