Mirror sources (to speed up module downloads)
Commonly used mirrors in China:
Aliyun http://mirrors.aliyun.com/pypi/simple/
USTC (University of Science and Technology of China) https://pypi.mirrors.ustc.edu.cn/simple/
Douban http://pypi.douban.com/simple/
Tsinghua University https://pypi.tuna.tsinghua.edu.cn/simple/
pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple/
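To avoid passing -i on every install, the mirror can also be made permanent through pip's own config command (a minimal sketch, assuming pip >= 10):
pip config set global.index-url https://pypi.tuna.tsinghua.edu.cn/simple/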
1. Crawler basics (Tieba example)
Basic crawling operations.
A small Baidu Tieba example (using urllib.request).
# 打开url,封装请求
from urllib.request import urlopen, Request
# 将字典封装为网站参数
from urllib.parse import urlencode
# 随机UA(防止ip被封)
from fake_useragent import UserAgent
# 获取html
def get_html(url):
headers = {
'User-Agent': UserAgent().chrome
}
request = Request(url, headers=headers)
response = urlopen(request)
# print(response.read().decode())
return response.read()
# 保存html到本地
def save_html(filename, html_bytes):
with open(filename, 'wb') as f:
print('正在保存' + filename)
f.write(html_bytes)
def main():
context = input('请输入要下载的内容:')
num = input('请输入要下载的页数:')
base_url = 'https://tieba.baidu.com/f?ie=utf-8&{}'
for pn in range(int(num)):
args = {
'pn': pn * 50,
'kw': context
}
args = urlencode(args)
# print(args)
# print(base_url.format(args))
filename = '第' + str(pn + 1) + '页.html'
print('开始下载' + filename)
html_bytes = get_html(base_url.format(args))
save_html(filename, html_bytes)
print(filename + '下载完成')
if __name__ == '__main__':
main()
2. Sending POST requests
Send a POST request to a site, passing the form parameters.
from urllib.request import urlopen, Request
from urllib.parse import urlencode
from fake_useragent import UserAgent
url = 'http://www.zengqiang.club/admin/login'
form_data = {
'username': '曾强',
'password': 'ZQZ981004'
}
# print(urlencode(form_data))
headers = {
'User-Agent': UserAgent().random
}
# print(headers)
f_data = urlencode(form_data)
request = Request(url, data=f_data.encode(), headers=headers)
response = urlopen(request)
print(response.read().decode())
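If there is no login endpoint handy to test against, the same kind of POST can be pointed at httpbin.org/post (also used in the proxy section below), which simply echoes back the form it received; a minimal sketch with placeholder credentials:
# Sketch: POST the same kind of form to httpbin.org/post, which echoes the fields back in its "form" key.
from urllib.request import urlopen, Request
from urllib.parse import urlencode
from fake_useragent import UserAgent
data = urlencode({'username': 'test', 'password': 'test'}).encode()  # placeholder credentials
request = Request('http://httpbin.org/post', data=data, headers={'User-Agent': UserAgent().random})
print(urlopen(request).read().decode())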
3. Scraping AJAX requests (Douban movie chart)
For AJAX, find the real request URL under the browser's Network tab,
work out what each URL parameter means and how it changes,
then request that URL in a loop to collect the data.
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start={}&limit=20'
i = 0
while True:
headers = {
'User-Agent': UserAgent().random
}
# 将base_url中{}代表的参数传入,封装为完整的url
    url = base_url.format(i * 20)  # 与 limit=20 保持一致,避免跳过数据
request = Request(url, headers=headers)
response = urlopen(request)
info = response.read().decode()
if len(info) < 10:
break
print(info)
i += 1
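The `len(info) < 10` test is a rough way of spotting the empty JSON array `[]` that Douban returns past the last page. A clearer variant parses the response and stops when the list is empty (a sketch, assuming each item keeps Douban's `title` and `score` fields):
# Sketch: stop paging when the parsed JSON list is empty instead of guessing from the string length.
import json
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
base_url = 'https://movie.douban.com/j/chart/top_list?type=5&interval_id=100%3A90&action=&start={}&limit=20'
start = 0
while True:
    request = Request(base_url.format(start), headers={'User-Agent': UserAgent().random})
    movies = json.loads(urlopen(request).read().decode())
    if not movies:  # empty list -> past the last page
        break
    for movie in movies:
        print(movie.get('title'), movie.get('score'))
    start += 20  # step by the page size (limit=20)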
4. Using proxies
Use a proxy so that heavy access does not get your own IP banned;
requests then reach the site from a different IP.
from urllib.request import Request, build_opener
from urllib.request import ProxyHandler
from fake_useragent import UserAgent
url = 'http://httpbin.org/get'
headers = {
'User-Agent': UserAgent().chrome
}
request = Request(url, headers=headers)
# 两种方式:(1是购买使用,2是免费的,网上找)
# handler = ProxyHandler({'http':'username:password@ip:port'})
# handler = ProxyHandler({'http':'ip:port'})
handler = ProxyHandler({'http': '39.137.107.98:80'})
# 封装成自己的opener
opener = build_opener(handler)
# 用自定义的opener去发出请求
response = opener.open(request)
print(response.read().decode())
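Since httpbin.org/get echoes the caller's IP back in its `origin` field, the easiest way to confirm the proxy is actually being used is to parse that field (a sketch; the free proxy above may well be offline by the time you try it):
# Sketch: verify the request really went through the proxy by checking the "origin" IP httpbin reports.
import json
from urllib.request import Request, ProxyHandler, build_opener
from fake_useragent import UserAgent
handler = ProxyHandler({'http': '39.137.107.98:80'})  # free proxy from the example above, may be dead
opener = build_opener(handler)
request = Request('http://httpbin.org/get', headers={'User-Agent': UserAgent().chrome})
reply = json.loads(opener.open(request).read().decode())
print(reply['origin'])  # should print the proxy's IP rather than your own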
5. Using cookies
Use cookies to access pages that require a login.
Two approaches: use the cookie directly, or save it to a file and load it again later.
from urllib.request import Request, HTTPCookieProcessor, build_opener
from urllib.parse import urlencode
from fake_useragent import UserAgent
# 登录
login_url = 'http://www.zengqiang.club/admin/login'
headers = {
'User-Agent': UserAgent().chrome
}
form_date = {
'username': '曾强',
'password': 'ZQZ981004'
}
f_date = urlencode(form_date).encode()
request = Request(login_url, headers=headers, data=f_date)
handler = HTTPCookieProcessor()
opener = build_opener(handler)
opener.open(request)
# 登录成功
url = 'http://www.zengqiang.club/admin/blogs'
request = Request(url, headers=headers)
response = opener.open(request)
print(response.read().decode())
from urllib.request import Request, HTTPCookieProcessor, build_opener
from urllib.parse import urlencode
from fake_useragent import UserAgent
from http.cookiejar import MozillaCookieJar
# 登录
# 保存cookie到文件
def get_cookie():
login_url = 'http://www.zengqiang.club/admin/login'
headers = {
'User-Agent': UserAgent().chrome
}
form_date = {
'username': '曾强',
'password': 'ZQZ981004'
}
f_date = urlencode(form_date).encode()
request = Request(login_url, headers=headers, data=f_date)
cookie_jar = MozillaCookieJar()
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
opener.open(request)
cookie_jar.save('cookie.txt', ignore_expires=True, ignore_discard=True)
# 加载cookie
# 访问页面
def use_cookie():
url = 'http://www.zengqiang.club/admin/blogs'
headers = {
'User-Agent': UserAgent().chrome
}
request = Request(url, headers=headers)
cookie_jar = MozillaCookieJar()
cookie_jar.load('cookie.txt',ignore_expires=True,ignore_discard=True)
handler = HTTPCookieProcessor(cookie_jar)
opener = build_opener(handler)
response = opener.open(request)
print(response.read().decode())
if __name__ == '__main__':
get_cookie()
use_cookie()
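For comparison, `requests.Session()` (used again in the Yundama section below) stores and resends cookies automatically, so the whole login-then-visit flow shrinks to a few lines; a sketch assuming the same form fields as above:
# Sketch: same login + authenticated page visit, with requests.Session handling the cookies.
import requests
from fake_useragent import UserAgent
session = requests.Session()
headers = {'User-Agent': UserAgent().chrome}
session.post('http://www.zengqiang.club/admin/login', data={'username': '曾强', 'password': 'ZQZ981004'}, headers=headers)
response = session.get('http://www.zengqiang.club/admin/blogs', headers=headers)
print(response.text)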
6. Using URLError
Exception handling with try/except.
from urllib.request import Request, urlopen
from fake_useragent import UserAgent
from urllib.error import URLError
url = 'http://www.zengqiang.club/1.html'
headers = {
'User-Agent': UserAgent().random
}
try:
request = Request(url, headers=headers)
response = urlopen(request)
print(response.read().decode())
except URLError as e:
if e.args == ():
print(e.code)
else:
print(e.args[0].errno)
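The `e.args == ()` trick relies on `HTTPError` carrying no args; catching the two exception types separately reads better (a sketch of the same request):
# Sketch: handle HTTP status errors and connection-level errors separately.
from urllib.request import Request, urlopen
from urllib.error import HTTPError, URLError
from fake_useragent import UserAgent
request = Request('http://www.zengqiang.club/1.html', headers={'User-Agent': UserAgent().random})
try:
    print(urlopen(request).read().decode())
except HTTPError as e:  # the server answered, but with a 4xx/5xx status
    print(e.code)
except URLError as e:  # DNS failure, refused connection, timeout, ...
    print(e.reason)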
7. Using requests
requests is more convenient than urllib and needs less code.
import requests
from fake_useragent import UserAgent
# get请求
# url = 'https://www.baidu.com/s'
# headers = {
# 'User-Agent': UserAgent().chrome
# }
# params = {
# 'wd': '重庆文理学院'
# }
# response = requests.get(url, headers=headers, params=params)
# response.encoding = 'utf-8'
#
# print(response.url)
# post请求
url = 'http://www.zengqiang.club/admin/login'
form_data = {
'username': '曾强',
'password': 'ZQZ981004'
}
headers = {
'User-Agent': UserAgent().random
}
response = requests.post(url, data=form_data, headers=headers)
print(response.text)
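The response object also exposes the status code and a JSON helper, which later sections rely on; a small sketch against httpbin:
# Sketch: a GET with query parameters, plus the status-code and JSON helpers on the response.
import requests
from fake_useragent import UserAgent
response = requests.get('http://httpbin.org/get', params={'wd': '重庆文理学院'}, headers={'User-Agent': UserAgent().chrome})
print(response.status_code)  # 200 on success
print(response.url)  # final URL with the encoded query string
print(response.json()['args'])  # httpbin echoes the query parameters back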
8. Using re (regular expressions)
Memorize the commonly used patterns.
import re
str = 'I love you6.6 forever'
print('-------match()从字符串的起始位置开始匹配---------')
m1 = re.match(r'I', str)
m2 = re.match(r'\w', str)
m3 = re.match(r'.', str)
m4 = re.match(r'\D', str)
m5 = re.match(r'\S', str)
m6 = re.match(r'i', str, re.I)
print(m6.group())
print('-------search()扫描整个字符串并返回第一个成功的匹配---------')
s1 = re.search(r'love', str)
s2 = re.search(r'l\w+', str)
s3 = re.search(r'y\w+.\d', str)
print(s3.group())
print('-------findall()查找全部---------')
f1 = re.findall(r'o', str)
print(f1)
print('--------练习---------')
str1 = '<div><a href="http://www.python.com">python官网</a></div>'
t1 = re.findall(r'p\w+[\u4e00-\u9fa5]', str1)
t2 = re.findall(r'<a href="http://www.python.com">(.+)</a>', str1)
t3 = re.findall(r'<a href="(.+)">', str1)
print(t3)
print('---------sub() 替换字符串-------')
su1 = re.sub(r'<div>(.+)</div>', r'<span>\1</span>', str1)
print(su1)
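When a pattern is reused many times it is worth compiling it once; `re.compile` also makes named groups easy to read. A sketch on the same `str1`:
# Sketch: compile the pattern once and extract both the href and the link text via named groups.
import re
str1 = '<div><a href="http://www.python.com">python官网</a></div>'
link_re = re.compile(r'<a href="(?P<href>.+?)">(?P<text>.+?)</a>')
m = link_re.search(str1)
if m:
    print(m.group('href'))  # http://www.python.com
    print(m.group('text'))  # python官网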
9. Using re to scrape the blog titles on this site's home page
import requests
from fake_useragent import UserAgent
import re
url = 'http://www.zengqiang.club/'
headers = {
'User-Agent': UserAgent().random
}
response = requests.get(url, headers=headers)
# print(response.text)
info = response.text
result = re.findall(r'<a href="/blog/\d+" target="_blank" class="m-black m-text-thin">(.+)</a>', info)
print(result)
10. Using bs4
Makes it easy to extract the parts of the HTML we need.
from bs4 import BeautifulSoup
from bs4.element import Comment
# 这里需要安装lxml模块(国内镜像源见文档开头)
# pip install lxml -i https://pypi.tuna.tsinghua.edu.cn/simple/
str = '''
<title>尚学堂</title>
<div class='info' float='left'>Welcome to SXT</div>
<div class='info' float='right'>
<span>Good Good Study</span>
<a href='www.bjsxt.cn'></a>
<strong><!--没用--></strong>
</div>
'''
soup = BeautifulSoup(str, 'lxml')
print(soup.title)
print(soup.div)
print(soup.div.attrs)
print(soup.div.get('class'))
print(soup.div.get('float'))
print(soup.a['href'])
print(soup.div.string)
print(soup.div.text)
print(soup.strong.string)
print(type(soup.strong.string))
if type(soup.strong.string) == Comment:
print(soup.strong.string)
print(soup.strong.prettify())
else:
print(soup.strong.text)
str1 = '''
<title id="title">尚学堂</title>
<div class='info' id="info" float='left'>Welcome to SXT</div>
<div class='info' float='right'>
<span>Good Good Study</span>
<a href='www.bjsxt.cn'></a>
<strong><!--没用--></strong>
</div>
'''
print('------------find_all()-------------')
soup1 = BeautifulSoup(str1, 'lxml')
print(soup1.find_all('title'))
print(soup1.find_all(id='title'))
print(soup1.find_all(class_='info')) # class是关键字
print(soup1.find_all(attrs={'float': 'left'}))
print('------------select() css选择器-------------')
print(soup1.select('title'))
print(soup1.select('#title'))
print(soup1.select('.info'))
print(soup1.select('div > span')) # < 两边要有空格
print(soup1.select('div span'))
print(soup1.select('div'))
print(soup1.select('div')[1])
print(soup1.select('div')[1].select('span'))
print(soup1.select('title')[0].text)
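`find()` works like `find_all()` but returns only the first matching tag (or None), which saves the `[0]` indexing; a small sketch on the same kind of markup:
# Sketch: find() returns the first match directly, or None when nothing matches.
from bs4 import BeautifulSoup
str1 = '''
<div class='info' id="info" float='left'>Welcome to SXT</div>
<div class='info' float='right'><span>Good Good Study</span></div>
'''
soup = BeautifulSoup(str1, 'lxml')
print(soup.find('div', class_='info').text)  # Welcome to SXT
print(soup.find('table') is None)  # True: no <table> in this snippet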
11. Using XPath
XPath is used to extract the content we want from HTML.
Scrape book titles and authors from Qidian's monthly-ticket ranking.
from lxml import html
import requests
from fake_useragent import UserAgent
url = "https://www.qidian.com/rank/yuepiao?chn=21"
headers = {
    'User-Agent': UserAgent().random
}
response = requests.get(url, headers=headers)
etree = html.etree
e = etree.HTML(response.text)
names = e.xpath('//h4/a/text()')
authors = e.xpath('//p[@class="author"]/a[1]/text()')
# for num in range(len(names)):
# print(names[num], ":", authors[num])
for name, author in zip(names, authors):
print(name, ":", author)
# print(names)
# print(authors)
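The `html.etree` indirection is mostly a workaround for IDE warnings about lxml's C-extension import; importing `etree` from lxml directly works just as well. A sketch of the same scrape (same XPath expressions, assumed to still match Qidian's markup):
# Sketch: the same Qidian scrape, importing etree straight from lxml.
import requests
from lxml import etree
from fake_useragent import UserAgent
url = 'https://www.qidian.com/rank/yuepiao?chn=21'
response = requests.get(url, headers={'User-Agent': UserAgent().random})
e = etree.HTML(response.text)
names = e.xpath('//h4/a/text()')
authors = e.xpath('//p[@class="author"]/a[1]/text()')
for name, author in zip(names, authors):
    print(name, ':', author)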
12. Using pyquery
pyquery is used to extract the content we want from HTML.
Scrape proxy IP data from xicidaili.com.
from pyquery import PyQuery as pq
import requests
from fake_useragent import UserAgent
url = 'https://www.xicidaili.com/nn/'
headers = {
'User-Agent': UserAgent().chrome
}
response = requests.get(url, headers=headers)
doc = pq(response.text)
strs = doc('#ip_list tr')
for num in range(1, len(strs)):
ip = strs.eq(num).find('td').eq(1).text()
port = strs.eq(num).find('td').eq(2).text()
type = strs.eq(num).find('td').eq(5).text()
print(ip, ":", port, "----", type)
13. Using json
Mainly about converting between JSON objects and strings.
import json
str = '{"name":"我的小世界"}'
print(type(str))
# 将字符串转为json对象
obj = json.loads(str)
print(type(obj), ":", obj)
# 将json对象转为字符串
str1 = json.dumps(obj, ensure_ascii=False)
print(type(str1), ":", str1)
# 保存json到文件
json.dump(obj, open('json.txt', 'w', encoding='utf-8'), ensure_ascii=False)
# 从文件中提取数据
str2 = json.load(open('json.txt', encoding='utf-8'))
print(str2)
14. Using jsonpath
Extract the parts we need from JSON data.
from jsonpath import jsonpath
import requests
from fake_useragent import UserAgent
import json
# json在线解析:https://www.json.cn/
url = 'https://www.lagou.com/lbs/getAllCitySearchLabels.json'
headers = {
'User-Agent': UserAgent().chrome
}
response = requests.get(url, headers=headers)
# 两种方式将response转换为json对象
city_names = jsonpath(json.loads(response.text), '$..name')
city_codes = jsonpath(response.json(), '$..code')
for city_name, city_code in zip(city_names, city_codes):
print(city_name, ":", city_code)
15. Using multiple threads
Multithreading is mainly about improving crawl throughput.
Scrape jokes from duanziwang.com (written in an object-oriented style).
from threading import Thread
from fake_useragent import UserAgent
import requests
from lxml import html
from queue import Queue
# 爬取网页类
class Spider_html(Thread):
def __init__(self, url_queue, html_queue):
Thread.__init__(self)
self.url_queue = url_queue
self.html_queue = html_queue
def run(self):
headers = {
'User-Agent': UserAgent().random
}
while self.url_queue.empty() == False:
url = self.url_queue.get()
response = requests.get(url, headers=headers)
if response.status_code == 200:
self.html_queue.put(response.text)
# 解析类
class ParseInfo(Thread):
def __init__(self, html_queue):
Thread.__init__(self)
self.html_queue = html_queue
def run(self):
etree = html.etree
while self.html_queue.empty() == False:
e = etree.HTML(self.html_queue.get())
contents = e.xpath('//div[@class="post-content"]/p/text()')
# print(contents)
with open('duanzi.txt','a',encoding='utf-8')as f:
for content in contents:
info = content
# 控制一行一个段子方便查看
f.write(info+'\n')
if __name__ == '__main__':
# 存储url
url_queue = Queue()
# 存储内容html
html_queue = Queue()
base_url = 'https://duanziwang.com/category/%E7%BB%8F%E5%85%B8%E6%AE%B5%E5%AD%90/{}'
for i in range(1, 11):
new_url = base_url.format(i)
url_queue.put(new_url)
# print(new_url)
# 爬取网页
spider_html_list = []
# 开启三个线程
for i in range(0, 3):
spider1 = Spider_html(url_queue, html_queue)
spider_html_list.append(spider1)
spider1.start()
for spider_html in spider_html_list:
spider_html.join()
# 解析网页,获取需要的内容
parse_list = []
for i in range(0, 3):
parse = ParseInfo(html_queue)
parse_list.append(parse)
parse.start()
for parse in parse_list:
parse.join()
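One caveat: with three ParseInfo threads appending to duanzi.txt at once, writes can interleave. Guarding the file write with a shared `threading.Lock` keeps each record intact; a self-contained sketch (the class and variable names here are illustrative, not part of the original code):
# Sketch: share one Lock between worker threads so writes to the same file do not interleave.
from threading import Thread, Lock
from queue import Queue
write_lock = Lock()  # hypothetical module-level lock shared by every writer thread
class SafeWriter(Thread):  # hypothetical class standing in for ParseInfo
    def __init__(self, line_queue):
        Thread.__init__(self)
        self.line_queue = line_queue
    def run(self):
        while not self.line_queue.empty():
            line = self.line_queue.get()
            with write_lock:  # only one thread touches the file at a time
                with open('duanzi.txt', 'a', encoding='utf-8') as f:
                    f.write(line + '\n')
if __name__ == '__main__':
    q = Queue()
    for text in ['第一条', '第二条', '第三条']:
        q.put(text)
    writers = [SafeWriter(q) for _ in range(3)]
    for w in writers:
        w.start()
    for w in writers:
        w.join()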
16. Using Yundama (cloud captcha solving)
Yundama is mainly used to solve the captcha shown at login.
It requires registering an account and paying a small fee.
http://www.yundama.com/
# 打码工具类
import http.client, mimetypes, urllib, json, time, requests
from PIL import Image
######################################################################
class YDMHttp:
apiurl = 'http://api.yundama.com/api.php'
username = ''
password = ''
appid = ''
appkey = ''
def __init__(self, username, password, appid, appkey):
self.username = username
self.password = password
self.appid = str(appid)
self.appkey = appkey
def request(self, fields, files=[]):
response = self.post_url(self.apiurl, fields, files)
response = json.loads(response)
return response
def balance(self):
data = {'method': 'balance', 'username': self.username, 'password': self.password, 'appid': self.appid,
'appkey': self.appkey}
response = self.request(data)
if (response):
if (response['ret'] and response['ret'] < 0):
return response['ret']
else:
return response['balance']
else:
return -9001
def login(self):
data = {'method': 'login', 'username': self.username, 'password': self.password, 'appid': self.appid,
'appkey': self.appkey}
response = self.request(data)
if (response):
if (response['ret'] and response['ret'] < 0):
return response['ret']
else:
return response['uid']
else:
return -9001
def upload(self, filename, codetype, timeout):
data = {'method': 'upload', 'username': self.username, 'password': self.password, 'appid': self.appid,
'appkey': self.appkey, 'codetype': str(codetype), 'timeout': str(timeout)}
file = {'file': filename}
response = self.request(data, file)
if (response):
if (response['ret'] and response['ret'] < 0):
return response['ret']
else:
return response['cid']
else:
return -9001
def result(self, cid):
data = {'method': 'result', 'username': self.username, 'password': self.password, 'appid': self.appid,
'appkey': self.appkey, 'cid': str(cid)}
response = self.request(data)
return response and response['text'] or ''
def decode(self, filename, codetype, timeout):
cid = self.upload(filename, codetype, timeout)
if (cid > 0):
for i in range(0, timeout):
result = self.result(cid)
if (result != ''):
return cid, result
else:
time.sleep(1)
return -3003, ''
else:
return cid, ''
def report(self, cid):
data = {'method': 'report', 'username': self.username, 'password': self.password, 'appid': self.appid,
'appkey': self.appkey, 'cid': str(cid), 'flag': '0'}
response = self.request(data)
if (response):
return response['ret']
else:
return -9001
def post_url(self, url, fields, files=[]):
for key in files:
files[key] = open(files[key], 'rb');
res = requests.post(url, files=files, data=fields)
return res.text
######################################################################
def get_code(filename):
# 用户名
username = 'zq666_yh'
# 密码
password = 'ZQZ981004'
# 软件ID,开发者分成必要参数。登录开发者后台【我的软件】获得!
appid = 10039
# 软件密钥,开发者分成必要参数。登录开发者后台【我的软件】获得!
appkey = 'f6248169a3f9857b57e778c52d9f5de2'
# 图片文件
filename = filename
# 验证码类型,# 例:1004表示4位字母数字,不同类型收费不同。请准确填写,否则影响识别率。在此查询所有类型 http://www.yundama.com/price.html
codetype = 1005
# 超时时间,秒
timeout = 60
# 检查
if (username == 'username'):
print('请设置好相关参数再测试')
else:
# 初始化
yundama = YDMHttp(username, password, appid, appkey)
# 登陆云打码
uid = yundama.login();
# print('uid: %s' % uid)
# 查询余额
balance = yundama.balance();
# print('balance: %s' % balance)
# 开始识别,图片路径,验证码类型ID,超时时间(秒),识别结果
cid, result = yundama.decode(filename, codetype, timeout);
# print('cid: %s, result: %s' % (cid, result))
return result
######################################################################
if __name__ == '__main__':
img = 'yzm1.jpg'
code = get_code(img)
print(code)
# 使用
import requests
from fake_useragent import UserAgent
from 爬虫学习.ydm_util import get_code
def get_image():
img_url = 'http://www.yundama.com/index/captcha'
response = session.get(img_url, headers=headers)
with open('yzm.jpg', 'wb')as f:
f.write(response.content)
code = get_code('yzm.jpg')
print(code)
return code
def do_login(code):
login_url = 'http://www.yundama.com/index/login?'
f_data = {
'username': 'zq666_yh',
'password': 'ZQZ981004',
'utype': '1',
'vcode': code
}
response = session.get(login_url, headers=headers, params=f_data)
print(response.text)
# 三个操作必须在同一个会话下进行
if __name__ == '__main__':
session = requests.Session()
index_url = 'http://www.yundama.com/'
headers = {
'User-Agent': UserAgent().random
}
response = session.get(index_url, headers=headers)
code = get_image()
do_login(code)
17. Using selenium
selenium is mainly used to drive a real browser and automate actions in it.
You first need to place a browser driver such as chromedriver.exe in the Scripts directory of your Python installation (or anywhere on PATH).
from selenium import webdriver
chrome = webdriver.Chrome()
chrome.get('http://www.zengqiang.club')
# chrome.save_screenshot('zqclub.jpg')
# html = chrome.page_source
# print(html)
id_content = chrome.find_element_by_id('run_time').text
print(id_content)
chrome.find_element_by_name('query').send_keys('爬虫')
chrome.find_element_by_class_name('search').click()
chrome.save_screenshot('爬虫.jpg')
print(chrome.current_url)
# 获取当前页面Cookie
print(chrome.get_cookies())
# 关闭当前页
chrome.close()
chrome.quit()
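Rather than the fixed `sleep()` calls used in the later exercises, selenium can wait explicitly until an element is present; a minimal sketch with `WebDriverWait`, reusing the `run_time` element id from the example above:
# Sketch: wait up to 10 seconds for the element to appear instead of sleeping blindly.
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
chrome = webdriver.Chrome()
chrome.get('http://www.zengqiang.club')
element = WebDriverWait(chrome, 10).until(EC.presence_of_element_located((By.ID, 'run_time')))
print(element.text)
chrome.quit()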
18. Exercise: scraping 360kan movie info
Practice the four parsing approaches: xpath, re, bs4, and pyquery.
import requests
from fake_useragent import UserAgent
from lxml import html
from random import randint
from time import sleep
from bs4 import BeautifulSoup
import re
from pyquery import PyQuery
# 获取页面的html代码
def get_html(url):
headers = {
'User-Agent': UserAgent().random
}
# 暂停2-6秒,使更像人的操作
sleep(randint(2, 6))
response = requests.get(url, headers=headers)
response.encoding = 'utf-8'
if response.status_code == 200:
return response.text
else:
return None
# 解析首页(得到电影信息的url)
def parse_index(index_html):
# ------pq--------
doc = PyQuery(index_html)
moive_a = doc('ul.list.g-clear a')
moive_urls = []
for a in moive_a:
moive_urls.append(a.attrib['href'])
# ------re--------
# moive_urls = re.findall(r'<a class="js-tongjic" href="(.+)">', index_html)
# ------bs4--------
# soup = BeautifulSoup(index_html, 'lxml')
# moive_a = soup.select('ul.list.g-clear a')
# # print(moive_a)
# moive_urls = []
# for a in moive_a:
# moive_urls.append(a['href'])
# ------xpath--------
# etree = html.etree
# e = etree.HTML(index_html)
# moive_urls = e.xpath('//ul[@class="list g-clear"]//a/@href')
return ['https://www.360kan.com{}'.format(url) for url in moive_urls]
# 解析电影信息,得到需要的内容
def parse_info(movie_html):
# ------pq--------
doc = PyQuery(movie_html)
name = doc('h1').text()
types = doc('p.item > a.cat').text()
actors = doc('p.item.item-actor > a').text()
# ------re--------
# name = re.findall(r'<h1>(.+)</h1>', movie_html)[0]
# types = re.findall(r'class="cat.+href.+">(.+)</', movie_html)
# actors = re.findall(r'<a class="name" href=".+">(.+)</a>', movie_html)
# ------bs4--------
# soup = BeautifulSoup(movie_html, 'lxml')
# name = soup.select('h1')[0].text
# type = soup.select('p.item')[0].select('a')
# types = []
# for t in type:
# types.append(t.text)
# actor = soup.select('p.item.item-actor')[0].select('a')
# actors = []
# for a in actor:
# actors.append(a.text)
# ------xpath--------
# etree = html.etree
# e = etree.HTML(movie_html)
# name = e.xpath('//h1/text()')[0]
# types = e.xpath('//p[@class="item"][1]/a/text()')
# actors = e.xpath('//p[@class="item item-actor"]/a/text()')
return {
'name': name,
'types': types,
'actors': actors
}
# 主方法,遍历电影url,打印爬取的数据
def main():
index_url = 'https://www.360kan.com/dianying/list.php?year=all&area=all&act=all&cat=all'
index_html = get_html(index_url)
moive_urls = parse_index(index_html)
print(moive_urls)
for url in moive_urls:
moive_html = get_html(url)
moive = parse_info(moive_html)
print(moive)
if __name__ == '__main__':
main()
19. Exercise: scraping the streamers currently live on Huya
Scraped with selenium.
from selenium import webdriver
from time import sleep
driver = webdriver.Chrome()
url = 'https://www.huya.com/g/2356'
driver.get(url)
num = 1
while True:
print('第', str(num), '页------------')
num += 1
sleep(5)
html = driver.page_source
titles = driver.find_elements_by_xpath('//a[@class="title new-clickstat j_live-card"]')
anthors = driver.find_elements_by_xpath('//i[@class="nick"]')
audiences = driver.find_elements_by_xpath('//i[@class="js-num"]')
for title, anthor, audience in zip(titles, anthors, audiences):
print(title.text, '---', anthor.text, '---', audience.text)
if html.find('laypage_next') != -1:
driver.find_element_by_xpath('//a[@class="laypage_next"]').click()
else:
break
20. Scrolling with selenium
Some pages only render all of their content after scrolling.
Scrape product info from JD search results.
from selenium import webdriver
from time import sleep
url = 'https://search.jd.com/Search?keyword=iqoo&enc=utf-8&pvid=1c71f2514c724500b5c4e7f4dc58c1f2'
driver = webdriver.Chrome()
driver.get(url)
js = 'document.documentElement.scrollTop=100000'
driver.execute_script(js)
sleep(3)
html = driver.page_source
names = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//a/em')
prices = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//strong/i')
print(len(names))
for name, price in zip(names, prices):
print(name.text, ':', price.text)
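Jumping straight to the bottom can leave lazily loaded items unrendered; scrolling down in several steps gives the page time to load them. A sketch of the same JD page:
# Sketch: scroll down in steps so lazily loaded products have time to render.
from time import sleep
from selenium import webdriver
driver = webdriver.Chrome()
driver.get('https://search.jd.com/Search?keyword=iqoo&enc=utf-8')
for step in range(1, 11):
    driver.execute_script('window.scrollTo(0, document.body.scrollHeight * arguments[0] / 10);', step)  # 1/10, 2/10, ... of the page height
    sleep(1)
names = driver.find_elements_by_xpath('//div[@class="gl-i-wrap"]//a/em')
print(len(names))
driver.quit()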
21. Exercise: downloading every photo set on the Tuchong home page
import requests
from fake_useragent import UserAgent
from lxml import html
from selenium import webdriver
def get_group_urls():
driver = webdriver.Chrome()
index_url = 'https://tuchong.com/'
driver.get(index_url)
index_html = driver.page_source
# print(index_html)
etree = html.etree
e = etree.HTML(index_html)
group_urls = e.xpath('//div[@class="post-item"]/a[1]/@href')
return group_urls
def get_group_html(group_urls):
etree = html.etree
    headers = {'User-Agent': UserAgent().random}
group_num = 1
for url in group_urls:
group_name = 'group' + str(group_num)
group_num += 1
response = requests.get(url, headers=headers)
e = etree.HTML(response.text)
# print(response.text)
img_urls = e.xpath('//article[@class="post-content"]//img[@class="multi-photo-image"]/@src')
print(img_urls)
for img_url in img_urls:
img_name = img_url[img_url.rfind('/') + 1:]
save_img(img_url, group_name, img_name)
def save_img(img_url, group_name, img_name):
    headers = {'User-Agent': UserAgent().random}
    response = requests.get(img_url, headers=headers)
with open('img/' + group_name + '-' + img_name, 'wb') as f:
f.write(response.content)
def main():
group_urls = get_group_urls()
get_group_html(group_urls)
if __name__ == '__main__':
main()
22. Double-color-ball lottery example (saving data to a database)
Connect to the database and insert the scraped data.
import requests
from fake_useragent import UserAgent
from lxml import html
import pymysql
def get_html(url):
headers = {
'User-Agent': UserAgent().random
}
response = requests.get(url, headers=headers)
return response.text
def save_mysql(trs, date_time):
client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004', charset='utf8',
db='python')
print('数据库链接成功')
cursor = client.cursor()
sql = 'insert into double_ball values(0,%s,%s,%s)'
for tr, time in zip(trs, date_time):
# 提取红球
red_ball = '-'.join(tr.xpath('./td[@class="chartBall01"]/text()'))
# 提取蓝球
blue_ball = tr.xpath('./td[@class="chartBall02"]/text()')[0]
print("第 " + time + "期—红球是:" + red_ball + " 蓝球:" + blue_ball)
cursor.execute(sql, [time, red_ball, blue_ball])
client.commit()
cursor.close()
client.close()
print('保存数据完成')
def main():
url = 'https://datachart.500.com/ssq/'
html_ = get_html(url)
etree = html.etree
e = etree.HTML(html_)
data_time = e.xpath('//tbody[@id="tdata"]/tr/td[@align="center"]/text()')
trs = e.xpath('//tbody[@id="tdata"]/tr[not(@class)]')
save_mysql(trs, data_time)
if __name__ == '__main__':
main()
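The INSERT statement assumes a `double_ball` table already exists in the `python` database; a sketch of a schema that matches the three placeholders (the column names are my assumption, not taken from the original):
# Sketch: create the assumed table once before running the crawler.
import pymysql
client = pymysql.connect(host='localhost', port=3306, user='root', password='ZQZ981004', charset='utf8', db='python')
cursor = client.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS double_ball (
        id INT PRIMARY KEY AUTO_INCREMENT,  -- the leading 0 in the INSERT lets MySQL auto-fill this
        issue VARCHAR(20),                  -- draw number (期号)
        red_ball VARCHAR(40),               -- red balls joined with '-'
        blue_ball VARCHAR(10)               -- blue ball
    ) DEFAULT CHARSET=utf8
''')
client.commit()
cursor.close()
client.close()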
23. A more structured way to write a crawler (classes)
Split the crawler into separate classes, one per responsibility;
the result is more readable, reusable, and professional.
import requests
from fake_useragent import UserAgent
from lxml import etree
# url管理
class URLManager(object):
def __init__(self):
self.new_url = []
self.old_url = []
# 获取一个url
def get_new_url(self):
url = self.new_url.pop()
self.old_url.append(url)
return url
# 增加一个url
def add_new_url(self, url):
if url not in self.new_url and url and url not in self.old_url:
self.new_url.append(url)
# 增加多个url
def add_new_urls(self, urls):
for url in urls:
self.add_new_url(url)
# 判断是否还有可以爬取的url
def has_new_url(self):
return self.get_new_url_size() > 0
# 获取可以爬取的数量
def get_new_url_size(self):
return len(self.new_url)
# 获取已经爬取的数量
def get_old_url_size(self):
return len(self.old_url)
# 爬取
class Downloader:
def download(self, url):
response = requests.get(url, headers={"User-Agent": UserAgent().random})
if response.status_code == 200:
response.encoding = 'utf-8'
return response.text
else:
return None
# 解析
class Parser:
def parse(self, html):
e = etree.HTML(html)
datas = self.parse_info(e)
#datas = [span.xpath('string(.)') for span in e.xpath('//div[@class="content"]/span[1]')]
urls = self.parse_urls(e)
#urls = [ 'https://www.qiushibaike.com{}'.format(url) for url in e.xpath('//ul[@class="pagination"]/li/a/@href')]
return datas, urls
def parse_info(self, e):
spans = e.xpath('//div[@class="content"]/span[1]')
datas = []
for span in spans:
datas.append(span.xpath('string(.)'))
return datas
def parse_urls(self, e):
base_url = 'https://www.qiushibaike.com{}'
urls = []
for url in e.xpath('//ul[@class="pagination"]/li/a/@href'):
urls.append(base_url.format(url))
return urls
# 数据处理
class DataOutPut:
def save(self, datas):
with open('duanzi.txt', 'a', encoding='utf-8') as f:
for data in datas:
f.write(data)
# 调度
class DiaoDu:
def __init__(self):
self.downloader = Downloader()
self.url_manager = URLManager()
self.parser = Parser()
self.data_saver = DataOutPut()
def run(self, url):
self.url_manager.add_new_url(url)
while self.url_manager.has_new_url():
url = self.url_manager.get_new_url()
html = self.downloader.download(url)
data, urls = self.parser.parse(html)
self.data_saver.save(data)
self.url_manager.add_new_urls(urls)
if __name__ == '__main__':
diao_du = DiaoDu()
diao_du.run('https://www.qiushibaike.com/text/page/1/')