本文目录:
- 1.目标
- 2.实现
- 参考资料:
1.目标
本文目标是自动解析头条的视频新闻,通过第三方解析网站得到其真实的下载地址并自动下载到本地
视频的播放地址是这样的:
所以我们实际上只要从视频列表页面解析得到视频列表的 /item/视频id编号,然后通过 selenium 驱动浏览器自动输入到上面的解析网站,获取解析结果即可。
OK,思路有了,下面开搞~
2.实现
获取到一系列的头条视频内部地址后,通过浏览器模拟输入内部地址解析得到真实的下载地址。
xxxxxxxxxx
def getRealPalyUrl(self, media_url, id, title, author):
    """Resolve a Toutiao video page to its real download URL and fetch it.

    Types the full video address into the third-party parser page that is
    currently open in ``self.browser``, clicks the parse button, waits for
    the result link to appear, then hands the link to
    ``updateVideoInfo2DB`` and ``DownloadFile.download``.

    Args:
        media_url: Site-internal video path such as
            ``/item/6606468202769678855/``; it is appended to
            ``http://www.toutiao.com``.
        id: Record identifier passed through to ``updateVideoInfo2DB``.
        title: Video title, passed to ``DownloadFile.download``.
        author: Author name, passed to ``DownloadFile.download``.

    Returns:
        None. Errors raised while waiting for or parsing the result are
        printed and swallowed (best-effort, as in the original).
    """
    # Locate the address input box and type the full video URL into it.
    input_box = self.browser.find_element(
        By.XPATH,
        '//div[contains(@class, "input-group")]/input[contains(@placeholder, "请输入视频地址")][1]')
    # send_keys() appends to the existing value, so drop any text left over
    # from a previous call on the same page before typing the new address.
    input_box.clear()
    input_box.send_keys('http://www.toutiao.com' + media_url)

    parse_btn = self.browser.find_element(
        By.XPATH,
        '//div[contains(@class, "input-group")]/div/button[contains(@class, "btn")][@type="button"][1]')
    parse_btn.click()

    try:
        # Block (up to 10 s) until the parser has rendered a result link.
        WebDriverWait(self.browser, 10).until(
            EC.presence_of_element_located(
                (By.XPATH, '//div[@class="thumbnail"]/div[@class="caption"]/p[1]/a'))
        )
        page_etree = etree.HTML(self.browser.page_source)
        links = page_etree.xpath(
            '//div[@class="thumbnail"]/div[@class="caption"]/p[1]/a[last()]')
        if not links:
            return
        video_a = links[0]

        # Download address plus the link text with the "视频下载" label
        # removed (per the original comment, a quality description).
        download_url = video_a.xpath('./@href')[0]
        desc = ''
        video_desc = video_a.xpath('./text()')
        if video_desc and '视频下载' in video_desc[0]:
            desc = str(video_desc[0]).replace('视频下载', '')

        # Save to the database.
        updateVideoInfo2DB(id, download_url, desc)
        # Download to local disk.
        dl = DownloadFile()
        dl.download(download_url, title, author)
    except Exception as ex:
        print(ex)
函数 getRealPalyUrl(self, media_url, id, title, author) 的 media_url 参数就是前面说的头条内部视频地址,比如:/item/6606468202769678855/。
# Excerpt from getRealPalyUrl: find the "请输入视频地址" input box on the
# parser page, then type the full video URL into it.
input_els = self.browser.find_element_by_xpath('//div[contains(@class, "input-group")]/input[contains(@placeholder, "请输入视频地址")][1]')
input_els.send_keys('http://www.toutiao.com' + media_url)
上面第一行是为了找到“请输入视频地址”这个输入框,第二行是模拟键盘输入完整的地址内容。
# Excerpt from getRealPalyUrl: click the parse button, then wait (at most
# 10 seconds) until a result link is present in the page.
parse_btn = self.browser.find_element_by_xpath('//div[contains(@class, "input-group")]/div/button[contains(@class, "btn")][@type="button"][1]')
parse_btn.click()
videoInfo = WebDriverWait(self.browser, 10).until(
EC.presence_of_element_located((By.XPATH, '//div[@class="thumbnail"]/div[@class="caption"]/p[1]/a'))
)
接下来这个代码是在点击解析视频按钮之后等待页面出现下载地址再进行下一步,这里是最多等待10s,一般情况下都足够了。后面就是解析得到具体的downloadurl了,然后通过这个真实的url下载到本地。其中用到的下载类DownloadFile的代码如下:
#!/usr/bin/python3
# -*- coding:utf-8 -*-
import os
import sys
import time
from urllib import request
class DownloadFile(object):
    """Download a URL into a per-author folder, showing a console progress bar.

    Files are stored as ``<base_dir>/<author>/<title>.mp4``. ``base_dir``
    defaults to the original hard-coded ``H:\\py\\downloads`` location.
    """

    # Characters that are not allowed in Windows file names; each one is
    # replaced with "_" so an arbitrary video title cannot break the path.
    _INVALID_NAME_CHARS = str.maketrans({ch: '_' for ch in '\\/:*?"<>|'})

    def __init__(self, base_dir='H:\\py\\downloads'):
        """Remember the target root folder and initialise the speed timer."""
        self.base_dir = base_dir
        self.start_time = time.time()

    def __safe_name(self, name):
        """Return *name* with path-hostile characters replaced by ``_``."""
        return str(name).translate(self._INVALID_NAME_CHARS)

    def __Schedule(self, blocknum, blocksize, totalsize):
        """Progress callback (``reporthook``) for ``urlretrieve``.

        Args:
            blocknum: Number of blocks transferred so far.
            blocksize: Size of one block in bytes.
            totalsize: Size of the remote file in bytes; ``urlretrieve``
                reports a non-positive value when the size is unknown.
        """
        recv_size = blocknum * blocksize
        elapsed = time.time() - self.start_time
        # The first callback can fire before the clock has advanced.
        speed = recv_size / elapsed if elapsed > 0 else 0.0
        speed_str = " Speed: %s" % self.__format_size(speed)

        if totalsize > 0:
            # The last block is usually partial, so blocknum * blocksize can
            # overshoot the real size; clamp to 100%.
            fraction = min(recv_size / totalsize, 1.0)
        else:
            # Unknown size: no meaningful percentage can be shown.
            fraction = 0.0

        # Render the progress bar.
        percent_str = "%.2f%%" % (fraction * 100)
        filled = round(fraction * 50)
        bar = ('█' * filled).ljust(50, '-')
        out = sys.stdout
        out.write(percent_str.ljust(8, ' ') + '█' + bar + '█' + speed_str)
        out.flush()
        # Return to the line start so the next update overwrites this one.
        out.write('\r')

    def __format_size(self, num_bytes):
        """Format a byte count as a string in K, M or G with 3 decimals.

        Returns ``"Error"`` (after printing a message) when *num_bytes*
        cannot be converted to a float.
        """
        try:
            kb = float(num_bytes) / 1024
        except (TypeError, ValueError, OverflowError):
            print("传入的字节格式不对")
            return "Error"
        if kb < 1024:
            return "%.3fK" % kb
        mb = kb / 1024
        if mb < 1024:
            return "%.3fM" % mb
        return "%.3fG" % (mb / 1024)

    def __downloadFile(self, url, folder, fileName):
        """Fetch *url* into ``folder/fileName``, reporting progress."""
        print("正在下载: %s" % fileName)
        print(url)
        # Measure speed from the start of this transfer, not from the time
        # the object was constructed.
        self.start_time = time.time()
        request.urlretrieve(url, os.path.join(folder, fileName), self.__Schedule)

    def download(self, url, title, author):
        """Download *url* to ``<base_dir>/<author>/<title>.mp4``.

        The author folder is created when missing and the download then
        proceeds in the same call. Any error is printed and swallowed so a
        single failed video does not abort the caller's loop.
        """
        folder = os.path.join(self.base_dir, self.__safe_name(author))
        try:
            os.makedirs(folder, exist_ok=True)
            self.__downloadFile(url, folder, self.__safe_name(title) + '.mp4')
        except Exception as ex:
            print(ex)