The setup is simple: the dataset is scattered across many file links on a single URL page, and downloading the files one by one by hand is slow, so Python can be used to automate the download.
Known issue: many of these datasets are hosted abroad, so downloads are heavily affected by network fluctuations. Ideally there would be retry-on-failure logic; that is not implemented here (a rough sketch is included at the end of this post).
The code is essentially from this author (thanks!); I only tweaked it slightly on top, adding exception handling.
'''
Download all dataset files linked on one HTML page.
'''
import requests
from bs4 import BeautifulSoup

archive_url = 'your_target_url'  # replace with your target page URL; relative hrefs are appended to it below
def get_target_links():
    r = requests.get(archive_url)
    soup = BeautifulSoup(r.content, 'html5lib')
    links = soup.findAll('a')
    file_links = []
    # one-liner equivalent:
    # file_links = [archive_url + link['href'] for link in links
    #               if link['href'].endswith(('atr', 'dat', 'hea'))]
    for link in links:
        try:
            if link['href'].endswith(('atr', 'dat', 'hea')):
                file_links.append(archive_url + link['href'])
        except KeyError:
            # some <a> tags have no href attribute; skip them
            print('keyerror, keep going!')
    for i in file_links:
        print(i, '\n')
    return file_links
def download_target_series(file_links):
    failed_list = []
    for link in file_links:
        file_name = link.split('/')[-1]
        file_name = 'your_local_folder' + file_name  # replace with your local save path
        print("Downloading file:%s" % file_name)
        print(link)
        try:
            r = requests.get(link, stream=True)
        except Exception:
            failed_list.append(link.split('/')[-1])  # record the base name of the failed link
            print('download failed, moving on to the next one\n')
            continue  # skip to the next link instead of writing a broken file
        # download started
        with open(file_name, 'wb') as f:
            for chunk in r.iter_content(chunk_size=1024 * 1024):
                if chunk:
                    f.write(chunk)
        print("%s downloaded!\n" % file_name)
    print("All files downloaded!")
    print(failed_list)  # record which files failed to download
    return
if __name__ == "__main__":
    target_links = get_target_links()
    download_target_series(target_links)
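
As noted above, retry-on-failure logic is still missing. Below is a minimal sketch of how it could look, assuming we only want to re-issue the requests.get call a few times with a short pause between attempts; the helper name download_with_retry and the max_retries, wait_seconds, and timeout values are placeholders of my own, not part of the original code.

import time

def download_with_retry(link, max_retries=3, wait_seconds=5):
    """Try to fetch one link up to max_retries times before giving up."""
    for attempt in range(1, max_retries + 1):
        try:
            r = requests.get(link, stream=True, timeout=30)
            r.raise_for_status()  # treat HTTP error codes as failures too
            return r
        except Exception as e:
            print('attempt %d/%d failed for %s: %s' % (attempt, max_retries, link, e))
            time.sleep(wait_seconds)  # brief pause before retrying
    return None  # caller can record the link in failed_list

Inside download_target_series, the try/except around requests.get could then be replaced by r = download_with_retry(link), appending to failed_list only when None comes back.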