Straight to the code. Below is the sanitized Python scraper I use to pull article data into my blog:
#!/usr/bin/env python
# -*- coding:utf-8 -*-
"""
@author: Andy
@file: xxx.py
@time: 05:50 PM
@desc: scrape article data into the blog
"""
import os
import re
import urllib.error
import urllib.request
from urllib.parse import urlparse

import requests
from bs4 import BeautifulSoup
headers = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
def get_content():
    """Yield the attachment links (url, name) found on a Q&A page."""
    url = 'http://ask.xxxx.com/question/xxxx'  # target page
    # Strip the paperclip icon markup so it does not pollute the link text.
    response = requests.get(url, headers=headers).text.replace('<i class="fa fa-paperclip"></i>', '')
    soup = BeautifulSoup(response, 'lxml')
    pattern = re.compile(r'<a\shref="(http://ask.apelearn.com/file.*?)".*?>(.*?)</a>', re.S)
    for item in soup.find_all('a'):
        for url, name in re.findall(pattern, str(item)):
            yield {
                'url': url,
                'name': name
            }
def mkdir(path):
    """Create a directory if it does not exist yet; report either way."""
    # Strip surrounding whitespace and any trailing backslash.
    path = path.strip().rstrip("\\")
    if not os.path.exists(path):
        os.makedirs(path)
        print(path + ' created')
        return True
    else:
        print(path + ' already exists')
        return False
def getUrl(html):
    """Collect the <img> and <script> asset URLs referenced by a page."""
    patternjs = r'<script src="(.*?)"'
    patternimg = r'<img src="(.*?)"'
    href = re.compile(patternimg, re.S).findall(html)
    href += re.compile(patternjs, re.S).findall(html)
    return href

def getCssUrl(html):
    """Collect the <link href> stylesheet URLs referenced by a page."""
    patterncss = r'<link href="(.*?)"'
    return re.compile(patterncss, re.S).findall(html)
# Download one page, download its assets, then rewrite its links for the mirror.
def download_html(root_path, url):
    parsed = urlparse(url)
    file_path = parsed.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # Saved pages always get a .html extension.
    if file_suffix != '.html':
        file_name_real = file_name + '.html'
    else:
        file_name_real = file_name
    file_path_real = file_path.replace(file_name, '')
    file_path_reals = file_path_real.replace('/', "\\")
    all_file_path_real = root_path + file_path_reals + file_name_real
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Fetch every img/js asset, then every stylesheet.
    for item in getUrl(res.text):
        download_commonimgjs(root_path, item)
    for item in getCssUrl(res.text):
        download_css(root_path, item)
    # Point absolute links at the mirror host before saving.
    new_text = res.text.replace('https://www.xxxxxx.com', 'http://www.xxxxx.com')
    new_texts = new_text.replace('xxxxxx.com', '3cinno.shanhubei.com')
    with open(all_file_path_real, "w+", encoding="utf-8") as html_file:
        html_file.write(new_texts)
def download_commonimgjs(root_path, url):
    """Download an image or script asset, skipping files already on disk."""
    # Root-relative URLs need the site host prepended.
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    parsed = urlparse(imgurl)
    file_path = parsed.path
    file_name = os.path.basename(file_path)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    # Make sure the target directory exists.
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('download failed: ' + imgurl)
def download_img(root_path, url):
    """Download a single image into the mirrored directory tree."""
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    parsed = urlparse(imgurl)
    file_path = parsed.path
    file_name = os.path.basename(file_path)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Make sure the target directory exists.
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent', 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)
def download_js(root_path, url):
    """Download a single script file into the mirrored directory tree."""
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    parsed = urlparse(imgurl)
    file_path = parsed.path
    file_name = os.path.basename(file_path)
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    # Make sure the target directory exists.
    mkdir(newmkpath)
    opener = urllib.request.build_opener()
    opener.addheaders = [('User-agent',
                          'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
    urllib.request.install_opener(opener)
    urllib.request.urlretrieve(imgurl, newmkpath + file_name)
def download_css(root_path, url):
    """Download a stylesheet, skipping non-.css links and existing files."""
    if url[:1] == "/":
        imgurl = "https://www.xxxxxx.com" + url
    else:
        imgurl = url
    parsed = urlparse(imgurl)
    file_path = parsed.path
    file_name = os.path.basename(file_path)
    _, file_suffix = os.path.splitext(file_name)
    # <link href> also matches favicons and the like; only keep real stylesheets.
    if file_suffix != '.css':
        return
    match_url = file_path.replace(file_name, '')
    match_url_new = match_url.replace('/', "\\")
    newmkpath = root_path + match_url_new
    if os.path.isfile(newmkpath + file_name):
        return
    # Make sure the target directory exists.
    mkdir(newmkpath)
    try:
        opener = urllib.request.build_opener()
        opener.addheaders = [('User-agent',
                              'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36')]
        urllib.request.install_opener(opener)
        urllib.request.urlretrieve(imgurl, newmkpath + file_name)
    except urllib.error.HTTPError:
        print('download failed: ' + imgurl)
def get_xml():
    """Print every page URL listed in the sitemap."""
    url = 'https://www.xxxxxx.com/sitemap-1.xml'
    headers = {'User-Agent': 'Mozilla/5.0 (Windows; U; Windows NT 6.1; en-US; rv:1.9.1.6) Gecko/20091201 Firefox/3.5.6'}
    res = requests.get(url, headers=headers)
    res.encoding = "utf-8"
    # Write this regex to match your own article URL format; it may differ from mine.
    # [^<]+ runs up to the closing </loc> tag of each sitemap entry.
    r = re.compile(r'https://www\.xxxxxx\.com/[^<]+')
    for link in re.findall(r, res.text):
        print(link)
def main():
    # get_content()
    # url = r'https://www.xxxxxx.com/news/xxxx-proje-20711498'
    url = r'https://www.xxxxxx.com/uploads/20218080/logo202107221507387902092.png'
    # Root directory for the local mirror.
    root_path = "F:\\Project-cz\\shanhubei\\3cinno"
    # download_img(root_path, url)
    # htmlurl = r'https://www.xxxxxx.com/3d-clear-led-dmx-ball'
    # download_html(root_path, htmlurl)
    cssurl = r'https://www.xxxxxx.com/images/m184/black/style.css'
    # download_css(root_path, cssurl)
    # demourl = 'https://www.xxxxxx.com/Content/kcim/js/layim-public.js?t=20190404'
    # demo(demourl)
    get_xml()
def demo(url):
    """Show what urlparse and os.path extract from a URL."""
    parsed = urlparse(url)
    file_path = parsed.path
    print(parsed.scheme)
    print(parsed.hostname)
    print('file_path=' + file_path)
    file_name = os.path.basename(file_path)
    print('file_name=' + file_name)
    _, file_suffix = os.path.splitext(file_name)
    print('file_suffix=' + file_suffix)
if __name__ == '__main__':
    main()
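
To round things off, here is a minimal sketch of how the pieces could be wired into a full-site mirror. This is my addition rather than part of the original script: the mirror_site name, the reuse of the sitemap regex from get_xml(), and the one-second delay are all assumptions. Appended to the script above, it feeds every sitemap URL into download_html():

import time

def mirror_site(root_path):
    # Hypothetical glue code: fetch the sitemap with the shared headers,
    # extract each page URL, and mirror the page plus its assets.
    # Note that download_html() writes into root_path and expects any
    # nested subdirectory for the page to exist already.
    sitemap = requests.get('https://www.xxxxxx.com/sitemap-1.xml',
                           headers=headers).text
    for page_url in re.findall(r'https://www\.xxxxxx\.com/[^<]+', sitemap):
        download_html(root_path, page_url)  # defined above
        time.sleep(1)  # assumed delay, to go easy on the origin server

mirror_site("F:\\Project-cz\\shanhubei\\3cinno")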