I've just learned the basics of web scraping, and this is a practice project:
crawling Kobe-related news from Hupu's Lakers section (he retired a long time ago, but the occasional story about him still comes up), and downloading the article text and images.
It uses a little XPath, a little BeautifulSoup, and a little regex. The code is fairly rough.
```python
# Scrape Kobe-related news from the Hupu Lakers section
# and download the article text and images.
import requests
from lxml import etree
import re
import csv
from bs4 import BeautifulSoup
def csv_writer(item):
    # Append one [title, content] row to kobe.csv.
    with open('kobe.csv', 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.writer(csvfile)
        try:
            writer.writerow(item)
        except Exception as e:
            print('write error:', e)
        print('crawling:', item[0])
def spider(url_):
    # Fetch a page and return its source as an lxml element tree.
    response = requests.get(url_, headers=headers)
    return etree.HTML(response.text)
def parse_detail(detail_url):
    # Pull the article body text out of a news detail page.
    page = requests.get(detail_url, headers=headers)
    soup = BeautifulSoup(page.text, 'lxml')
    # "artical-main-content" is the (misspelled) class name Hupu uses.
    body = soup.find_all(class_="artical-main-content")[0]
    tips = body.find_all('p')
    main_content = ""
    for tip in tips:
        # Keep only plain-text paragraphs, skipping images and links.
        if "img" not in str(tip) and "href" not in str(tip):
            text = re.findall(r'<p>(.*?)</p>', str(tip))
            if text:
                main_content += '\t' + text[0] + '\n'
    return main_content
def download_image(image_url, title):
    img = requests.get(image_url, headers=headers).content
    # News titles can contain characters that are illegal in file names.
    filename = re.sub(r'[\\/:*?"<>|]', '_', title)
    with open(filename + '.png', 'wb') as f:
        f.write(img)
def parse(list_url):
    sel = spider(list_url)
    all_article = sel.xpath('//*[@class="list-content"]')
    for article in all_article:
        title = article.xpath('div[1]/span/a/text()')[0]
        # re.search finds "科比" (Kobe) anywhere in the title;
        # re.match would only catch titles that start with it.
        if re.search('科比', title):
            detail_url = article.xpath('div[1]/span/a/@href')[0]
            content = parse_detail(detail_url)
            csv_writer([title, content])
            image_url = article.xpath('div[2]/div/a/img/@src')[0]
            download_image(image_url, title)
com_url = 'https://voice.hupu.com/nba/tag/846-'
all_url = [com_url + str(x) + '.html' for x in range(1, 43)]
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.70 Safari/537.36'}
for url in all_url:
    parse(url)
```
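The script fires 40-plus list-page requests plus one request per article and per image, with no timeout and no pause, so a single hung connection stalls the whole run and Hupu may throttle the crawl. Below is a minimal hardening sketch; the `fetch` helper and the retry/delay values are my own additions, not part of the original script:

```python
import time

import requests

def fetch(url, headers, retries=3, timeout=10):
    # Retry a few times with a short pause so one flaky
    # request doesn't kill the whole crawl.
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=timeout)
        except requests.RequestException as e:
            print('request failed ({}/{}): {}'.format(attempt + 1, retries, e))
            time.sleep(2)
    return None
```

Swapping `fetch` in for the bare `requests.get` calls, plus a `time.sleep(1)` between list pages, makes the run noticeably more reliable and more polite to the site.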
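One more practical note: kobe.csv is written as plain UTF-8, which Excel on Windows tends to misread for Chinese text. If you want to open the file in Excel, writing a BOM fixes it; a sketch of the one-line variant of `csv_writer`'s `open` call, with everything else unchanged (the sample row below is just for illustration):

```python
import csv

# 'utf-8-sig' prepends a BOM so Excel detects UTF-8
# and shows the Chinese titles correctly.
with open('kobe.csv', 'a', encoding='utf-8-sig', newline='') as csvfile:
    csv.writer(csvfile).writerow(['示例标题', '示例内容'])  # sample row
```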