刚学了爬虫基础,拿来练手。
爬取湖区科比相关新闻(蜗壳虽然退役很久了,还是偶尔有点新闻的),下载图片和新闻文本。
XPath、BeautifulSoup、正则表达式都用到了一点,写得比较糙。

# 爬取虎扑湖人专区关于科比的新闻
# 下载内容和图片
import requests
from lxml import etree
import re
import csv
from bs4 import BeautifulSoup

def csv_writer(item):
    """Append one row to ``kobe.csv`` and print a progress line.

    Args:
        item: Sequence of cell values for the row; ``item[0]`` is echoed in
            the progress message.

    An exception raised while writing the row is printed instead of being
    propagated. The progress line is printed whether or not the row was saved.
    """
    with open('kobe.csv', mode='a', newline='', encoding='utf-8') as out_file:
        try:
            csv.writer(out_file).writerow(item)
        except Exception as err:
            print('保存错误:', err)
        # Printed unconditionally, even when the write above failed.
        print('正在爬取:', item[0])


def spider(url_):
    """Fetch a page and return it parsed as an lxml HTML tree.

    Args:
        url_: Address of the page to download.

    Returns:
        The element tree built by ``etree.HTML`` from the response text.
    """
    # Request the address that was passed in. The previous code read the
    # module-level name ``url`` instead, so the argument was silently ignored.
    # The timeout keeps a stalled connection from blocking the crawl forever.
    response = requests.get(url_, headers=headers, timeout=10)
    return etree.HTML(response.text)

def parse_detail(detail_url):
    """Download an article page and return the contents of its paragraphs.

    Args:
        detail_url: Address of the article detail page.

    Returns:
        A string with one line per kept paragraph, each prefixed with a tab
        and terminated by a newline. Paragraphs whose markup contains the
        text ``img`` or ``href`` are left out. An empty string is returned
        when the page has no element with class ``artical-main-content``.
    """
    # The timeout keeps a stalled connection from blocking the crawl forever.
    page = requests.get(detail_url, headers=headers, timeout=10)
    soup = BeautifulSoup(page.text, 'lxml')
    body = soup.find(class_="artical-main-content")
    if body is None:
        # No article container on this page: nothing to extract.
        return ""
    lines = []
    for tip in body.find_all('p'):
        markup = str(tip)
        # Plain substring test on the serialized paragraph, so a paragraph
        # whose visible text contains "img" or "href" is skipped as well.
        if "img" in markup or "href" in markup:
            continue
        # decode_contents() returns the markup between the <p> tags. The
        # former regex r'<p>(.*)?</p>' matched nothing for paragraphs with
        # attributes or embedded newlines, and indexing the empty result
        # raised IndexError.
        lines.append('\t' + tip.decode_contents() + '\n')
    return ''.join(lines)


def download_image(image_url, title):
    """Download an image and save it as ``<title>.png`` in the working directory.

    Args:
        image_url: Address of the image to fetch.
        title: Text used as the file name. Path separators, control
            whitespace and characters commonly rejected in file names are
            replaced with ``_``.

    The ``.png`` extension is applied regardless of the format of the
    downloaded bytes.
    """
    # The timeout keeps a stalled connection from blocking the crawl forever.
    img = requests.get(image_url, headers=headers, timeout=10).content
    # The title is scraped text: neutralise separators and reserved characters
    # so open() neither fails nor writes outside the current directory.
    safe_title = re.sub(r'[\\/:*?"<>|\r\n\t]', '_', title)
    with open(safe_title + '.png', 'wb') as f:
        f.write(img)


def parse(list_url):
    """Process one listing page: save matching articles and their thumbnails.

    For every ``list-content`` entry whose title contains "科比", the article
    text is fetched and appended to the CSV together with the title, and the
    entry's thumbnail (when it has one) is downloaded under the same title.

    Args:
        list_url: Address of the listing page to process.
    """
    sel = spider(list_url)
    for article in sel.xpath('//*[@class="list-content"]'):
        titles = article.xpath('div[1]/span/a/text()')
        links = article.xpath('div[1]/span/a/@href')
        if not titles or not links:
            # Entry without a title link: skip it instead of raising
            # IndexError and aborting the rest of the page.
            continue
        title = titles[0]
        # Substring test: re.match() only matched titles that START with
        # the keyword, dropping articles that mention it later in the title.
        if '科比' not in title:
            continue
        csv_writer([title, parse_detail(links[0])])
        image_urls = article.xpath('div[2]/div/a/img/@src')
        if image_urls:
            # Not every entry has a thumbnail; only download when present.
            download_image(image_urls[0], title)


# Common prefix of the listing pages; pages 1 through 42 are crawled.
com_url = 'https://voice.hupu.com/nba/tag/846-'
all_url = [''.join((com_url, str(page_no), '.html')) for page_no in range(1, 43)]
# Browser-like User-Agent sent with every request.
headers = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/78.0.3904.70 Safari/537.36'
    ),
}
# Keep the loop variable named ``url``: functions in this file may read it
# as a module-level global.
for url in all_url:
    parse(url)