First scrape the latest news list, then loop over each item's URL to scrape the detailed data for that article.
# -*- coding: utf-8 -*-
"""
news.py: scrape the Sina world-news list, then each article's details, into an .xls file.
"""
import requests
from bs4 import BeautifulSoup
from datetime import datetime
import xlwt
def get_list(url):
    # Fetch the news list page
    res = requests.get(url)
    res.encoding = 'utf-8'
    # Parse the full HTML
    html = BeautifulSoup(res.text, 'html.parser')
    # Collect title and link for every news item
    newList = []
    for item in html.select('.news-item'):
        try:
            newObj = {}
            newObj['title'] = item.select('h2 a')[0].text
            newObj['url'] = item.select('h2 a')[0].get('href')
            newList.append(newObj)
        except IndexError:
            print('item without a title/link, skipped')
    return newList
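# Shape of the list get_list returns (values illustrative, not real data;
# the live page determines the actual titles and URLs):
#   [{'title': '...', 'url': 'https://news.sina.com.cn/w/....shtml'}, ...]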
def get_detail(url):
    # Fetch the article page
    res = requests.get(url)
    res.encoding = 'utf-8'
    # Parse the full HTML
    html = BeautifulSoup(res.text, 'html.parser')
    # Article record
    result = {}
    # Title
    result['title'] = html.select('.main-title')[0].text
    # Publication time: parse the Chinese date string, then store it reformatted
    # (the original discarded the strftime result and stored the raw datetime)
    timesource = html.select('.date-source span')[0].text.strip()
    createtime = datetime.strptime(timesource, '%Y年%m月%d日 %H:%M')
    result['createtime'] = createtime.strftime('%Y-%m-%d %H:%M')
    # Source (publisher)
    result['place'] = html.select('.date-source a')[0].text
    # Body text: every paragraph except the trailing editor line
    article = []
    for p in html.select('#article p')[:-1]:
        article.append(p.text.strip())
    result['article'] = ' '.join(article)
    # Author: remove the '责任编辑:' (editor-in-charge) prefix; str.strip()
    # with that argument would strip characters, not the prefix, so use replace
    result['author'] = html.select('.show_author')[0].text.replace('责任编辑:', '').strip()
    # Article URL
    result['url'] = url
    return result
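# Shape of the dict get_detail returns (keys are fixed, values illustrative):
#   {'title': '...', 'createtime': '2020-10-19 09:30', 'place': '...',
#    'article': '...', 'author': '...', 'url': 'http://news.sina.com.cn/...'}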
if __name__ == '__main__':
    newList = get_list('https://news.sina.com.cn/world/')
    # print(newList)
    # newObj = get_detail('http://news.sina.com.cn/c/2020-10-19/doc-iiznctkc6335371.shtml')
    # print(newObj)
    book = xlwt.Workbook(encoding='utf-8')
    sheet = book.add_sheet('ke_qq')
    head = ['标题', '时间', '作者', '链接', '来源', '内容']  # header row: title, time, author, URL, source, content
    for h in range(len(head)):
        sheet.write(0, h, head[h])  # write the header row
    for i, item in enumerate(newList):
        try:
            newObj = get_detail(item['url'])
            sheet.write(i + 1, 0, newObj['title'])
            sheet.write(i + 1, 1, newObj['createtime'])
            sheet.write(i + 1, 2, newObj['author'])
            sheet.write(i + 1, 3, newObj['url'])
            sheet.write(i + 1, 4, newObj['place'])
            sheet.write(i + 1, 5, newObj['article'])
            print(str(i), 'row written')
        except Exception as e:
            print(str(i), 'failed:', e)
    book.save(r'F:\ke.xls')  # raw string so '\k' is not treated as an escape
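Two practical notes. Sites sometimes block the default requests User-Agent, and xlwt can only produce the legacy .xls format (for .xlsx you would need a library such as openpyxl). For the first point, a minimal sketch of a fetch helper with browser-like headers and a timeout follows; fetch_html is a hypothetical helper, and the User-Agent value is an assumption, so substitute any current browser string:

import requests

# Sketch: fetch a page with a browser-like User-Agent and a timeout.
# The HEADERS value is an assumption; any current browser UA string works.
HEADERS = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

def fetch_html(url):
    res = requests.get(url, headers=HEADERS, timeout=10)
    res.raise_for_status()  # fail loudly on HTTP errors instead of parsing an error page
    res.encoding = 'utf-8'
    return res.text

If the scraper starts returning empty lists, swapping the bare requests.get calls in get_list and get_detail for a helper like this is a reasonable first fix.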