For a work task I needed to scrape all of the A-level scenic spots in Hainan. Having scraped a few sites with BeautifulSoup before, I chose it again this time. The result is somewhat unsatisfying, since the number of scenic spots scraped does not match the actual count, but I am posting the code anyway.
Python is well established for web crawling and page parsing: there are many dedicated crawler frameworks, and content parsing is supported by several libraries as well, each with its own pros and cons; I will not go into them here.
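As a minimal illustration of how BeautifulSoup parsing works (the HTML snippet below is a made-up example, not markup from the target site):

from bs4 import BeautifulSoup

html = '<div><a class="f14b" href="/Landscape/demo.html">Demo Spot</a></div>'
soup = BeautifulSoup(html, 'lxml')      #the 'lxml' parser must be installed
link = soup.find('a', class_='f14b')    #locate the element by tag and CSS class
print(link.get_text(), link['href'])    #-> Demo Spot /Landscape/demo.html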
A crawler has three main difficulties:
1. Finding a site that actually matches your target data.
2. Finding the entry URLs and generalizing the URL pattern (see getsceneryurl in the code below).
3. Parsing the page content, which requires understanding the page's HTML structure.
Of course, the greater difficulty usually lies in login verification and anti-crawling measures; this crawler involves neither, so I will not go into them here.
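That said, even a crawler that faces no active countermeasures should stay polite to the server. A minimal sketch of one way to do this, with a hypothetical politefetch helper; the User-Agent string and the one-second pause are arbitrary placeholder values:

import time
import urllib.request

def politefetch(url):
    #send a browser-like User-Agent (arbitrary example value)
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read()
    time.sleep(1)  #pause so consecutive requests do not hammer the server
    return html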
Code example
#"""
Created on 2019-02-26
@author: Administrator
"""
import urllib.request
import csv
import time
from bs4 import BeautifulSoup
#Build the list of scenery-list page URLs
def getsceneryurl():
    #http://www.bytravel.cn/Scene/1a.html
    #http://www.bytravel.cn/Scene/1a5.html
    #http://www.bytravel.cn/Scene/2a.html
    #http://www.bytravel.cn/Scene/2a107.html
    #http://www.bytravel.cn/Scene/3a.html
    #http://www.bytravel.cn/Scene/3a268.html
    #http://www.bytravel.cn/Scene/4a.html
    #http://www.bytravel.cn/Scene/4a289.html
    #http://www.bytravel.cn/Scene/5a.html
    #http://www.bytravel.cn/Scene/5a33.html
    #mapping derived from the URL rules above: star level -> last page number
    levelpages = {'1a': 5, '2a': 107, '3a': 268, '4a': 289, '5a': 33}
    sceneryurllist = []
    for level, lastpage in levelpages.items():
        for i in range(lastpage + 1):
            if i == 0:
                #the first page of each level carries no page number
                sceneryurllist.append("http://www.bytravel.cn/Scene/" + level + '.html')
            else:
                sceneryurllist.append("http://www.bytravel.cn/Scene/" + level + str(i) + '.html')
    return sceneryurllist
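#For reference, the URLs generated for level '1a' are:
#http://www.bytravel.cn/Scene/1a.html (first page, no number), then
#http://www.bytravel.cn/Scene/1a1.html through http://www.bytravel.cn/Scene/1a5.html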
#Parse the scenic spots on every page in the URL list
def getscenerydetaillist(urllist):
    scenerydetailinfolist = []
    i = 0
    #fetch each URL and extract its scenic spots
    for cururl in urllist:
        #to test with the 10 spots of a single URL:
        #if cururl != 'http://www.bytravel.cn/Scene/1a.html': continue
        #if the crawler gets throttled, adjust the interval between requests
        #time.sleep(1)
        htmlscenerylist = urllib.request.urlopen(cururl).read()
        xmlscenerylist = BeautifulSoup(htmlscenerylist, 'lxml')
        #each spot sits in a div with this characteristic inline style
        for curscenery in xmlscenerylist.find_all(attrs={'style': r'margin:0 3px 0 3px;padding:2px 0 2px 0'}):
            #outline of the HTML for one spot, as a reference for the parsing below:
            # curscenery =
            # <div id="tctitletop10">3、
            #   <a class="f14b" href="/Landscape/34/huafoshan.html" target="_blank">化佛山</a>
            #   <span style="padding:0 0 0 5px">
            #     <font class="f14" color="red">A</font>
            #   </span>
            #   [
            #   <a href="/view/index133.html" target="_blank">云南省</a>
            #   <a href="/view/index507.html" target="_blank">楚雄州</a>
            #   <a href="/view/index1956.html" target="_blank">牟定县</a>
            #   ]
            # </div>
            i += 1
            scenery = {}
            scenery['No'] = i
            scenery['name'] = curscenery.find_all(class_="f14b")[0].get_text()
            scenery['star'] = curscenery.find_all(class_="f14")[0].get_text()
            scenery['areaname'] = []
            #parse the location links; the number of levels (province/city/county) varies
            for area in curscenery.select('a[href^="/view/index"]'):
                scenery['areaname'].append(area.get_text())
            scenery['content'] = curscenery.find_all(attrs={'style': r'margin:0px 10px 0px 10px;'})[0].get_text().strip().replace(' ', '')
            scenerydetailinfolist.append(scenery)
    return scenerydetailinfolist
def savetocsv(scenerydetailinfolist):
    #structure of the dicts in the list:
    #{'No': 921, 'name': '人民红园', 'star': 'AA', 'areaname': ['甘肃省', '临夏州', '临夏市'],'content': '人民红园位于。整个…[详细]'}
    #{'No': 922, 'name': '平阳寺', 'star': 'AA', 'areaname': ['浙江省', '绍兴市', '柯桥区'],'content': '位于浙江省绍兴县…[详细]'}
    with open("jingqu.csv", "w", newline="", encoding='utf-8') as datacsv:
        #use '^' as the field delimiter, since the content text itself contains commas
        csvwriter = csv.writer(datacsv, delimiter='^', dialect="excel")
        for scenery in scenerydetailinfolist:
            sceneryno = scenery['No']
            sceneryname = scenery['name']
            scenerystar = scenery['star']
            #the area levels (province/city/county) are joined with commas
            sceneryareaname = ','.join(scenery['areaname'])
            scenerycontent = scenery['content']
            csvwriter.writerow([sceneryno, sceneryname, scenerystar, sceneryareaname, scenerycontent])
if __name__ == '__main__':
    sceneryurllist = getsceneryurl()
    scenerydetailinfolist = getscenerydetaillist(sceneryurllist)
    savetocsv(scenerydetailinfolist)
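Since the number of spots scraped does not match the actual count, a quick first check is to read jingqu.csv back in and tally rows per star level. A minimal verification sketch, reusing the '^' delimiter and the column order written by savetocsv:

import csv
from collections import Counter

with open("jingqu.csv", encoding='utf-8', newline="") as datacsv:
    rows = list(csv.reader(datacsv, delimiter='^'))
print(len(rows), 'spots in total')
#column 3 holds the star level ('A' to 'AAAAA'), so count rows per level
print(Counter(row[2] for row in rows))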