This came out of a work task: I tried to crawl all of Hainan's A-level scenic spots. Having scraped a few sites with BeautifulSoup before, I picked that library again. The result was a bit disappointing (the number of scenic spots scraped does not match the real count), but here is the code anyway.

Python is well suited to web crawling and page parsing: there are plenty of dedicated crawler frameworks and components, and several libraries for parsing page content, each with its own trade-offs; I won't go over them here.

A crawler has three main difficulties:

1. Choosing a site that is suitable to crawl for your goal

2. Finding the site's entry point, i.e. identifying and enumerating the URLs to fetch

3. Parsing the page content, which requires understanding the page's HTML structure

Of course, the harder parts are usually login/verification and anti-crawling measures. This crawler deals with neither, so I won't go into them here.
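That said, the script below keeps a commented-out time.sleep(1) as a crude brake. As a rough sketch only (politefetch, the User-Agent string, and the 1-second delay are illustrative assumptions, not part of the original script), the same idea can be paired with a browser-like User-Agent header when a site rejects the default urllib client:

import time
import urllib.request

def politefetch(url):
    # urllib sends "Python-urllib/x.y" by default; a Request object lets us
    # supply our own headers so the site sees a browser-like client.
    req = urllib.request.Request(url, headers={'User-Agent': 'Mozilla/5.0'})
    html = urllib.request.urlopen(req).read()
    time.sleep(1)  # pause between requests so the crawl stays polite
    return html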

Code example

"""
Created on 2019-02-26
@author: Administrator
"""
import urllib.request
import csv
import time
from bs4 import BeautifulSoup


# Build the list of scenic-spot list-page URLs
def getsceneryurl():
    # http://www.bytravel.cn/Scene/1a.html
    # http://www.bytravel.cn/Scene/1a5.html
    # http://www.bytravel.cn/Scene/2a.html
    # http://www.bytravel.cn/Scene/2a107.html
    # http://www.bytravel.cn/Scene/3a.html
    # http://www.bytravel.cn/Scene/3a268.html
    # http://www.bytravel.cn/Scene/4a.html
    # http://www.bytravel.cn/Scene/4a289.html
    # http://www.bytravel.cn/Scene/5a.html
    # http://www.bytravel.cn/Scene/5a33.html
    # Highest page number for each rating level, following the URL pattern above
    levelpages = {'1a': 5, '2a': 107, '3a': 268, '4a': 289, '5a': 33}
    sceneryurllist = []
    for level, maxpage in levelpages.items():
        for i in range(maxpage + 1):
            if i == 0:
                # The first page of each level carries no page number
                sceneryurllist.append("http://www.bytravel.cn/Scene/" + level + '.html')
            else:
                sceneryurllist.append("http://www.bytravel.cn/Scene/" + level + str(i) + '.html')
    return sceneryurllist


# Parse the scenic spots on every list page
def getscenerydetaillist(urllist):
    scenerydetailinfolist = []
    i = 0
    # Fetch each list page and extract the scenic spots it contains
    for cururl in urllist:
        # For testing, limit the crawl to the 10 spots on a single page
        # if cururl != 'http://www.bytravel.cn/Scene/1a.html': continue
        # If the crawler gets throttled, add a delay between requests
        # time.sleep(1)
        htmlscenerylist = urllib.request.urlopen(cururl).read()
        xmlscenerylist = BeautifulSoup(htmlscenerylist, 'lxml')
        # Each scenic spot sits in a block with this characteristic style attribute
        for curscenry in xmlscenerylist.find_all(attrs={'style': r'margin:0 3px 0 3px;padding:2px 0 2px 0'}):
            # Typical HTML of one entry (curscenry), to show what is being parsed:
            # <div id="tctitletop10">3
            #     <a class="f14b" href="/Landscape/34/huafoshan.html" target="_blank">化佛山</a>
            #     <span style="padding:0 0 0 5px">
            #         <font class="f14" color="red">A</font>
            #     </span>
            #     [
            #     <a href="/view/index133.html" target="_blank">云南省</a>
            #     <a href="/view/index507.html" target="_blank">楚雄州</a>
            #     <a href="/view/index1956.html" target="_blank">牟定县</a>
            #     ]
            # </div>
            i += 1
            scenery = {}
            scenery['No'] = i
            scenery['name'] = curscenry.find_all(class_="f14b")[0].get_text()
            scenery['star'] = curscenry.find_all(class_="f14")[0].get_text()
            scenery['areaname'] = []
            # Parse the location links; the number of administrative levels varies
            for area in curscenry.select('a[href^="/view/index"]'):
                scenery['areaname'].append(area.get_text())
            scenery['content'] = curscenry.find_all(attrs={'style': r'margin:0px 10px 0px 10px;'})[0].get_text().strip().replace(' ', '')
            scenerydetailinfolist.append(scenery)
    return scenerydetailinfolist


def savetocsv(scenerydetailinfolist):
    # Structure of the dicts in the list:
    # {'No': 921, 'name': '人民红园', 'star': 'AA', 'areaname': ['甘肃省', '临夏州', '临夏市'], 'content': '人民红园位于。整个…[详细]'}
    # {'No': 922, 'name': '平阳寺', 'star': 'AA', 'areaname': ['浙江省', '绍兴市', '柯桥区'], 'content': '位于浙江省绍兴县…[详细]'}
    with open("jingqu.csv", "w", newline="", encoding='utf-8') as datacsv:
        csvwriter = csv.writer(datacsv, delimiter='^', dialect="excel")
        for scenery in scenerydetailinfolist:
            sceneryno = scenery['No']
            sceneryname = scenery['name'].encode('utf-8').decode('utf-8', 'ignore')
            scenerystar = scenery['star']
            sceneryareaname = ','.join(scenery['areaname']).encode('utf-8').decode('utf-8', 'ignore')
            scenerycontent = scenery['content'].encode('utf-8').decode('utf-8', 'ignore')
            csvwriter.writerow([sceneryno, sceneryname, scenerystar, sceneryareaname, scenerycontent])


sceneryurllist = getsceneryurl()
scenerydetailinfolist = getscenerydetaillist(sceneryurllist)
savetocsv(scenerydetailinfolist)
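The CSV is written with '^' as the field delimiter, so it has to be read back with the same setting. A minimal sketch, assuming jingqu.csv was produced by the script above (the print loop is only an illustration):

import csv

with open("jingqu.csv", newline="", encoding='utf-8') as datacsv:
    for no, name, star, areaname, content in csv.reader(datacsv, delimiter='^'):
        # areaname was joined with ',' when saving, so split it back into a list
        print(no, name, star, areaname.split(','))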