当Google创始人用Python写下他们第一个简陋的爬虫,运行在同样简陋的服务器上的时候,
很少有人能够想象,在接下来的数十年间,他们是怎样地颠覆了互联网乃至于人类的世界。
有网络的地方就有爬虫,爬虫英文名称spider。它是用来抓取网站数据的程序。比如: 我们通过一段程序,定期去抓取类似百度糯米、大众点评上的数据,将这些信息存储到数据库里,然后加上展示页面,一个团购导航站就问世了。毫无疑问,爬虫是很多网站的初期数据来源。
一、第一个爬虫功能的实现
——查看博文目录第一篇文章的URL
首先需要引入urllib模块,使用find函数查找url,经过字符处理就得到了需要的URL。
#!/usr/bin/env python
# Python 2 script: print the URL of the first post listed on the blog's
# article-list page.
import urllib

# Fetch the raw HTML of the article-list page.
con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()

# Locate the first '<a title=' anchor, the 'href=' that follows it, and the
# '.html' that follows the href.
title = con.find(r'<a title=')
href = con.find(r'href=', title)
html = con.find(r'.html', href)

if title != -1 and href != -1 and html != -1:
    # +6 skips 'href=' plus one more character (presumably the opening
    # quote); +5 keeps the '.html' suffix inside the slice.
    url = con[href + 6:html + 5]
    print(url)
else:
    # str.find returns -1 when nothing matches; slicing with those offsets
    # would print garbage, so report the miss instead.
    print('no article link found')
二、查看博文目录第一页所有文章的URL
A:
#!/usr/bin/env python
# Python 2 script: collect and print the URL of every post listed on the
# first article-list page (at most 40).
import urllib

url = [''] * 40   # collected links, filled by index
i = 0             # number of links collected so far

con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()

# Offsets of the first '<a title=' anchor, its 'href=' and the following '.html'.
title = con.find(r'<a title=')
href = con.find(r'href=', title)
html = con.find(r'.html', href)

# str.find returns -1 once there is no further match, which ends the loop.
while title != -1 and href != -1 and html != -1 and i < 40:
    # +6 skips 'href=' plus one more character (presumably the opening
    # quote); +5 keeps the '.html' suffix inside the slice.
    url[i] = con[href + 6:html + 5]
    print(url[i])
    # Continue searching after the link that was just extracted.
    title = con.find(r'<a title=', html)
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    i = i + 1
或者B:
#!/usr/bin/env python
# Python 2 script: print the URL of every post listed on the first
# article-list page without storing them (at most 50).
import urllib

i = 0  # number of links printed so far

con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_1.html').read()

# Offsets of the first '<a title=' anchor, its 'href=' and the following '.html'.
title = con.find(r'<a title=')
href = con.find(r'href=', title)
html = con.find(r'.html', href)

# str.find returns -1 once there is no further match, which ends the loop.
while title != -1 and href != -1 and html != -1 and i < 50:
    # Extract and print the current link BEFORE advancing, so the first link
    # is not skipped and no slice is taken from failed (-1) offsets.
    url = con[href + 6:html + 5]
    print(url)
    # Continue searching after the link that was just printed.
    title = con.find(r'<a title=', html)
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    i = i + 1
三、下载博文目录第一页所有的文章
A:
#!/usr/bin/env python
# Python 2 script: collect the links on a zhihu collection page (at most 20),
# then download each one into the local 'zhihu/' directory.
# The 'zhihu/' directory must already exist.
import time
import urllib

url = [''] * 40   # collected links, filled by index
i = 0             # number of links collected so far

con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()

# Offsets of the first '<a target="_blank' anchor, its 'href=' and the '>'
# that follows the href.
target = con.find(r'<a target="_blank')
base = con.find(r'href=', target)
end = con.find('>', base)

# Pass 1: collect the links. str.find returns -1 when there is no further
# match, which ends the loop early.
while target != -1 and base != -1 and end != -1 and i < 20:
    # target + 25 presumably skips '<a target="_blank" href="' and end - 1
    # drops the closing quote, leaving the site-relative path.
    url[i] = 'http://www.zhihu.com' + con[target + 25:end - 1]
    print(url[i])
    target = con.find(r'<a target="_blank', end)
    base = con.find(r'href=', target)
    end = con.find('>', base)
    i = i + 1

# Pass 2: download exactly the links that were collected.
j = 0
while j < i:
    content = urllib.urlopen(url[j]).read()
    # A full URL contains '/', so only its last path segment is used as the
    # file name.
    name = url[j].rstrip('/').split('/')[-1]
    with open(r'zhihu/' + name + '.html', 'w+') as page_file:
        page_file.write(content)
    print('downloading %s' % url[j])
    j = j + 1
    # Pause between requests.
    time.sleep(15)
或者B:
#!/usr/bin/env python
# Python 2 script: walk the links on a zhihu collection page (at most 30) and
# download each one as soon as it is found.
# The 'zhihu/' directory must already exist.
import time
import urllib

i = 0  # number of pages downloaded so far

con = urllib.urlopen('http://www.zhihu.com/collection/19668036').read()

# Offsets of the first '<a target="_blank' anchor, its 'href=' and the '>'
# that follows the href.
target = con.find(r'<a target="_blank')
base = con.find(r'href=', target)
end = con.find('>', base)

# str.find returns -1 once there is no further match, which ends the loop.
while target != -1 and base != -1 and end != -1 and i < 30:
    # target + 25 presumably skips '<a target="_blank" href="' and end - 1
    # drops the closing quote, leaving the site-relative path.
    url = 'http://www.zhihu.com' + con[target + 25:end - 1]
    # base + 16 skips 16 characters from 'href=' onward to form the file
    # name. NOTE(review): if the remaining text contains '/', the open()
    # below needs matching sub-directories - confirm against the page markup.
    name = con[base + 16:end - 1]

    # Advance to the next anchor before the (slow) download.
    target = con.find(r'<a target="_blank', end)
    base = con.find(r'href=', target)
    end = con.find('>', base)

    content = urllib.urlopen(url).read()
    # 'with' closes the file even if the write fails.
    with open(r'zhihu/' + name + '.html', 'w+') as page_file:
        page_file.write(content)
    print('downloading %s' % name)
    # Pause between requests.
    time.sleep(5)
    i = i + 1
四、下载所有文章
A:
# Python 2 script: collect the post URLs from article-list pages 1..7
# (at most 350 links), then download every collected post into 'tmp/'.
# The 'tmp/' directory must already exist.
import time
import urllib

page = 1          # article-list page currently being scanned
url = [''] * 350  # collected links, filled by index
i = 0             # number of links collected so far
link = 1          # 1-based counter used only for the progress output

# Pass 1: scan every list page and collect the links.
while page <= 7:
    con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
    title = con.find(r'<a title=')
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    # str.find returns -1 once there is no further match on this page.
    while title != -1 and href != -1 and html != -1 and i < 350:
        # +6 skips 'href=' plus one more character (presumably the opening
        # quote); +5 keeps the '.html' suffix inside the slice.
        url[i] = con[href + 6:html + 5]
        print('%d %s' % (link, url[i]))
        title = con.find(r'<a title=', html)
        href = con.find(r'href=', title)
        html = con.find(r'.html', href)
        link = link + 1
        i = i + 1
    else:
        # while/else: runs when the loop condition becomes false.
        print('find end!')
    page = page + 1
else:
    print('all find end')

# Pass 2: download exactly the links that were collected (not a fixed 50),
# so nothing is skipped and no empty slot is ever requested.
j = 0
while j < i:
    content = urllib.urlopen(url[j]).read()
    # The last 26 characters of the URL serve as the local file name.
    with open(r'tmp/' + url[j][-26:], 'w+') as page_file:
        page_file.write(content)
    j = j + 1
    # Pause between requests.
    time.sleep(5)
else:
    print('Download over!')
或者B:
#!/usr/bin/env python
# Python 2 script: scan article-list pages 1..7 and download each post as soon
# as its link is found (at most 350 posts) into '/tmp/sina/'.
# The '/tmp/sina/' directory must already exist.
import time
import urllib

i = 0             # number of links handled so far
link = 1          # 1-based counter used only for the progress output
page = 1          # article-list page currently being scanned
url = [''] * 350  # collected links, filled by index

while page <= 7:
    con = urllib.urlopen('http://blog.sina.com.cn/s/articlelist_1191258123_0_' + str(page) + '.html').read()
    title = con.find(r'<a title=')
    href = con.find(r'href=', title)
    html = con.find(r'.html', href)
    # str.find returns -1 once there is no further match on this page.
    while title != -1 and href != -1 and html != -1 and i < 350:
        # +6 skips 'href=' plus one more character (presumably the opening
        # quote); +5 keeps the '.html' suffix inside the slice.
        url[i] = con[href + 6:html + 5]
        print('%d %s' % (link, url[i]))

        # Advance to the next anchor before the (slow) download.
        title = con.find(r'<a title=', html)
        href = con.find(r'href=', title)
        html = con.find(r'.html', href)

        content = urllib.urlopen(url[i]).read()
        # The last 26 characters of the URL serve as the local file name;
        # 'with' closes the file even if the write fails.
        with open(r'/tmp/sina/' + url[i][-26:], 'w+') as page_file:
            page_file.write(content)
        # Pause between requests.
        time.sleep(5)
        link = link + 1
        i = i + 1
    page = page + 1
else:
    # while/else: runs once all seven pages have been processed.
    print('Download Over!')
运行结果: