博客园 (cnblogs) has its own lucene.net search engine, plus a Google site search. But even Google does not fully index my content; it is picky about what it keeps, so I have long wanted to build a full-text index of my own blog.
Originally I wanted to build a scalable setup based on rake + hbase + whenever + massive_record. Halfway through I felt the whole project would take too long, so I set it aside, picked up my old code instead, and patched it up just enough to get something usable first.
What I am using are some scripts from my earlier 15-1688 small-wholesale search engine. Back then the crawl script templates were customized through a web UI; here I simply reuse them as they are.
The whole crawl is a 4-step process, one script per step (the steps chain together as sketched just below):
A: generate the list-page links
B: fetch the list pages
C: extract the detail-page links
D: fetch the detail pages
I will just paste the code.
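Roughly how the four scripts chain together, as a minimal sketch. The file names a_gen_list_links.py through d_fetch_details.py are hypothetical; the post only labels the scripts A through D.

#!/usr/bin/env python
#encoding=utf-8
# Hypothetical driver: runs the four steps in order and stops on the first failure.
import os

steps = [
    "python a_gen_list_links.py 154",     # A: generate list-page links (pages 1..154)
    "python b_fetch_lists.py",            # B: fetch the list pages
    "python c_extract_detail_links.py",   # C: extract the detail-page links
    "python d_fetch_details.py loop",     # D: fetch detail pages, looping until all succeed
]
for cmd in steps:
    print cmd
    if os.system(cmd) != 0:
        break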
A
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.15,v0.2
2010.10.07,v0.1
Generate the list-page links in batch
"""
import sys,os,time
list_url_template = ""
list_url_start = 1
list_url_end = 154
list_links_file = os.path.join("./","list_links.txt")
g_step=1
def cost_time(func):
def newFunc(*args, **args2):
t0 = time.time()
print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
back = func(*args, **args2)
print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
return back
return newFunc
@cost_time
def show(list_url_start=list_url_start,\
list_url_end=list_url_end,\
list_url_template=list_url_template):
lines=[]
for i in xrange(list_url_start,list_url_end+1):
line="\n"%(i*g_step)
print line.rstrip()
lines.append(line)
open(list_links_file,"w").writelines(lines)
print "total count:%s"%len(lines)
print "done!"
#import os.path
#print os.path.abspath(".")
if __name__=="__main__":
l=len(sys.argv)
if l==1:
show()
elif l==2:
show(list_url_end=int(sys.argv[1]))
elif l==3:
show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]))
elif l==4:
show(list_url_start=int(sys.argv[1]),list_url_end=int(sys.argv[2]),list_url_template=sys.argv[3])
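The post leaves list_url_template empty; judging from how show() formats it with (i*g_step), it is meant to hold a URL pattern with a %d (or %s) placeholder for the page number. A minimal sketch with a made-up template:

#encoding=utf-8
# Hypothetical template; the real one is left blank in the post.
list_url_template = "http://www.example.com/wholesale/list-%d.html"
g_step = 1
for i in xrange(1, 4):
    print list_url_template % (i * g_step)
# prints the list-1.html, list-2.html, list-3.html style URLs, one per line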
B
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.3
2010.10.09,v0.2
2010.10.07,v0.1
Fetch the list pages in batch
"""
from __future__ import with_statement
from __future__ import division
import socket as original_socket
original_socket.setdefaulttimeout(10)
from eventlet.timeout import with_timeout
from eventlet.green import urllib2
import sys
####reload(sys)
####sys.setdefaultencoding('utf-8')
import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
import time
import os
import os.path
import stat
import select
import shutil
import re
import gzip
import StringIO
list_list_folder = os.path.join("./","lists")
list_info_folder = os.path.join("./","lists_infos")
status_file = os.path.join("./","lists_infos/status.txt")
error_file = os.path.join("./","lists_infos/error.txt")
error_file_bak = os.path.join("./","lists_infos/error.txt.bak")
success_file = os.path.join("./","lists_infos/success.txt")
list_links_file = os.path.join("./","list_links.txt")
g_headers={}
g_pool_num = 5
def init():
if not os.path.exists(list_list_folder):
os.mkdir(list_list_folder)
if not os.path.exists(list_info_folder):
os.mkdir(list_info_folder)
print "完成初始化"
def delete(src):
'''delete files and folders'''
permission(src)
if os.path.isfile(src):
try:
os.remove(src)
except:
pass
elif os.path.isdir(src):
for item in os.listdir(src):
itemsrc=os.path.join(src,item)
delete(itemsrc)
try:
os.rmdir(src)
except:
pass
def permission(src):
os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)
def clear():
delete(list_list_folder)
delete(list_info_folder)
print "还原为初始"
def size(src):
"检查文件或文件夹大小"
r = 0L
if os.path.isfile(src):
r=os.path.getsize(src)
else:
for root, dirs, files in os.walk(src):
r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
l=len(str(r))
if l>9:
r=r/1024/1024/1024
r="%.2f GiB"%r
elif l>6:
r=r/1024/1024
r="%.2f MiB"%r
elif l>3:
r=r/1024
r="%.2f KiB"%r
print "%s 大小为:%s"%(src,r)
def status(str):
"running/stop"
f=open(status_file,"w")
f.write(str)
f.close()
def error(url,ex):
f=open(error_file,"a")
f.write("%s\n"%(url,))
f.close()
def success(url):
f=open(success_file,"a")
f.write("%s\n"%url)
f.close()
def url2filename(url):
import base64
return base64.urlsafe_b64encode(url)
def url2filename2(url):
url=url.strip()
idx=url.rfind("/")
r=url[idx+1:]
if idx==-1 or len(r)==0:
# raise ValueError("url2filename function parser error")
print "启用特殊url2filename"
r = re.findall(r"\d+", url)[-1]
return r
def cost_time(func):
def newFunc(*args, **args2):
t0 = time.time()
print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
back = func(*args, **args2)
print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
return back
return newFunc
def statistics(func):
def tongji():
total,successed=0,0
if os.path.exists(list_links_file):
total=len(set(open(list_links_file,"r").readlines()))
print "total lines:%s"%total
if os.path.exists(success_file):
successed=len(set(open(success_file,"r").readlines()))
print "successed lines:%s"%successed
print "left lines:%s"%(total-successed)
def newFunc(*args,**args2):
tongji()
back = func(*args, **args2)
tongji()
return back
return newFunc
def get_html(url):
def do(url):
html=""
try:
req = urllib2.Request(url = url,headers = g_headers)
html = urllib2.urlopen(req).read()
return html
except Exception,e:
print url,"error",e
error(url,e)
return None
rr = with_timeout(10, do, url, timeout_value=None)
return rr
def get_html22(url):
import types
def do(url):
html=""
try:
req = urllib2.Request(url = url,headers = g_headers)
html = urllib2.urlopen(req).read()
t=type(html)
            if t in types.StringTypes: # accepts both str and unicode
return html
else:
print url,"error======"
return ""
except Exception,e1:
            pdata = StringIO.StringIO(html)# the following lines try to gunzip the response in case it came back gzip-compressed
gzipper = gzip.GzipFile(fileobj = pdata)
try:
html = gzipper.read()
return html
except Exception,e2:
print url,e1,e2
error(url,e1)
return ""
rr = with_timeout(10, do, url, timeout_value="")
return rr
def get_html2(url):
"when use gzipped page will get fetch error"
#print url
with httppool.item() as http:
#eventlet.sleep(0)
resp, content = http.request(url)
print content
return content
def save_html2file(filename,html):
f=open(filename,"w")
f.write(html)
f.close()
def save_url2file(url):
#html=""
#try:
# html=get_html(url)
#except Exception,e:
# print url,"fetch error",e
# error(url,e)
# return
html=get_html(url)
if html is not None and html<>"":
filename=os.path.join(list_list_folder,url2filename(url))
save_html2file(filename,html)
if os.path.getsize(filename)<1024*20:
error(url,"size小于%s"%(1024*20))
print url,"error"
return
        success(url)# successes are the baseline; everything else failed or has not run yet
print url,"success"
else:
print url,"error"
error(url,"html为None或为空")
@cost_time
@statistics
def batch_get_html(urls):
print "执行批量下载网页工作"
pool=eventlet.GreenPool(g_pool_num)
for url in urls:
pool.spawn_n(save_url2file,url)
pool.waitall()
print "done!"
def process_continue():
"接着success抓取剩下的部分"
#读取完整的部分和已完成的部分进行取非交集合
done=set(open(success_file,"r").read().split("\n"))
all=set(open(list_links_file,"r").read().split("\n"))
left=all-done
batch_get_html(left)
if __name__=="__main__":
init()
l=len(sys.argv)
if l==1:
content=""
if not select.select([sys.stdin,],[],[],0.0)[0]:
print "load from %s"%list_links_file
content=open(list_links_file,"r").read()
else:
print "load from stdin"
content=sys.stdin.read()
urls=content.strip().split("\n")
#print urls
batch_get_html(urls)
size(list_list_folder)
elif l==2:
argv=sys.argv[1]
if argv=="clear":
clear()
if argv=="continue":
process_continue()
elif l==3:
argv=sys.argv[1]
if argv=="load":
url=sys.argv[2]
print url
save_url2file(url)
print "done!"
C
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.22
2010.10.11,v0.21
2010.10.09,v0.2
2010.10.07,v0.1
Extract the detail-page links and thumbnail links from the list pages
"""
import sys
import re
import os.path
list_list_folder = os.path.join("./","lists")
success_file = os.path.join("./","lists_infos/success.txt")
detail_links_file = os.path.join("./","extract_detail_links.txt")
#g_pattern=r"""
D
#!/usr/bin/env python
#encoding=utf-8
"""
2010.10.16,v0.13
2010.10.15,v0.12
2010.10.13,v0.11
2010.10.07,v0.1
Fetch the detail pages in batch
"""
from __future__ import with_statement
from __future__ import division
import socket as original_socket
original_socket.setdefaulttimeout(10)
from eventlet.timeout import with_timeout
from eventlet.green import urllib2
from urlparse import urljoin
import sys
####reload(sys)
####sys.setdefaultencoding('utf-8')
import eventlet
from eventlet import pools
#httplib2 = eventlet.import_patched('httplib2')
#httppool = pools.Pool(create=lambda: httplib2.Http(timeout=90),max_size=20)
import time
import os
import os.path
import stat
import select
g_host = ""
g_data_folder = os.path.join("./","details")
g_info_folder = os.path.join("./","details_infos")
g_status_file = os.path.join("./","details_infos/status.txt")
g_error_file = os.path.join("./","details_infos/error.txt")
g_success_file = os.path.join("./","details_infos/success.txt")
g_result_links_file = os.path.join("./","extract_detail_links.txt")
g_pool_num = 1
g_headers={}
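# Paste the raw request headers into the triple-quoted string below,
# one "Header-Name: value" per line; they get parsed into g_headers.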
headers = """"""
headers = headers.strip().replace("\r\n","\n")
if headers<>"":
for elem in headers.split("\n"):
if elem.strip()=="":
continue
a,b=elem.split(":",1)
a=a.strip()
b=b.strip()
g_headers[a]=b
def init():
if not os.path.exists(g_data_folder):
os.mkdir(g_data_folder)
if not os.path.exists(g_info_folder):
os.mkdir(g_info_folder)
print "完成初始化"
def delete(src):
'''delete files and folders'''
permission(src)
if os.path.isfile(src):
try:
os.remove(src)
except:
pass
elif os.path.isdir(src):
for item in os.listdir(src):
itemsrc=os.path.join(src,item)
delete(itemsrc)
try:
os.rmdir(src)
except:
pass
def permission(src):
os.chmod(src,stat.S_IRWXU|stat.S_IRWXO|stat.S_IRWXG)
def clear():
delete(g_data_folder)
delete(g_info_folder)
print "还原为初始"
def size(src):
"检查文件或文件夹大小"
r = 0L
if os.path.isfile(src):
r=os.path.getsize(src)
else:
for root, dirs, files in os.walk(src):
r += sum([os.path.getsize(os.path.join(root, name)) for name in files])
l=len(str(r))
if l>9:
r=r/1024/1024/1024
r="%.2f GiB"%r
elif l>6:
r=r/1024/1024
r="%.2f MiB"%r
elif l>3:
r=r/1024
r="%.2f KiB"%r
print "%s 大小为:%s"%(src,r)
def status(str):
"running/stop"
f=open(g_status_file,"w")
f.write(str)
f.close()
def error(url,ex):
f=open(g_error_file,"a")
f.write("%s\n"%(url,))
f.close()
def success(url):
f=open(g_success_file,"a")
f.write("%s\n"%url)
f.close()
def url2filename(url):
import base64
return base64.urlsafe_b64encode(url)
def url2filename2(url):
url=url.strip()
idx=url.rfind("/")
r=url[idx+1:]
if idx==-1 or len(r)==0:
# raise ValueError("url2filename function parser error")
print "启用特殊url2filename"
r = re.findall(r"\d+", url)[-1]
return r
def statistics(func):
def tongji():
total,successed=0,0
if os.path.exists(g_result_links_file):
total=len(set(open(g_result_links_file,"r").readlines()))
print "total lines:%s"%total
if os.path.exists(g_success_file):
successed=len(set(open(g_success_file,"r").readlines()))
print "successed lines:%s"%successed
print "left lines:%s"%(total-successed)
def newFunc(*args,**args2):
tongji()
back = func(*args, **args2)
tongji()
return back
return newFunc
def cost_time(func):
def newFunc(*args, **args2):
t0 = time.time()
print "@%s, {%s} start" % (time.strftime("%X", time.localtime()), func.__name__)
back = func(*args, **args2)
print "@%s, {%s} end" % (time.strftime("%X", time.localtime()), func.__name__)
print "@%.3fs taken for {%s}" % (time.time() - t0, func.__name__)
return back
return newFunc
def get_html(url):
def do(url):
html=""
try:
req = urllib2.Request(url = url,headers = g_headers)
html = urllib2.urlopen(req).read()
return html
except Exception,e:
print url,"error",e
error(url,e)
return None
rr = with_timeout(10, do, url, timeout_value=None)
return rr
def get_html2(url):
#print url
with httppool.item() as http:
#eventlet.sleep(0)
resp, content = http.request(url,'GET',headers=g_headers)
#resp, content = http.request(url)
return content
def save_html2file(filename,html):
f=open(filename,"w")
f.write(html)
f.close()
def save_url2file(url):
a,b=url.strip().split(",")
if not a.startswith("http://"):
a=urljoin(g_host,a)
    #a=a.replace("&amp;","&")
html=get_html(a)
if html is not None and html<>"":
filename=os.path.join(g_data_folder,url2filename(a))
save_html2file(filename,html)
if os.path.getsize(filename)<1024*10:
error(url,"size小于%s"%(1024*10))
print url,"error"
return
        success(url)# successes are the baseline; everything else failed or has not run yet
print url,"success"
else:
print url,"error"
error(url,"html为None或为空")
def save_url2file2(url):
a,b=url.strip().split(",")
if not a.startswith("http://"):
a=urljoin(g_host,a)
html=""
try:
html=get_html(a)
except Exception,e:
print url,e,"fetch error"
error(url,e)
return
if html<>"":
filename=os.path.join(g_data_folder,url2filename(a))
save_html2file(filename,html)
        if os.path.getsize(filename)<1024*10:
            error(url,"size小于%s"%(1024*10))
            print url,"error"
            return
        success(url)# successes are the baseline; everything else failed or has not run yet
        print url,"success"

@cost_time
@statistics
def batch_get_html(urls):
    print "执行批量下载网页工作"
    pool=eventlet.GreenPool(g_pool_num)
    for url in urls:
        pool.spawn_n(save_url2file,url)
    pool.waitall()
    size(g_data_folder)
    print "done!"

def count():
    total,successed=set(),set()
    if os.path.exists(g_success_file):
        successed=set(open(g_success_file,"r").read().strip().split("\n"))
    if os.path.exists(g_result_links_file):
        total=set(open(g_result_links_file,"r").read().strip().split("\n"))
    left=total-successed
    return total,successed,left

def process_continue():
    "Resume: fetch whatever is not yet recorded in the success file"
    # take the set difference between the full link list and the already-finished part
    total,successed,left=count()
    batch_get_html(left)

def process_forever():
    "Loop until every page has been fetched"
    total,successed,left=count()
    print "left"
    while len(left)>0:
        print "由于还有未完成页面,再次循环执行"
        process_continue()
        total,successed,left=count()
if __name__=="__main__":
init()
l=len(sys.argv)
if l==1:
content=""
if not select.select([sys.stdin,],[],[],0.0)[0]:
print "load from %s"%g_result_links_file
content=open(g_result_links_file,"r").read()
else:
print "load from stdin"
content=sys.stdin.read()
urls=content.strip().split("\n")
#print urls
batch_get_html(urls)
elif l==2:
argv=sys.argv[1]
if argv=="clear":
clear()
if argv=="continue":
process_continue()
if argv=="loop":
process_forever()
elif l==3:
if sys.argv[1]=="load":
url=sys.argv[2]
save_url2file(url)
print "done!"