Hello everyone, I'm 咿哑呀. Today I'll walk you through building a web crawler that scrapes the text content of web pages, so you can get through pages faster. The full code is posted below; study it carefully:
from urllib import request, parse
from urllib.parse import quote
import string
import chardet
from bs4 import BeautifulSoup
import re
import time
class spider:
    def __init__(self, my_root_url, title_tag, con_tag, OutputFile):
        self.new_urls = set()   # URLs still to be crawled
        self.old_urls = set()   # URLs already crawled
        self.datas = []         # collected page data
        self.add_new_url(my_root_url)
        count = 1
        while self.has_new_url():
            try:
                new_url = self.get_new_url()
                print("%d. crawling %s" % (count, new_url))
                html_context = self.download(new_url)
                new_urls, new_data = self.get_new_urls_data(new_url, html_context, title_tag, con_tag)
                self.add_new_urls(new_urls)
                self.output_html(new_data, OutputFile)
                count += 1
                time.sleep(1)
            except Exception:
                print("failed to crawl this page")

    def download(self, new_url):
        if new_url is None:
            return None
        headers = ("User-Agent",
                   "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) "
                   "Chrome/52.0.2743.116 Safari/537.36 Edge/15.15063")
        opener = request.build_opener()
        opener.addheaders = [headers]
        request.install_opener(opener)
        url_ = quote(new_url, safe=string.printable)
        response = request.urlopen(url_)
        if response.getcode() != 200:
            return None
        html = response.read()
        charset = chardet.detect(html)["encoding"]  # guess the page encoding before decoding
        return html.decode(charset, "ignore")
    def output_html(self, new_data, OutputFile):
        if new_data is None:
            return
        self.datas.append(new_data)
        for data in self.datas:
            if data["title"] and data["con"] and data["url"]:
                # one .txt file per page, named after the title (spaces stripped)
                fout = open(OutputFile + "\\" + data["title"].replace(" ", "") + ".txt",
                            "w", encoding="utf8")
                fout.write("%s" % data["title"].replace(" ", ""))
                fout.write("%s" % data["con"].replace(" ", ""))
                fout.write("(source: %s)" % data["url"])
                fout.close()

    def get_new_urls_data(self, page_url, html_context, title_tag, con_tag):
        if page_url is None or html_context is None:
            return
        new_urls = set()
        red_data = {}
        soup = BeautifulSoup(html_context, "html.parser")
        pat = re.compile(r"\.htm|\.asp")
        links = soup.find_all(href=pat)
        for link in links:
            if page_url not in link["href"]:
                new_url = link["href"]
                new_full_url = parse.urljoin(page_url, new_url)
                new_urls.add(new_full_url)
        red_data["url"] = page_url
        if soup.find(class_=title_tag[0]):
            title_node = soup.find(class_=title_tag[0])
            if title_node.get_text():
                red_data["title"] = title_node.get_text()
            else:
                red_data["title"] = ""
        else:
            red_data["title"] = ""
        con_node = soup.find(class_=con_tag[0])
        if con_node:
            red_data["con"] = con_node.get_text()
        else:
            red_data["con"] = ""
        return new_urls, red_data
    def add_new_url(self, my_root_url):
        if my_root_url is None:
            return
        if my_root_url not in self.new_urls and my_root_url not in self.old_urls:
            self.new_urls.add(my_root_url)

    def add_new_urls(self, urls):
        if urls is None or len(urls) == 0:
            return
        for url in urls:
            self.add_new_url(url)

    def has_new_url(self):
        return len(self.new_urls) != 0

    def get_new_url(self):
        new_url = self.new_urls.pop()
        self.old_urls.add(new_url)
        return new_url
if __name__ == "__main__":
    root_url = "http://www.******.cn"  #
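The listing breaks off at this point. As a minimal sketch of how the rest of the main block could drive the class (the class names and output folder below are hypothetical placeholders I chose for illustration, not values from the original post):

    title_tag = ["title"]           # hypothetical: CSS class of the headline element on the target site
    con_tag = ["content"]           # hypothetical: CSS class of the article-body element
    OutputFile = "D:\\spider_out"   # hypothetical: existing folder where the .txt files are written
    spider(root_url, title_tag, con_tag, OutputFile)   # crawling starts immediately inside __init__

Note that the constructor runs the whole crawl loop itself, so simply creating the spider object is enough; adjust title_tag and con_tag to match the actual class attributes of the pages you want to scrape.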