用途:对给定的网页URL,区分可用和不可用的二级链接
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Check which links on a web page are reachable.

Prompts for a page URL, downloads the page, then requests every ``<a href>``
target found on it and prints ``<url> is ok`` or ``<url> is bad`` for each.
The script runs unchanged on Python 2 and Python 3.
"""
import time

import bs4
import requests

try:  # Python 3
    from urllib.parse import urljoin, urlparse
except ImportError:  # Python 2
    from urlparse import urljoin, urlparse

try:  # Python 2
    read_line = raw_input
except NameError:  # Python 3
    read_line = input

# Browser-like User-Agent sent with every request.
HEADERS = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
# Seconds to wait for a server before giving up on a request.
TIMEOUT = 3
# Seconds to pause after each link check.
REQUEST_DELAY = 2


def _resolve_link(base_url, href):
    """Turn a raw ``href`` value into an absolute URL worth checking.

    Args:
        base_url: URL of the page the link was found on.
        href: Raw ``href`` attribute value, or None when the attribute is
            missing.

    Returns:
        The absolute URL, or None when the link should be skipped: missing
        or empty href, a fragment-only link into the same page, or a
        non-HTTP scheme such as ``mailto:`` or ``javascript:``.
    """
    if href is None:
        return None
    href = href.strip()
    if not href or href.startswith('#'):
        return None
    try:
        # urljoin handles every relative form ("x.html", "../x", "/x",
        # "//host/x"); plain concatenation with the site root does not.
        absolute = urljoin(base_url, href)
        scheme = urlparse(absolute).scheme
    except ValueError:
        # Unparseable href (e.g. an unbalanced IPv6 bracket): hand it back
        # untouched so the request fails and it is reported as bad.
        return href
    if scheme not in ('http', 'https'):
        return None
    return absolute


def _extract_links(base_url, html):
    """Return the unique absolute link targets in *html*, in document order.

    Args:
        base_url: URL used to resolve relative links.
        html: Markup of the page to scan.

    Returns:
        A list of absolute URLs with duplicates removed, so each target is
        requested only once.
    """
    soup = bs4.BeautifulSoup(html, 'lxml')
    links = []
    seen = set()
    for anchor in soup.select('a'):
        link = _resolve_link(base_url, anchor.get('href'))
        if link is None or link in seen:
            continue
        seen.add(link)
        links.append(link)
    return links


def _is_reachable(session, link):
    """Return True when *link* answers with a non-error HTTP status.

    Connection failures, timeouts, invalid URLs and 4xx/5xx responses all
    count as unreachable. KeyboardInterrupt is deliberately not caught, so
    Ctrl-C still stops the script.
    """
    try:
        response = session.get(link, timeout=TIMEOUT)
    except (requests.RequestException, ValueError):
        return False
    # Response.ok is True for every status below 400, so 2xx codes other
    # than 200 are reported as reachable too.
    return response.ok


def main():
    """Prompt for a page URL and report the status of every link on it."""
    url = read_line('请输入需要验证的链接:').strip()
    # The session is closed on exit, which releases its pooled connections.
    with requests.Session() as session:
        session.headers.update(HEADERS)
        try:
            page = session.get(url, timeout=TIMEOUT)
            page.raise_for_status()
        except requests.RequestException as err:
            print('输入的链接不可用')
            print(err)
            return
        # Resolve against page.url (the URL after redirects) rather than
        # the typed URL, so relative links follow any redirect.
        for link in _extract_links(page.url, page.text):
            if _is_reachable(session, link):
                print('%s is ok' % link)
            else:
                print('%s is bad' % link)
            time.sleep(REQUEST_DELAY)


if __name__ == '__main__':
    main()