链接验证

原创

辉晖飛 2018-02-10 17:22:41 博主文章分类：Python ©著作权

©著作权归作者所有：来自51CTO博客作者辉晖飛的原创作品，请联系作者获取转载授权，否则将追究法律责任

用途：对给定的网页URL，区分可用和不可用的二级链接

#!/usr/bin/env python    
# -*- coding: utf-8 -*-
import time
import requests
import bs4


try:
    from urllib.parse import urldefrag, urljoin, urlparse  # Python 3
except ImportError:
    from urlparse import urldefrag, urljoin, urlparse  # Python 2

# Browser-like User-Agent sent with every request.
HEADERS = {'user-agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36'}
# Per-request timeout, in seconds.
TIMEOUT = 3
# Pause between two consecutive link checks, in seconds.
DELAY = 2


def resolve_link(page_url, href):
    """Return the absolute HTTP(S) URL that ``href`` points to, or ``None``.

    Args:
        page_url: URL of the page the link was found on; relative links
            are resolved against it.
        href: Raw value of an ``<a href=...>`` attribute, or ``None`` when
            the attribute is missing.

    Returns:
        The absolute URL with any ``#fragment`` removed, or ``None`` when
        there is nothing to fetch: a missing or empty ``href``, a
        same-page anchor, or a non-HTTP(S) scheme such as ``mailto:`` or
        ``javascript:``.
    """
    if href is None:
        return None
    href = href.strip()
    if not href or href.startswith('#'):
        return None
    # urljoin copes with "page.html", "/path", "../up" and "//host/path";
    # plain string concatenation with the site root only handles "/path".
    absolute = urldefrag(urljoin(page_url, href))[0]
    if urlparse(absolute).scheme not in ('http', 'https'):
        return None
    return absolute


def check_link(session, link):
    """Return ``True`` if a GET of ``link`` ends with a non-error status.

    Args:
        session: ``requests.Session`` used to issue the request.
        link: Absolute URL to fetch.

    Returns:
        ``False`` when the request fails or the final status is 4xx/5xx,
        ``True`` otherwise.
    """
    try:
        response = session.get(link, timeout=TIMEOUT)
        response.raise_for_status()
    # ValueError is caught as well so that one malformed URL is reported
    # as bad instead of aborting the whole run; KeyboardInterrupt is
    # deliberately not caught so Ctrl-C still stops the script.
    except (requests.RequestException, ValueError):
        return False
    return True


def main():
    """Prompt for a page URL and report each link on it as ok or bad.

    Every distinct HTTP(S) link found in an ``<a>`` tag of the page is
    fetched once and a line ``<url> is ok`` or ``<url> is bad`` is
    printed. If the page itself cannot be fetched, an error message and
    the exception are printed instead.
    """
    try:
        read_line = raw_input  # Python 2
    except NameError:
        read_line = input  # Python 3
    url = read_line('请输入需要验证的链接：').strip()

    # The session is closed on exit, releasing its pooled connections.
    with requests.Session() as session:
        session.headers.update(HEADERS)
        try:
            res = session.get(url, timeout=TIMEOUT)
            res.raise_for_status()
        except requests.RequestException as e:
            print('输入的链接不可用')
            print(e)
            return

        soup = bs4.BeautifulSoup(res.text, 'lxml')

        # Collect each target once, in page order, so repeated links are
        # not fetched (and waited for) again.
        seen = set()
        links = []
        for a_link in soup.select('a'):
            # res.url is the final URL after redirects, which is what
            # relative links on the page are relative to.
            link = resolve_link(res.url, a_link.get('href'))
            if link is not None and link not in seen:
                seen.add(link)
                links.append(link)

        for index, link in enumerate(links):
            if index:
                # Throttle so the target site is not hammered.
                time.sleep(DELAY)
            if check_link(session, link):
                print(('%s is ok') % (link))
            else:
                print(('%s is bad') % (link))


if __name__ == '__main__':
    main()