上一篇被 ban 了,这里重写。首先声明:这只是个玩具爬虫,先得到自己所有博客的地址,然后随机访问。

思想很简单,包含了 2 个类:IPSpyder 和 CSDN。前者保证一周内只 get 一次 ip 代理并保存到本地;后者包含 3 个方法,负责随机读取博客:getBlogList() 方法的输入是个人博客的主页地址,输出是个人博客所有的链接;getBlogTitleAndCount() 的输入是单个博客的 url 地址,拿到当前博客的访问量和标题并输出;beginTO() 负责把前两个方法串起来,随机挑一篇博客访问。

 IP代理的爬虫参考:爬取IP代理

import requests
import lxml
from bs4 import BeautifulSoup
import os
import string
import random
import time
import aiohttp
import asyncio
from tqdm import tqdm
import os
import datetime
 
 
class IPSpyder(object):
    """Scrape ``ip:port`` proxy candidates from several public listing sites.

    Every scraped candidate is appended to ``ip_list_all``; ``test_ip``
    appends the candidates that answered with HTTP 200 to ``ip_ok_list_all``.
    """

    def __init__(self):
        # Browser-like User-Agent sent with every request.
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'}
        # Exclusive upper bounds of the index passed to range(1, bound)
        # when building the per-site page URLs.
        self.sixsix_url_range = 35
        self.kaixin_url_range = 2
        self.kuai_url_range = 2
        self.ip_list_all = []
        self.ip_ok_list_all = []
        # NOTE(review): this assignment was an unterminated string literal
        # (the URL text is missing from the source), which made the whole
        # file a SyntaxError. Presumably it is the URL proxies are tested
        # against -- set it before using it with test_ip().
        self.url = ''
        self.ip_avaliable_file = 'F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt'

    def get_html(self, url, flag):
        """Fetch ``url`` and return the decoded page text.

        Args:
            url: Page to download.
            flag: When truthy decode the body as utf-8, otherwise as gb2312.

        Returns:
            The page text, or the sentinel string '请求异常' when the request
            fails (callers then simply find no table rows in it).
        """
        try:
            # A timeout keeps one dead listing site from hanging the crawl.
            response = requests.get(url, headers=self.headers, timeout=15)
            response.raise_for_status()
        except requests.RequestException:
            return '请求异常'
        response.encoding = 'utf-8' if flag else 'gb2312'
        return response.text

    def _proxy_rows(self, html, skip_rows):
        """Yield the ``<td>`` strings of each table row that has an ip and a port.

        The first ``skip_rows`` ``<tr>`` elements are ignored. Rows with fewer
        than two cells, or whose first two cells have no single string
        (``.string`` is None), are skipped instead of raising.
        """
        soup = BeautifulSoup(html, 'lxml')
        for tr_ in soup.find_all(name='tr')[skip_rows:]:
            cells = [td.string for td in tr_.find_all(name='td')]
            if len(cells) < 2 or cells[0] is None or cells[1] is None:
                continue
            yield cells

    def _append_proxy(self, ip, port):
        """Store one ``ip:port`` candidate, stripped of surrounding whitespace."""
        self.ip_list_all.append(str(ip).strip() + ':' + str(port).strip())

    def get_66ip(self):
        """Scrape www.66ip.cn area pages 1 .. sixsix_url_range - 1."""
        for index in range(1, self.sixsix_url_range):
            count = 0
            province = ''
            url = 'http://www.66ip.cn/areaindex_{}/1.html'.format(index)
            html = self.get_html(url, flag=False)
            for cells in self._proxy_rows(html, 2):
                self._append_proxy(cells[0], cells[1])
                # The third column is only used for the progress message.
                if len(cells) > 2 and cells[2] is not None:
                    province = cells[2]
                count += 1
            print('Saved {0} {1} ip.'.format(province, count))
            # Do not go too fast, otherwise the page content is not served.
            time.sleep(3)
        print('66 daili Finished!!!')

    def get_kaixinip(self):
        """Scrape www.kxdaili.com pages 1 .. kaixin_url_range - 1."""
        for index in range(1, self.kaixin_url_range):
            count = 0
            url = 'http://www.kxdaili.com/dailiip/1/{}.html'.format(index)
            html = self.get_html(url, False)
            for cells in self._proxy_rows(html, 2):
                self._append_proxy(cells[0], cells[1])
                count += 1
            print('Saved {0} page {1} ip.'.format(index, count))
            # Do not go too fast, otherwise the page content is not served.
            time.sleep(3)
        print('kaixindaili Finished!!!')

    def get_goubanjiaip(self):
        """Scrape www.goubanjia.com, rebuilding each address from its child nodes.

        Each element with class ``ip`` is walked child by child. Text is kept
        from a literal ':' text node, from tags without attributes, from tags
        whose first attribute is ``class``, and from tags whose ``style`` is
        ``display:inline-block;`` (with or without a space); everything else
        is ignored.
        """
        visible_styles = ('display:inline-block;', 'display: inline-block;')
        url = 'http://www.goubanjia.com/'
        html = self.get_html(url, False)
        soup = BeautifulSoup(html, 'lxml')
        for td_ in soup.find_all(class_='ip'):
            ip_ = ''
            for child in td_.children:
                if isinstance(child, str):
                    # Text nodes have no ``attrs``; only the ':' separator is
                    # kept (other text nodes used to raise AttributeError).
                    if child == ':':
                        ip_ += child
                    continue
                attrs = child.attrs
                if not attrs or next(iter(attrs)) == 'class':
                    ip_ += child.get_text()
                elif attrs.get('style') in visible_styles:
                    # .get() avoids a KeyError on tags that carry no style.
                    ip_ += child.get_text()
            if ip_:
                self.ip_list_all.append(ip_)
        print('quanwang daili Finished!!!')

    # kuaidaili
    def get_kuaidaili(self):
        """Scrape www.kuaidaili.com free pages 1 .. kuai_url_range - 1."""
        for index in range(1, self.kuai_url_range):
            count = 0
            url = 'https://www.kuaidaili.com/free/inha/{}/'.format(index)
            html = self.get_html(url, False)
            for cells in self._proxy_rows(html, 1):
                self._append_proxy(cells[0], cells[1])
                count += 1
            print('Saved {0} page {1} ip.'.format(index, count))
            # Do not go too fast, otherwise the page content is not served.
            time.sleep(3)
        print('kuaidaili Finished!!!')

    async def test_ip(self, ip_, url):
        """Request ``url`` through proxy ``ip_`` and record it when it works.

        Args:
            ip_: Proxy as ``ip:port`` (``http://`` is prepended here).
            url: Target requested through the proxy.

        A 200 response appends ``ip_`` to ``ip_ok_list_all``; any other
        status or any request error is only printed.
        """
        # NOTE(review): ``verify_ssl`` is deprecated in aiohttp 3.x in favour
        # of ``ssl=False``; kept because the installed version is unknown.
        conn = aiohttp.TCPConnector(verify_ssl=False)
        async with aiohttp.ClientSession(connector=conn) as session:
            try:
                proxy_ip = 'http://' + ip_
                print('正在测试: ' + proxy_ip)
                async with session.get(url=url, headers=self.headers, proxy=proxy_ip, timeout=15) as response:
                    if response.status == 200:
                        print('代理可用: ' + ip_)
                        self.ip_ok_list_all.append(ip_)
                    else:
                        print('请求响应码不合法 ' + ip_)
            except Exception:
                # ``except Exception`` (not a bare except) so that task
                # cancellation and KeyboardInterrupt still propagate.
                print('代理请求失败', ip_)

    def run_test_ip_write_to_file(self):
        """Placeholder: only prints a notice.

        The implementation is not present in this source, so no proxy is
        tested and nothing is written to ``ip_avaliable_file`` here.
        """
        print('csdn 点赞关注私聊发')
 
 
# 我的博客列表,后面要跟翻页list/1
# 我的博客列表有几页?
# header
# 定义一个类
 
class CSDN(object):
    """Open one randomly chosen post of a CSDN blog through an HTTP proxy.

    Args:
        my_csdn: Base URL of the paginated article list; the page number is
            appended directly to it (``my_csdn + str(page)``), so it must end
            right before the page number.
        my_list: Number of article-list pages to walk (pages 1 .. my_list).
    """

    def __init__(self, my_csdn='', my_list=5):
        self.my_list = my_list
        # Previously read by getBlogList() without ever being assigned, which
        # raised AttributeError on every call.
        self.my_csdn = my_csdn
        # URL of the post picked for the current visit.
        self.csdn_url = ''
        self.proxies = [{'http': 'socks5://183.195.106.118:8118'}]
        # Post links collected by getBlogList().
        self.blogList = []
        self.headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
                        'AppleWebKit/537.36 (KHTML, like Gecko) Chrome/71.0.3578.98 Safari/537.36'
                        }

    def getBlogList(self):
        """Walk the article-list pages and collect post links into ``blogList``.

        A link is kept when its ``href`` contains 'details' and does not
        contain 'comments'.

        Raises:
            ValueError: If ``my_csdn`` has not been configured.
            requests.RequestException: If a list page cannot be fetched.
        """
        if not self.my_csdn:
            raise ValueError(
                'my_csdn is empty: set it to the article-list base URL first')
        print('-------------------------------begin----------------------------')
        for page in range(1, self.my_list + 1):
            response = requests.get(
                self.my_csdn + str(page), headers=self.headers, timeout=15)
            conent = response.content.decode('utf-8', errors='replace')
            soup = BeautifulSoup(conent, 'lxml')
            for a_tag in soup.find_all('a'):
                href = str(a_tag.get('href'))
                if 'details' in href and 'comments' not in href:
                    self.blogList.append(href)
            print('Success, already append ' +
                  str(len(self.blogList)) + ' to the blogList!')

    def getBlogTitleAndCount(self, proxy):
        """Fetch ``csdn_url`` through ``proxy`` and print its title and read count.

        Args:
            proxy: Proxy address as ``ip:port``.

        Raises:
            requests.RequestException: If the post cannot be fetched.
        """
        # The proxy itself speaks plain HTTP, so the 'https' entry must also
        # use an http:// proxy URL; 'https://' would attempt TLS to the proxy.
        proxy_support = {
            'http': 'http://' + proxy,
            'https': 'http://' + proxy,
        }
        response = requests.get(
            self.csdn_url, headers=self.headers, proxies=proxy_support,
            timeout=15)
        conent = response.content.decode('utf-8', errors='replace')
        soup = BeautifulSoup(conent, 'lxml')

        # Title of the current post; empty when the page has no usable <title>.
        title_tag = soup.title
        blog_title = ''
        if title_tag is not None and title_tag.string is not None:
            blog_title = title_tag.string

        # Read counter: text of the last <span> whose first class is
        # 'read-count'. Falls back to 'N/A' instead of an unbound local.
        blog_count_now = 'N/A'
        for span in soup.find_all('span'):
            classes = span.get('class')
            if classes and classes[0] == 'read-count' and span.string is not None:
                blog_count_now = span.string
        print('当前读取的博客地址是:【'+self.csdn_url+'】\n' +
              '当前读取的博客标题是:【'+blog_title + '】\n' +
              '当前使用的代理IP是:【'+proxy + '】\n' +
              '当前博客的阅读统计是:【_' + blog_count_now + '_次】')

    def beginTO(self, proxy):
        """Refresh the link list, visit one random post, then clear the list.

        Args:
            proxy: Proxy address as ``ip:port`` used for the visit.
        """
        try:
            self.getBlogList()
            if not self.blogList:
                # random.choice() raises IndexError on an empty sequence.
                print('No blog links found; nothing to visit.')
                return
            self.csdn_url = random.choice(self.blogList)
            self.getBlogTitleAndCount(proxy)
        finally:
            # Always reset so a failed round cannot leave stale or
            # duplicated links behind for the next call.
            self.blogList = []
 
 
# Entry point: refresh the local proxy file when it is missing, empty or older
# than one week, then keep visiting random posts through one proxy from it.
ip_avaliable = "F:/2020-11-04/csdn_sypder_20210124/ip_avaliable.txt"


def ip_file_needs_refresh(path, max_age_days=7):
    """Return True when the proxy file at ``path`` must be rebuilt.

    That is the case when the file cannot be stat'ed (e.g. it does not
    exist), is empty, or was last modified more than ``max_age_days`` ago.
    """
    try:
        info = os.stat(path)
    except OSError:
        # os.stat() used to run before the existence check and crashed here.
        return True
    if info.st_size == 0:
        return True
    # Modification time rather than st_ctime: st_ctime is the creation time
    # on Windows but the metadata-change time on Unix, and only the
    # modification time reliably moves when the file is rewritten.
    age_days = (time.time() - info.st_mtime) / (3600 * 24)
    return age_days > max_age_days


def load_proxy_list(path):
    """Read the comma-separated ``ip:port`` entries stored in ``path``.

    Surrounding whitespace is stripped and empty entries are dropped.
    """
    with open(path, 'r') as file_ip:
        entries = file_ip.read().split(",")
    return [entry.strip() for entry in entries if entry.strip()]


def main():
    """Refresh proxies if needed, then visit random posts forever."""
    if ip_file_needs_refresh(ip_avaliable):
        # Scrape proxies first, then visit the blog.
        ipSpyder = IPSpyder()
        ipSpyder.get_66ip()
        # Other sources that can be enabled as well:
        # ipSpyder.get_kaixinip()
        # ipSpyder.get_goubanjiaip()
        # ipSpyder.get_kuaidaili()
        ipSpyder.run_test_ip_write_to_file()

    try:
        ip_avaliable_list = load_proxy_list(ip_avaliable)
    except OSError as err:
        raise SystemExit('Cannot read proxy file {0}: {1}'.format(ip_avaliable, err))
    if not ip_avaliable_list:
        # random.choice() raises IndexError on an empty list.
        raise SystemExit('No proxy found in ' + ip_avaliable)

    proxy_now = random.choice(ip_avaliable_list)
    csdn = CSDN()
    while True:
        print('csdn 点赞关注私聊发')
        csdn.beginTO(proxy_now)
        time.sleep(10)


if __name__ == "__main__":
    main()
 
 
#ipSpyder =IPSpyder()
# ipSpyder.get_66ip()
# ipSpyder.get_kaixinip()
# ipSpyder.get_goubanjiaip()
# ipSpyder.get_kuaidaili()
# ipSpyder.run_test_ip()
#
# time.localtime(statinfo)
#print ('得到了一系列的IP代理,总共有 '+str(len(ipSpyder.ip_list_all))+' 个;')
#print ('经过测试总共有 '+str(len(ipSpyder.ip_ok_list_all))+' 个IP代理可用;')
#file = open("ip_avaliable.txt", 'w')
# file.write(ip_ok_list_all)
# file.close()

输出的范例如下:

Success, already append 48 to the blogList!
Success, already append 96 to the blogList!
Success, already append 144 to the blogList!
Success, already append 192 to the blogList!
Success, already append 211 to the blogList!
当前读取的博客地址是:【数据挖掘算法和实践(一):线性回归和逻辑回归(house_price数据集)_叶子叶来-
当前使用的代理IP是:【211.144.213.145:80】
当前博客的阅读统计是:【_351_次】

python爬虫(五):提高博客访问量(ip proxy)_xml

效果图

增加linux运行中的crontab设置:

#!/bin/bash
###################################################
#@name      :check_python_spider.sh
#@caption   :crontab script that checks whether the python spider is running;
#@AUTHOR    :yzg
#@create-date 2021-02-02
#@mender
#@modify_desc  :runs every five minutes and (re)starts the spider when it is not running
#*/5 * * * * sh /home/yzg/csdn/check_python_spider.sh
###################################################
# set environment
export LANG=en_US.utf8
# shell_path / log_path were used below without ever being set, so `cd`
# silently went to $HOME and the log landed in a bare file name.
# The defaults follow the crontab line above (assumption -- adjust if the
# spider lives elsewhere); both can be overridden from the environment.
# log_path is a prefix: the log file is ${log_path}${date_time}.
shell_path="${shell_path:-/home/yzg/csdn}"
log_path="${log_path:-${shell_path}/}"
#date_time=log_"`date +%Y%m%d%H%M`"
date_time=log_"`date +%Y%m%d`"

cd "${shell_path}"
if [ $? -eq 0 ]; then
    echo `date +"%Y-%m-%d %H:%M:%S"`"切换文件夹成功!"${shell_path}>>"${log_path}${date_time}"
    # The [I] bracket keeps grep from matching its own command line, so the
    # count is the number of spider processes only.
    cunt=`ps -ef | grep -c '[I]PSpyder_for_linux'`
    echo $cunt
    # Start the spider only when none is running; the old `-ne 2` test
    # launched another copy whenever two or more were already running.
    if [ "$cunt" -eq 0 ]; then
        nohup python -u IPSpyder_for_linux.py > spider.out.1 2>&1 &
    fi
else
    echo `date +"%Y-%m-%d %H:%M:%S"`"切换文件夹失败!"${shell_path}>>"${log_path}${date_time}"
fi