Python爬虫系列之微信小程序多线程爬取图书数据，存储至 Excel 文件

一、代码实现

import requests
import json
import time
import xlrd
import xlwt
from xlutils.copy import copy

'''
    @Author     :王磊
    @Date       :2019/9/19
    @Description:某微信小程序图书数据爬取
'''

class MYJSpider:
    """Crawl book listings from the app.manyoujing.net API into an Excel sheet.

    The crawl walks every group returned by the category endpoint, pages
    through that group's book list and appends one spreadsheet row per book
    (id, name, price, discounted price, category name, group name).
    """

    # Page size sent to the list endpoint; also used to derive the page count,
    # so the two can never drift apart.
    pageSize = 10

    # Seconds to pause between groups and between failed request attempts.
    requestDelay = 2

    # Keys read from a book dict, in the same order as the header columns.
    _rowKeys = ('id', 'name', 'price', 'discountPrice', 'categoryName', 'groupName')

    def __init__(self, excelPath="c:/users/it1002/Desktop/crawler/books.xlsx", authorization=None):
        """Set up request headers and the output location.

        Args:
            excelPath: Path of the workbook to create and append to.
                NOTE: the workbook is written with xlwt, which produces the
                legacy .xls format; the default name ends in ".xlsx" only for
                backward compatibility with the original script.
            authorization: Value for the ``Authorization`` header. When
                omitted, the hard-coded token below is used.
        """
        if authorization is None:
            # NOTE(review): hard-coded credential; pass `authorization` to
            # supply a current one instead of editing the source.
            authorization = "eyJhbGciOiJIUzI1NiIsInR5ssI6IkpXVCJ9.eyJpYXQiOjE1NjUwNzU3OTUsInVpZCI6IkpSbEd2ODRxalJXaiJ9.Q3vy5OWD7M-9cye7LdqBnZhbxnVe0Cqb0Tjz6msHNfA"
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.5(0x17000523) NetType/WIFI Language/zh_CN",
            "Authorization": authorization,
            "Referer": "https://servicewechat.com/wx816dc6e826dcc6b5/197/page-frame.html"
        }
        self.excelPath = excelPath
        self.excelTitle = ['id', '书名', '价格', '折后价', '一级类目', '二级类目']

    def initExcel(self):
        """Create a fresh workbook at ``self.excelPath`` containing only the header row."""
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet(u'double', cell_overwrite_ok=True)
        for col, title in enumerate(self.excelTitle):
            sheet.write(0, col, title)
        workbook.save(self.excelPath)

    def getHTML(self, url, max_retries=5, timeout=10):
        """GET ``url`` and return the decoded JSON body.

        Args:
            url: Full request URL.
            max_retries: Maximum number of attempts before giving up.
            timeout: Per-request timeout in seconds passed to ``requests.get``.

        Returns:
            The parsed JSON value, or ``None`` when every attempt failed.
        """
        for attempt in range(1, max_retries + 1):
            try:
                resp = requests.get(url, headers=self.headers, timeout=timeout)
                return json.loads(resp.content.decode("utf-8"))
            except (requests.RequestException, ValueError) as e:
                # ValueError covers both invalid JSON and invalid UTF-8.
                print(e)
                if attempt < max_retries:
                    # Pause so a failing endpoint is not hit in a tight loop.
                    time.sleep(self.requestDelay)
        return None

    def writeRows(self, rows):
        """Append several rows to the first sheet with a single open/save cycle.

        Args:
            rows: Iterable of row sequences; each inner value becomes one cell.

        A cell that cannot be written is reported and skipped so that the
        remaining cells are still saved.
        """
        rows = list(rows)
        if not rows:
            return
        workbook = xlrd.open_workbook(self.excelPath)
        first_free_row = workbook.sheet_by_index(0).nrows
        new_workbook = copy(workbook)
        new_worksheet = new_workbook.get_sheet(0)
        for offset, row in enumerate(rows):
            for col, value in enumerate(row):
                try:
                    new_worksheet.write(first_free_row + offset, col, value)
                except Exception as e:
                    print("skipped cell ({0}, {1}): {2}".format(first_free_row + offset, col, e))
        new_workbook.save(self.excelPath)

    def writeExcel(self, data):
        """Append one row (a sequence of cell values) to the workbook."""
        self.writeRows([data])

    def getBookList(self, group_id, page):
        """Fetch one page of books for ``group_id``; returns the parsed response or ``None``."""
        url = "https://app.manyoujing.net/v2/goods/list/group?group_id={0}&order_by=0&page={1}&page_size={2}".format(
            group_id, page, self.pageSize)
        return self.getHTML(url)

    def getCategoryList(self):
        """Fetch the list of book groups; returns the parsed response or ``None``."""
        url = "https://app.manyoujing.net/v2/user/goods/group?page=1&page_size=9999"
        return self.getHTML(url)

    def pipLine(self, book):
        """Convert ``book`` to a row and append it to the workbook.

        Missing keys become empty cells. Failures are reported instead of
        raised so that one bad record does not stop the crawl.
        """
        try:
            self.writeExcel(self._bookToRow(book))
        except Exception as e:
            print("failed to store book: {0}".format(e))

    def main(self):
        """Run the whole crawl: create the workbook, then store every group's books."""
        self.initExcel()
        categoryResp = self.getCategoryList()
        if not self._isOk(categoryResp):
            print("could not load the category list; nothing to crawl")
            return
        for group in categoryResp['data']['groups']:
            time.sleep(self.requestDelay)
            self._crawlGroup(group)

    def _crawlGroup(self, group):
        """Store every book of one group, reusing the first page's response."""
        groupId = group['id']
        groupMeta = {'groupName': group['name'], 'categoryName': group['category_name']}
        firstPage = self.getBookList(groupId, 1)
        if not self._isOk(firstPage):
            return
        totalPages = self._pageCount(int(firstPage['data']['total']), self.pageSize)
        if totalPages < 1:
            return
        # Page 1 is already in hand; requesting it again would be wasted traffic.
        self._storePage(firstPage, groupMeta)
        if totalPages > 1:
            time.sleep(self.requestDelay)
        for page in range(2, totalPages + 1):
            pageResp = self.getBookList(groupId, page)
            if self._isOk(pageResp):
                self._storePage(pageResp, groupMeta)

    def _storePage(self, pageResp, groupMeta):
        """Write all items of one successful list response in a single batch."""
        rows = []
        for book in pageResp['data']['items']:
            metaBook = dict(groupMeta)
            metaBook['id'] = book.get('id', "")
            metaBook['name'] = book.get('title', "")
            metaBook['price'] = book.get('min_selling_price', "")
            metaBook['discountPrice'] = book.get('min_deduction_price', "")
            rows.append(self._bookToRow(metaBook))
        try:
            self.writeRows(rows)
        except Exception as e:
            print("failed to store page: {0}".format(e))

    @classmethod
    def _bookToRow(cls, book):
        """Return the cell values for ``book`` in column order, "" for missing keys."""
        if not isinstance(book, dict):
            book = {}
        return [book.get(key, "") for key in cls._rowKeys]

    @staticmethod
    def _isOk(resp):
        """Return True when ``resp`` is a dict whose ``code`` equals 0."""
        return isinstance(resp, dict) and resp.get('code') == 0

    @staticmethod
    def _pageCount(total, page_size):
        """Return the number of pages needed for ``total`` items (ceiling division)."""
        return (total + page_size - 1) // page_size


if __name__ == '__main__':
    # Script entry point: build the spider and run the full crawl.
    MYJSpider().main()