Python爬虫系列之微信小程序多线程爬取图书数据,存储至excel文件
一、代码实现
import requests
import json
import time
import xlrd
import xlwt
from xlutils.copy import copy
'''
@Author :王磊
@Date :2019/9/19
@Description:某微信小程序图书数据爬取
'''
class MYJSpider:
    """Crawl book data from a WeChat mini-program API into an Excel sheet.

    ``main`` fetches the list of goods groups, pages through the books of
    each group and appends one spreadsheet row per book.
    """

    # Items requested per page; also used to derive the number of pages.
    PAGE_SIZE = 10
    # Seconds to pause before each group and after its first page request.
    REQUEST_DELAY = 2
    # Seconds to wait for a response before a request counts as failed.
    REQUEST_TIMEOUT = 10
    # Seconds to pause between two attempts of a failed request.
    RETRY_DELAY = 1
    # Keys read from a book record, in the column order of ``excelTitle``.
    ROW_FIELDS = ('id', 'name', 'price', 'discountPrice', 'categoryName', 'groupName')

    def __init__(self, excelPath="c:/users/it1002/Desktop/crawler/books.xlsx"):
        """Set up the request headers and the output location.

        :param excelPath: file the workbook is created in and appended to.
        """
        # NOTE(review): the Authorization value is a hard-coded credential; it
        # should come from configuration instead of living in the source.
        self.headers = {
            "User-Agent": "Mozilla/5.0 (iPhone; CPU iPhone OS 12_3_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Mobile/15E148 MicroMessenger/7.0.5(0x17000523) NetType/WIFI Language/zh_CN",
            "Authorization": "eyJhbGciOiJIUzI1NiIsInR5ssI6IkpXVCJ9.eyJpYXQiOjE1NjUwNzU3OTUsInVpZCI6IkpSbEd2ODRxalJXaiJ9.Q3vy5OWD7M-9cye7LdqBnZhbxnVe0Cqb0Tjz6msHNfA",
            "Referer": "https://servicewechat.com/wx816dc6e826dcc6b5/197/page-frame.html"
        }
        # NOTE(review): xlwt writes the legacy .xls format whatever the file is
        # called, so the .xlsx extension of the default path is misleading. The
        # default is kept unchanged for compatibility with existing runs.
        self.excelPath = excelPath
        self.excelTitle = ['id', '书名', '价格', '折后价', '一级类目', '二级类目']

    def initExcel(self):
        """Create a new workbook at ``excelPath`` holding only the header row."""
        workbook = xlwt.Workbook()
        sheet = workbook.add_sheet(u'double', cell_overwrite_ok=True)
        for column, title in enumerate(self.excelTitle):
            sheet.write(0, column, title)
        workbook.save(self.excelPath)

    def getHTML(self, url, max_retries=None):
        """GET ``url`` and return the response body parsed as JSON.

        Failed attempts (network error, timeout, undecodable or non-JSON
        body) are printed and retried after ``RETRY_DELAY`` seconds.

        :param url: address to request.
        :param max_retries: number of retries allowed after the first
            attempt; ``None`` (the default) retries until a request succeeds.
        :return: the decoded JSON document.
        :raises Exception: the last failure, once ``max_retries`` is exceeded.
        """
        failures = 0
        while True:
            try:
                resp = requests.get(url, headers=self.headers, timeout=self.REQUEST_TIMEOUT)
                return json.loads(resp.content.decode("utf-8"))
            except (requests.RequestException, ValueError) as e:
                # ValueError covers both UTF-8 decoding and JSON parsing errors.
                print(e)
                failures += 1
                if max_retries is not None and failures > max_retries:
                    raise
                # Pause so a persistent failure does not hammer the server.
                time.sleep(self.RETRY_DELAY)

    def writeExcel(self, data):
        """Append ``data`` as one new row below the last row of the first sheet.

        The workbook is re-read, copied and saved on every call, so the row
        is on disk when this method returns.

        :param data: sequence of cell values, one per column.
        """
        workbook = xlrd.open_workbook(self.excelPath)
        nextRow = workbook.sheet_by_index(0).nrows
        writable = copy(workbook)
        sheet = writable.get_sheet(0)
        for column, value in enumerate(data):
            try:
                sheet.write(nextRow, column, value)
            except Exception as e:
                # Keep the other cells of the row, but report the failed one.
                print(e)
        writable.save(self.excelPath)

    def getBookList(self, group_id, page):
        """Return the parsed response for one page of books of a group.

        :param group_id: identifier of the goods group.
        :param page: 1-based page number.
        """
        url = (
            "https://app.manyoujing.net/v2/goods/list/group?group_id=" + str(group_id)
            + "&order_by=0&page=" + str(page)
            + "&page_size=" + str(self.PAGE_SIZE)
        )
        return self.getHTML(url)

    def getCategoryList(self):
        """Return the parsed response listing all goods groups."""
        url = "https://app.manyoujing.net/v2/user/goods/group?page=1&page_size=9999"
        return self.getHTML(url)

    @staticmethod
    def _field(book, key):
        """Return ``book[key]``, or an empty string when it cannot be read."""
        try:
            return book[key]
        except (LookupError, TypeError):
            return ""

    def pipLine(self, book):
        """Write one book record to the spreadsheet.

        Missing fields become empty cells. A failure while writing is
        printed instead of raised, so one bad row does not stop the crawl.

        :param book: mapping that may contain the keys in ``ROW_FIELDS``.
        """
        row = [self._field(book, key) for key in self.ROW_FIELDS]
        try:
            self.writeExcel(row)
        except Exception as e:
            print(e)

    def _crawlGroup(self, group):
        """Fetch every page of one goods group and store its books."""
        groupId = group['id']
        groupName = group['name']
        categoryName = group['category_name']
        firstPage = self.getBookList(groupId, 1)
        if firstPage['code'] != 0:
            print("book list request failed for group %s: code=%s" % (groupId, firstPage['code']))
            return
        total = int(firstPage['data']['total'])
        # Ceiling division: a partially filled last page still counts.
        totalPages = -(-total // self.PAGE_SIZE)
        time.sleep(self.REQUEST_DELAY)
        for page in range(1, totalPages + 1):
            # The first page is already in hand; do not request it again.
            resp = firstPage if page == 1 else self.getBookList(groupId, page)
            if resp['code'] != 0:
                print("book list request failed for group %s page %s: code=%s" % (groupId, page, resp['code']))
                continue
            for book in resp['data']['items']:
                self.pipLine({
                    'groupName': groupName,
                    'categoryName': categoryName,
                    'id': book['id'],
                    'name': book['title'],
                    'price': book['min_selling_price'],
                    'discountPrice': book['min_deduction_price'],
                })

    def main(self):
        """Run the crawl: reset the workbook, then store every group's books."""
        self.initExcel()
        categoryResp = self.getCategoryList()
        if categoryResp['code'] != 0:
            print("category list request failed: code=%s" % categoryResp['code'])
            return
        for group in categoryResp['data']['groups']:
            time.sleep(self.REQUEST_DELAY)
            self._crawlGroup(group)
# Run the crawl only when this file is executed as a script, not on import.
if __name__ == '__main__':
    m = MYJSpider()
    m.main()