# coding=utf-8
import requests
import re
import random
import time
import json
from requests.packages.urllib3.exceptions import InsecureRequestWarning
import pandas as pd

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)  # suppress SSL certificate warnings
class tm(object):  # scrapes Tmall through the mobile (m.tmall.com) endpoints

    def __init__(self, path):  # directory where the CSV output is saved
        self.path = path

    def goodsid(self, url):  # collect every item id of a shop from its shop URL
        shopname = re.search('https://(.*?).tmall', url).group(1)
        searchurl = 'https://{}.m.tmall.com/shop/shop_auction_search.do?spm=a1z60.7754813.0.0.301755f0pZ1GjU&sort=default'.format(
            shopname)
        s = requests.session()
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8',
                   'Referer': 'https://{}.m.tmall.com/shop/shop_auction_search.htm?spm=a1z60.7754813.0.0.301755f0pZ1GjU&sort=default'.format(shopname)
                   }
        s.headers.update(headers)
        page1 = s.get(url=searchurl, verify=False).text
        print(page1)
        js = json.loads(page1)
        total_page = int(js['total_page'])
        shop_id = js['shop_id']
        shop_title = js['shop_title']
        item_id = re.findall('"item_id":(.*?),"', page1)
        title = re.findall('"title":"(.*?)","', page1)
        sold = re.findall('"sold":"(.*?)","', page1)
        totalSoldQuantity = re.findall('"totalSoldQuantity":(.*?),"', page1)
        skuurl = re.findall('"url":"(.*?)","', page1)
        price = re.findall('"price":"(.*?)","', page1)
        item_id_l = len(item_id)
        shop_id_list = [shop_id] * item_id_l        # repeat the shop id once per item
        shop_title_list = [shop_title] * item_id_l  # repeat the shop title once per item
        data = {'shop_id': shop_id_list, 'shop_title': shop_title_list, 'item_id': item_id, 'title': title,
                'sold': sold, 'totalSoldQuantity': totalSoldQuantity, 'skuurl': skuurl, 'price': price}
        df = pd.DataFrame(data=data)
        savepath = self.path + r'\tmgoodsid{}.csv'.format(shopname)
        print(savepath)
        df.to_csv(savepath, mode='a', index=False, encoding="GB18030")
        time.sleep(random.random() * 2)
        if total_page != 1:
            for i in range(2, total_page + 1):  # remaining result pages
                time.sleep(random.random() * 2)
                htmlurl = searchurl + '&p={}'.format(i)
                html = s.get(url=htmlurl, verify=False).text
                print(html)
                item_id = re.findall('"item_id":(.*?),"', html)
                title = re.findall('"title":"(.*?)","', html)
                sold = re.findall('"sold":"(.*?)","', html)
                totalSoldQuantity = re.findall('"totalSoldQuantity":(.*?),"', html)
                skuurl = re.findall('"url":"(.*?)","', html)
                price = re.findall('"price":"(.*?)","', html)
                item_id_l = len(item_id)
                shop_id_list = [shop_id] * item_id_l
                shop_title_list = [shop_title] * item_id_l
                data = {'shop_id': shop_id_list, 'shop_title': shop_title_list, 'item_id': item_id, 'title': title,
                        'sold': sold, 'totalSoldQuantity': totalSoldQuantity, 'skuurl': skuurl, 'price': price}
                df = pd.DataFrame(data=data)
                df.to_csv(savepath, mode='a', index=False, header=False, encoding="GB18030")
        df1 = pd.read_csv(savepath, encoding='GB18030')
        s.close()
        return df1
    def getiddata(self, id):  # fetch the SKU data of a single item id
        time.sleep(random.random() * 1 + 1)
        s = requests.session()
        t = int(time.time() * 1000)
        # NOTE: the sign value below is copied from a captured request; the mtop h5 API
        # normally derives it from the session token, so it may need refreshing.
        url = 'https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/' \
              '?jsv=2.4.8&appKey=12574478&t={}' \
              '&sign=7c9e1dedaa295fdb175d22c99746493b&api=mtop.taobao.detail.getdetail' \
              '&v=6.0&dataType=jsonp&ttid=2017%40taobao_h5_6.6.0&AntiCreep=true&type=jsonp&callback=mtopjsonp2&' \
              'data=%7B%22itemNumId%22%3A%22{}%22%7D'.format(t, id)
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8',
                   'Referer': 'https://detail.m.tmall.com/item.htm?spm=a220m.6910245.0.0.55b17434eiwv4f&id={}'.format(id)
                   }
        print(url)
        s.headers.update(headers)
        html = s.get(url=url, verify=False).text
        html = html.replace('\\', '')
        time.sleep(0.5)
        info = re.search('skuBase":(.*?),"skuCore', html)
        if info is not None:
            skuBase = info.group(1)  # SKU definitions (e.g. colour/size properties)
            skuId = re.findall('"skuId":"(.*?)","', skuBase)
            propPath = re.findall('"propPath":"(.*?)"}', skuBase)
            skuBase = json.loads(skuBase)
            prop_list = []
            for i in propPath:  # translate each "pid:vid;pid:vid" path into readable property names
                prop = ''
                prop1 = i.split(';')
                for j in prop1:
                    prop2 = j.split(':')
                    for pid in skuBase['props']:
                        if pid['pid'] == prop2[0]:
                            for vid in pid['values']:
                                if vid['vid'] == prop2[1]:
                                    prop = prop + vid['name']
                prop_list.append(str(prop))
            sku2info = re.search('"sku2info":(.*?)},"s', html).group(1)  # per-SKU prices
            sku2info = json.loads(sku2info)
            price = []
            for i in skuId:
                p = sku2info[str(i)]['price']['priceText']
                price.append(p)
        else:  # item has no SKU variants
            skuId = [' ']
            prop_list = [' ']
            price = [' ']
        data = {'skuid': skuId, 'prop': prop_list, 'price': price}
        df = pd.DataFrame(data=data)
        return df
    def iddata(self, id_df):  # join the shop listing with per-item SKU data and save to CSV
        df_l = id_df.iloc[:, 0].size
        df = pd.DataFrame(columns=["shop_id", "shop_title", "item_id", "title", "sold",
                                   "totalSoldQuantity", "skuurl", "price", "skuid", "prop", "skuprice"])
        shopid = id_df['shop_id'][0]  # every row carries the same shop id
        y = 0
        for i in range(0, df_l):
            time.sleep(random.random() * 2.56)
            pid = id_df['item_id'][i]
            data = self.getiddata(pid)
            data_l = data.iloc[:, 0].size
            for j in range(0, data_l):  # one output row per SKU
                df.at[y, "shop_id"] = id_df['shop_id'][i]
                df.at[y, "shop_title"] = id_df['shop_title'][i]
                df.at[y, "item_id"] = id_df['item_id'][i]
                df.at[y, "title"] = id_df['title'][i]
                df.at[y, "sold"] = id_df['sold'][i]
                df.at[y, "totalSoldQuantity"] = id_df['totalSoldQuantity'][i]
                df.at[y, "skuurl"] = id_df['skuurl'][i]
                df.at[y, "price"] = id_df['price'][i]
                df.at[y, "skuid"] = data['skuid'][j]
                df.at[y, "prop"] = data['prop'][j]
                df.at[y, "skuprice"] = data['price'][j]
                y += 1
        df.to_csv(self.path + r'\tm{}.csv'.format(shopid), index=False, encoding="GB18030")
        return df
    def urlitem(self, url, *args):  # scrape items from a shop category page; only works for some shop layouts
        s = requests.session()
        headers = {'Accept': '*/*',
                   'Accept-Language': 'zh-CN',
                   'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
                   }
        s.headers.update(headers)
        itemhtml = s.get(url=url, verify=False).text
        shopid = re.search('class="J_TModule"(.*?)"搜索列表"', itemhtml).group(1)
        shopid = re.search('data-widgetid="(.*?)" id', shopid).group(1)
        id = re.search('category-(.*?).htm', url).group(1)
        nm = re.search('https://(.*?).tmall.com/', url).group(1)
        t = int(time.time() * 1000)
        pageurl = 'https://{}.tmall.com/i/asynSearch.htm?_ksTS={}_888&callback=jsonp289&mid=w-{}-0&wid={}&path=/category-{}.htm'.format(
            nm, t, shopid, shopid, id)
        print(pageurl)
        time.sleep(random.random() * 1 + 1)
        html = s.get(url=pageurl, verify=False).text
        html = html.replace('\\', '')
        html = re.sub('\n', '', html)
        page = re.search('ui-page-s-len">1/(.*?)</b>', html).group(1)  # total number of result pages
        print(page)
        nm_list = []
        idurl_list = []
        price_list = []
        sale_list = []
        for p in range(1, int(page) + 1):
            time.sleep(random.random())
            # The original request rebuilt the same URL for every page; a page parameter is
            # appended here (the parameter name pageNo is an assumption).
            html = s.get(url=pageurl + '&pageNo={}'.format(p), verify=False).text
            html = html.replace('\\', '')
            html = re.sub('\n', '', html)
            print(html)
            names = re.findall('<img alt="(.*?)" data', html)[:-8]  # the last 8 matches are recommended items, not listings
            print(names)
            ids = re.findall('<a href="//detail.(.*?)&rn', html)
            idurl = []
            for i in ids:
                idurl.append('https://detail.' + i)
            price = re.findall('class="c-price">(.*?) ', html)[:-8]
            sale = re.findall('sale-num">(.*?)</span>', html)[:-8]
            nm_list.extend(names)
            idurl_list.extend(idurl)
            price_list.extend(price)
            sale_list.extend(sale)
        print(len(nm_list))
        print(len(idurl_list))
        print(len(price_list))
        print(len(sale_list))
        data = {'nm': nm_list, 'idurl': idurl_list, 'price': price_list, 'sale': sale_list}
        df = pd.DataFrame(data)
        l = len(args)
        for j in range(0, l):  # append any extra labels passed in as constant columns
            df.loc[:, "col" + str(j)] = args[j]
        print(df)
        s.close()
        return df
# Example:
# tm = tm(r'E:\tm')
# url = 'https://shoushanggeshi.tmall.com/category-1310604910.htm'
# # url = 'https://shoushanggeshi.tmall.com/category-674950482.htm'
# tm.urlitem(url, '电脑', 'cpu')

if __name__ == '__main__':
    path = r'E:\tm'
    tm = tm(path)
    df = tm.goodsid('https://intel.tmall.com')
    tm.iddata(df)
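# For a single item, the SKU lookup can also be called on its own. A minimal sketch,
# assuming a valid item id is supplied (the value below is a hypothetical placeholder):
#
#     sku_df = tm.getiddata('123456789012')
#     print(sku_df)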
You can click the link below to jump straight to the original file.
GitHub: https://github.com/linyhuan/Crawler/blob/master/tmmall.py