# coding=utf-8
import json
import os
import random
import re
import time

import pandas as pd
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning

# Suppress the InsecureRequestWarning emitted for verify=False requests.
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
  9. class tm(object):####手机端
  10. def __init__(self,path): ###保存数据路径​
  11. self.path=path
  12. def goodsid(self,url): ###通过店铺URL获取店铺所有ID​
  13. shopname = re.search('https://(.*?).tmall', url).group(1)
  14. searchurl = 'https://{}.m.tmall.com/shop/shop_auction_search.do?spm=a1z60.7754813.0.0.301755f0pZ1GjU&sort=defaul'.format(
  15. shopname)
  16. s=requests.session()
  17. headers = {'Accept': '*/*',
  18. 'Accept-Language': 'zh-CN',
  19. 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8',
  20. 'Referer':'https://{}.m.tmall.com/shop/shop_auction_search.htm?spm=a1z60.7754813.0.0.301755f0pZ1GjU&sort=default'.format(shopname)
  21. }
  22. s.headers.update(headers)
  23. page1=s.get(url=searchurl,verify=False).text
  24. print(page1)
  25. js=json.loads(page1)
  26. total_page=int(js['total_page'])
  27. shop_id=js['shop_id']
  28. shop_title = js['shop_title']
  29. shop_id_list = []
  30. shop_title_list = []
  31. item_id=re.findall('"item_id":(.*?),"',page1)
  32. title=re.findall('"title":"(.*?)","',page1)
  33. sold=re.findall('"sold":"(.*?)","',page1)
  34. totalSoldQuantity=re.findall('"totalSoldQuantity":(.*?),"',page1)
  35. skuurl=re.findall('"url":"(.*?)","',page1)
  36. price=re.findall('"price":"(.*?)","',page1)
  37. item_id_l=len(item_id)
  38. shop_id_list.append(shop_id)
  39. shop_id_list.extend(shop_id_list*(int(item_id_l)-1))
  40. shop_title_list.append(shop_title)
  41. shop_title_list.extend(shop_title_list*(int(item_id_l)-1))
  42. # print(js)​
  43. # print(len(shop_id_list))​
  44. # print(len(shop_title_list))​
  45. # print(len(item_id))​
  46. # print(len(title))​
  47. # print(len(sold))​
  48. # print(len(totalSoldQuantity))​
  49. # print(len(skuurl))​
  50. # print(len(price))​
  51. data = {'shop_id': shop_id_list,'shop_title': shop_title_list,'item_id': item_id, 'title': title, 'sold':sold, 'totalSoldQuantity':totalSoldQuantity, 'skuurl':skuurl, 'price':price}
  52. df = pd.DataFrame(data=data)
  53. #print(df)​
  54. savepath=self.path + r'\tmgoodsid{}.csv'.format(shopname)
  55. print(savepath)
  56. df.to_csv(savepath, mode='a', index=False, encoding="GB18030")
  57. time.sleep(random.random() * 2)
  58. if total_page!=1:
  59. for i in range(2,total_page+1):
  60. time.sleep(random.random() * 2)
  61. htmlurl=searchurl+'&p={}'.format(i)
  62. html=s.get(url=htmlurl,verify=False).text
  63. shop_id_list = []
  64. shop_title_list = []
  65. print(html)
  66. item_id = re.findall('"item_id":(.*?),"',html)
  67. title = re.findall('"title":"(.*?)","', html)
  68. sold = re.findall('"sold":"(.*?)","', html)
  69. totalSoldQuantity = re.findall('"totalSoldQuantity":(.*?),"', html)
  70. skuurl = re.findall('"url":"(.*?)","', html)
  71. price = re.findall('"price":"(.*?)","',html)
  72. item_id_l = len(item_id)
  73. shop_id_list.append(shop_id)
  74. shop_id_list.extend(shop_id_list * (int(item_id_l) - 1))
  75. shop_title_list.append(shop_title)
  76. shop_title_list.extend(shop_title_list * (int(item_id_l) - 1))
  77. data = {'shop_id': shop_id_list, 'shop_title': shop_title_list, 'item_id': item_id, 'title': title,
  78. 'sold': sold, 'totalSoldQuantity': totalSoldQuantity, 'skuurl': skuurl, 'price': price}
  79. df = pd.DataFrame(data=data)
  80. df.to_csv(self.path + r'\tmgoodsid{}.csv'.format(shopname),mode='a', index=False,header=0 ,encoding="GB18030")
  81. df1 = pd.read_csv(self.path + r'\tmgoodsid{}.csv'.format(shopname), encoding='GB18030')
  82. s.close()
  83. return df1
  84. def getiddata(self,id): ###获取ID数据​
  85. time.sleep(random.random() * 1 + 1)
  86. s = requests.session()
  87. t=int(time.time()*1000)
  88. url='https://h5api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/' \
  89. '?jsv=2.4.8&appKey=12574478&t={}' \
  90. '&sign=7c9e1dedaa295fdb175d22c99746493b&api=mtop.taobao.detail.getdetail' \
  91. '&v=6.0&dataType=jsonp&ttid=2017%40taobao_h5_6.6.0&AntiCreep=true&type=jsonp&callback=mtopjsonp2&' \
  92. 'data=%7B%22itemNumId%22%3A%22{}%22%7D'.format(t,id)
  93. headers = {'Accept': '*/*',
  94. 'Accept-Language': 'zh-CN',
  95. 'User-Agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 10_3_3 like Mac OS X) AppleWebKit/603.3.8 (KHTML, like Gecko) FxiOS/10.6b8836 Mobile/14G60 Safari/603.3.8',
  96. 'Referer': 'https://detail.m.tmall.com/item.htm?spm=a220m.6910245.0.0.55b17434eiwv4f&id={}'.format(id)
  97. }
  98. print(url)
  99. s.headers.update(headers)
  100. html = s.get(url=url, verify=False).text
  101. html=html.replace('\\','')
  102. time.sleep(0.5)
  103. info=re.search('skuBase":(.*?),"skuCore',html)
  104. if info!=None:
  105. skuBase=re.search('skuBase":(.*?),"skuCore',html).group(1) ##SKU+颜色​
  106. skuId = re.findall('"skuId":"(.*?)","', skuBase)
  107. propPath=re.findall('"propPath":"(.*?)"}',skuBase)
  108. skuBase=json.loads(skuBase)
  109. prop_list=[]
  110. for i in propPath:
  111. prop = ''
  112. prop1=i.split(';')
  113. for j in prop1:
  114. prop2=j.split(':')
  115. for pid in skuBase['props']:
  116. if pid['pid']==prop2[0]:
  117. #prop=prop+pid['name']​
  118. for vid in pid['values']:
  119. if vid['vid']==prop2[1]:
  120. prop=prop+vid['name']
  121. prop_list.append(str(prop))
  122. sku2info = re.search('"sku2info":(.*?)},"s', html).group(1) ##价格​
  123. sku2info = json.loads(sku2info)
  124. price = []
  125. for i in skuId:
  126. p = sku2info[str(i)]['price']['priceText']
  127. price.append(p)
  128. else:
  129. skuId=[' ']
  130. prop_list=[' ']
  131. price=[' ']
  132. data = {'skuid': skuId, 'prop': prop_list,'price':price}
  133. df = pd.DataFrame(data=data)
  134. return df
  135. def iddata(self,id_df):
  136. df_l=id_df.iloc[:,0].size
  137. df=pd.DataFrame()
  138. df.loc[0, "shop_id"] = ''
  139. df.loc[:, "shop_title"] = ''
  140. df.loc[:, "item_id"] = ''
  141. df.loc[:, "title"] = ''
  142. df.loc[:, "sold"] = ''
  143. df.loc[:, "totalSoldQuantity"] = ''
  144. df.loc[:, "skuurl"] = ''
  145. df.loc[:, "price"] = ''
  146. df.loc[:, "skuid"] = ''
  147. df.loc[:, "prop"] = ''
  148. df.loc[:, "skuprice"] = ''
  149. shopid=id_df['shop_id'][1]
  150. y=0
  151. for i in range(0,df_l):
  152. time.sleep(random.random() * 2.56)
  153. pid=id_df['item_id'][i]
  154. data=self.getiddata(pid)
  155. data_l=data.iloc[:,0].size
  156. for j in range(0,data_l):
  157. df.at[y, "shop_id"] = id_df['shop_id'][i]
  158. df.at[y, "shop_title"] = id_df['shop_title'][i]
  159. df.at[y, "item_id"] = id_df['item_id'][i]
  160. df.at[y, "title"] = id_df['title'][i]
  161. df.at[y, "sold"] = id_df['sold'][i]
  162. df.at[y, "totalSoldQuantity"] = id_df['totalSoldQuantity'][i]
  163. df.at[y, "skuurl"] = id_df['skuurl'][i]
  164. df.at[y, "price"] = id_df['price'][i]
  165. df.at[y, "skuid"] = data['skuid'][j]
  166. df.at[y, "prop"] = data['prop'][j]
  167. df.at[y, "skuprice"] = data['price'][j]
  168. y +=1
  169. df.to_csv(self.path + r'\tm{}.csv'.format(shopid), index=False, encoding="GB18030")
  170. return df
  171. def urlitem(self,url,*args): ##通过目录获取 只适合部分​
  172. s = requests.session()
  173. headers = {'Accept': '*/*',
  174. 'Accept-Language': 'zh-CN',
  175. 'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.15 Safari/537.36'
  176. }
  177. s.headers.update(headers)
  178. itemhtml = s.get(url=url, verify=False).text
  179. #print(itemhtml)​
  180. shopid = re.search('class="J_TModule"(.*?)"搜索列表"', itemhtml).group(1)
  181. shopid=re.search('data-widgetid="(.*?)" id',shopid).group(1)
  182. #print(shopid)​
  183. id=re.search('category-(.*?).htm',url).group(1)
  184. nm=re.search('https://(.*?).tmall.com/',url).group(1)
  185. t=int(time.time()*1000)
  186. pageurl='https://{}.tmall.com/i/asynSearch.htm?_ksTS={}_888&callback=jsonp289&mid=w-{}-0&wid={}&path=/category-{}.htm'.format(nm,t,shopid,shopid,id)
  187. print(pageurl)
  188. time.sleep(random.random() * 1 + 1)
  189. html = s.get(url=pageurl, verify=False).text
  190. html = html.replace('\\', '')
  191. html=re.sub('\n','',html)
  192. page=re.search('ui-page-s-len">1/(.*?)</b>',html).group(1)
  193. print(page)
  194. nm_list=[]
  195. idurl_list=[]
  196. price_list=[]
  197. sale_list=[]
  198. for p in range(1,int(page)+1):
  199. time.sleep(random.random())
  200. pageurl = 'https://{}.tmall.com/i/asynSearch.htm?_ksTS={}_888&callback=jsonp289&mid=w-{}-0&wid={}&path=/category-{}.htm'.format(
  201. nm, t, shopid, shopid, id)
  202. html = s.get(url=pageurl, verify=False).text
  203. html = html.replace('\\', '')
  204. html = re.sub('\n', '', html)
  205. print(html)
  206. nm=re.findall('<img alt="(.*?)" data',html)[:-8]
  207. print(nm)
  208. id=re.findall('<a href="//detail.(.*?)&rn',html)
  209. idurl=[]
  210. for i in id:
  211. idurl.append('https://detail.'+i)
  212. price=re.findall('class="c-price">(.*?) ',html)[:-8]
  213. sale=re.findall('sale-num">(.*?)</span>',html)[:-8]
  214. nm_list.extend(nm)
  215. idurl_list.extend(idurl)
  216. price_list.extend(price)
  217. sale_list.extend(sale)
  218. print(len(nm_list))
  219. print(len(idurl_list))
  220. print(len(price_list))
  221. print(len(sale_list))
  222. data={'nm':nm_list,'idurl':idurl_list,'price':price_list,'sale':sale_list}
  223. df=pd.DataFrame(data)
  224. l=len(args)
  225. for j in range(0,l):
  226. df.loc[:, "col"+str(j)] = args[j]
  227. print(df)
  228. s.close()
  229. return df
# Example:
# crawler = tm(r'E:\tm')
# url = 'https://shoushanggeshi.tmall.com/category-1310604910.htm'
# # url = 'https://shoushanggeshi.tmall.com/category-674950482.htm'
# crawler.urlitem(url, '电脑', 'cpu')
  235. if __name__=='__main__':
  236. path=r'E:\tm'
  237. tm=tm(path)
  238. df=tm.goodsid('https://intel.tmall.com')
  239. tm.iddata(df)

# The original article can be opened directly via the source link.

# GitHub: https://github.com/linyhuan/Crawler/blob/master/tmmall.py