# 1. Crawl a Baidu search results page for a keyword.
import requests

keyword = "Python"
try:
    # 'wd' is Baidu's query-string parameter for the search keyword.
    kv = {'wd': keyword}
    r = requests.get("http://www.baidu.com/s", params=kv)
    print(r.request.url)  # show the final URL requests built
    r.raise_for_status()  # raise on HTTP 4xx/5xx before using the body
    print(r.text)
except requests.RequestException:  # narrowed from bare except: network/HTTP errors only
    print("爬取失败")
# 2. Look up the location of an IP address via ip138.com.
import requests

url = "https://ip138.com/iplookup.asp?ip="
# Browser-like User-Agent: the site rejects the default python-requests UA.
kv = {'user-agent': 'Mozilla/5.0'}
ip = '202.204.80.112'  # the address to look up
try:
    # Fixed: the original put the literal string 'ip' in the URL and passed
    # the real IP as a stray positional argument (requests' params slot).
    r = requests.get(url + ip + '&action=2', headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding  # site omits charset; guess from content
    print(r.text[-500:])
except requests.RequestException:
    print("失败")
# 3. Stock data crawler — imports for the functions below.
import re
import time  # used by getStockInfo for progress timing; was missing
import traceback

import requests
from bs4 import BeautifulSoup
def getHTMLText(url):
    """Fetch *url* and return the decoded page text, or "" on any failure."""
    try:
        r = requests.get(url)
        r.raise_for_status()  # fixed typo: was raise_for_ststus(), which made
                              # every call raise AttributeError and return ""
        r.encoding = r.apparent_encoding  # guess the real charset for clean text
        return r.text
    except requests.RequestException:  # narrowed from bare except
        return ""
def getStockList(lst, stockURL):
    """Append Shanghai/Shenzhen stock codes (sh/sz + 6 digits) found in the
    href of every <a> tag on *stockURL*'s page to *lst*. Returns "".
    """
    html = getHTMLText(stockURL)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        try:
            href = anchor.attrs['href']
            # Fixed typo: re.finall -> re.findall.
            lst.append(re.findall(r"[s][hz]\d{6}", href)[0])
        except (KeyError, IndexError):
            # anchor without an href, or href without a stock code: skip it
            continue
    return ""
def getStockInfo(lst, stockURL, fpath):
    """For each stock code in *lst*, fetch its page at *stockURL* + code,
    scrape the stock name plus the dt/dd key-value pairs, and append the
    resulting dict (one repr per line) to *fpath*. Prints a progress /
    elapsed-time line after each stock. Returns "".
    """
    import time  # defensive local import; 'time' is missing at module level
    count = 0
    time1 = time.time()  # fixed: was assigned as 'timel' but read as 'time1'
    for stock in lst:
        url = stockURL + stock
        html = getHTMLText(url)
        try:
            if html == "":
                continue
            infoDict = {}
            soup = BeautifulSoup(html, 'html.parser')
            stockInfo = soup.find('header', attrs={'class': 'stock_title'})
            # NOTE(review): attrs={'h1': ''} matches tags carrying an 'h1'
            # *attribute* — presumably the <h1> element was intended; kept
            # as-is pending verification against the live page markup.
            name = stockInfo.find_all(attrs={'h1': ''})[0]
            infoDict.update({'股票名称': name.text.split('</h1>')[0]})
            stockInfo = soup.find('div', attrs={'class': 'stock_top clearfix'})
            keyList = stockInfo.find_all('dt')
            valueList = stockInfo.find_all('dd')
            for i in range(len(keyList)):
                infoDict[keyList[i].text] = valueList[i].text
            with open(fpath, 'a', encoding='utf-8') as f:
                f.write(str(infoDict) + '\n')
            count = count + 1
            time2 = time.time() - time1
            print('\r当前速度:{:.2f}%\t用时:{:.2f}秒'.format((count * 100 / len(lst)), time2), end='')
        except Exception:  # page layout surprises: log, count, move on
            count = count + 1
            time2 = time.time() - time1
            print('\r当前速度:{:.2f}%\t用时:{:.2f}秒'.format((count * 100 / len(lst)), time2), end='')
            traceback.print_exc()
            continue
    return ""
def main():
    """Collect all stock codes, then scrape each stock's details to a file."""
    listing_page = 'https://hq.gucheng.com/gpdmylb.html'
    detail_base = 'https://hq.gucheng.com/'
    out_path = 'C:\\StockInfo.txt'
    codes = []
    getStockList(codes, listing_page)
    getStockInfo(codes, detail_base, out_path)

main()
# 4. Crawl a JD.com product page.
import requests

url = "http://item.jd.com/2967929.html"
try:
    r = requests.get(url)
    r.raise_for_status()  # raise on HTTP 4xx/5xx
    r.encoding = r.apparent_encoding  # decode with the detected charset
    print(r.text[:1000])
except requests.RequestException:  # narrowed from bare except
    print("爬取失败")
# 5. Download an image into the local Pictures folder (skip if already saved).
import os

import requests

url = "http://pic.87g.com/upload/2020/0102/20200102093322295.jpg"
root = "C://Users//15133//Pictures//Camera Roll"
# Fixed: the original concatenated root + filename with no separator, so the
# file was written next to the target folder, not inside it.
path = os.path.join(root, url.split('/')[-1])
try:
    if not os.path.exists(root):
        os.makedirs(root)  # creates intermediate dirs; mkdir fails if parent missing
    if not os.path.exists(path):
        r = requests.get(url)
        r.raise_for_status()  # don't save an HTML error page as the image
        with open(path, 'wb') as f:  # 'with' closes the file; no manual close
            f.write(r.content)
        print("成功")
    else:
        print("存在")
except (requests.RequestException, OSError):  # network or filesystem failure
    print("失败")
# 6. Chart-library practice: a pie chart with one exploded wedge.
import matplotlib.pyplot as plt

labels = 'Frogs', 'Hogs', 'Dogs', 'Logs'
sizes = [15, 30, 45, 10]
explode = (0, 0.1, 0, 0)  # offset only the second ('Hogs') wedge
plt.pie(sizes, explode=explode, labels=labels, autopct='%1.1f%%',
        shadow=False, startangle=90)
plt.show()
# 7. Convert a photo to an ink-wash (pencil-sketch) style image.
from PIL import Image
import numpy as np

# Raw string is required: '\U' in a normal literal is an illegal unicode
# escape and makes the original line a SyntaxError.
src = r'C:\Users\15133\Pictures\Saved Pictures\QQ图片20191029201945.jpg'
a = np.asarray(Image.open(src).convert('L')).astype('float')

depth = 10.                    # simulated relief depth, range 0-100
grad = np.gradient(a)          # gray-level gradient of the image
grad_x, grad_y = grad          # horizontal / vertical gradient components
grad_x = grad_x * depth / 100.
grad_y = grad_y * depth / 100.
A = np.sqrt(grad_x**2 + grad_y**2 + 1.)
uni_x = grad_x / A             # unit surface-normal components
uni_y = grad_y / A
uni_z = 1. / A

vec_el = np.pi / 2.2           # light-source elevation angle, radians
vec_az = np.pi / 4.            # light-source azimuth angle, radians
dx = np.cos(vec_el) * np.cos(vec_az)  # light effect along the x axis
dy = np.cos(vec_el) * np.sin(vec_az)  # light effect along the y axis
dz = np.sin(vec_el)                   # light effect along the z axis

b = 255 * (dx * uni_x + dy * uni_y + dz * uni_z)  # normalized lighting
b = b.clip(0, 255)

im = Image.fromarray(b.astype('uint8'))  # rebuild the image from the array
# NOTE(review): this overwrites the source photo; save under a new name if
# the original must be kept.
im.save(src)
# 8. Crawl university rankings — imports for the functions below.
import requests
from bs4 import BeautifulSoup
import bs4
def getHTMLText(url):
    """Return the decoded text of *url*, or "" if the request fails."""
    try:
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        resp.encoding = resp.apparent_encoding
        return resp.text
    except:
        return ""
def fillUnivList(ulist, html):
    """Parse ranking rows from *html* and append [rank, name, score] lists."""
    soup = BeautifulSoup(html, "html.parser")
    for row in soup.find('tbody').children:
        # skip NavigableString whitespace between <tr> tags
        if not isinstance(row, bs4.element.Tag):
            continue
        cells = row('td')
        ulist.append([cells[0].string, cells[1].string, cells[3].string])
def printUnivList(ulist, num):
    """Print the first *num* entries of *ulist* as a rank/school/score table."""
    row_fmt = "{:^10}\t{:^6}\t{:^10}"
    print(row_fmt.format("排名", "学校", "总分"))
    for idx in range(num):
        entry = ulist[idx]
        print(row_fmt.format(entry[0], entry[1], entry[2]))
def main():
    """Fetch the 2019 ranking page and print the top 20 universities."""
    ranking_url = 'http://www.zuihaodaxue.cn/zuihaodaxuepaiming2019.html'
    records = []
    page = getHTMLText(ranking_url)
    fillUnivList(records, page)
    printUnivList(records, 20)  # show the top 20 universities

main()
# 9. Crawl an Amazon product page.
import requests

url = "https://www.amazon.cn/gp/product/B01M8L5Z3Y"
try:
    # Browser-like User-Agent: Amazon blocks the default python-requests UA.
    kv = {'user-agent': 'Mozilla/5.0'}
    r = requests.get(url, headers=kv)
    r.raise_for_status()
    r.encoding = r.apparent_encoding
    print(r.text[1000:2000])
except requests.RequestException:  # narrowed from bare except
    print("爬取失败")