老虎社区
'https://www.laohu8.com/stock/'
百度股票页面已经无法访问,因此改为从老虎社区抓取个股信息。
import requests
import re
from bs4 import BeautifulSoup
import collections
import traceback
def getHtmlText(url):
    """Fetch *url* and return the response body decoded as UTF-8.

    Args:
        url: Address of the page to download.

    Returns:
        The page text, or an empty string when the request fails
        (connection error, timeout, invalid URL, or non-2xx status).
    """
    kv = {'user-agent': 'Mozilla/5.0'}
    try:
        # Without a timeout a single stalled server would hang the crawl.
        r = requests.get(url, headers=kv, timeout=30)
        r.raise_for_status()
    except requests.RequestException:
        # Best effort: callers treat "" as "page unavailable".
        return ""
    # Set the encoding explicitly. The original author notes that
    # r.apparent_encoding reports GB2312 (which fails to yield the data)
    # and that computing it is slow.
    r.encoding = 'utf-8'
    return r.text
def getstocklist(list, stock_url):
    """Collect stock codes from the links on the page at *stock_url*.

    Every ``<a>`` tag whose ``href`` contains a run of six digits
    contributes the first such run to *list*. The final length of *list*
    is printed once the page has been scanned.

    Args:
        list: Mutable list that receives the codes (modified in place).
        stock_url: URL of the page listing the stocks.
    """
    html = getHtmlText(stock_url)
    soup = BeautifulSoup(html, 'html.parser')
    for anchor in soup.find_all('a'):
        href = anchor.attrs.get('href')
        # Anchors without a usable href carry no stock code.
        if not isinstance(href, str):
            continue
        match = re.search(r"\d{6}", href)  # first six-digit run = stock code
        if match:
            list.append(match.group(0))
    print(len(list))
def _parse_stock_page(html):
    """Extract the stock name and detail fields from one stock page.

    Returns an OrderedDict (name first, then each dt/dd pair in page
    order), or None when the expected elements are missing.
    """
    soup = BeautifulSoup(html, 'html.parser')
    name_tag = soup.find('h1', attrs={'class': 'name'})
    detail = soup.find('div', attrs={'class': 'detail-data'})
    if name_tag is None or detail is None:
        return None
    name_parts = name_tag.text.split()
    if not name_parts:
        return None

    # Ordered so the fields are written in the order they were inserted.
    infodict = collections.OrderedDict()
    infodict['股票名称'] = name_parts[0]
    # zip pairs each <dt> label with its <dd> value and stops at the
    # shorter list, so a mismatched page cannot raise IndexError.
    for key_tag, value_tag in zip(detail.find_all('dt'), detail.find_all('dd')):
        infodict[key_tag.text] = value_tag.text
    return infodict


def getstockinfo(list, stock_url, path):
    """Download each stock's page and append its details to *path*.

    For every code in *list* the page ``stock_url + code`` is fetched and
    parsed; each successfully parsed record is appended to *path* as one
    line. Pages that cannot be fetched or parsed are skipped. A progress
    percentage is printed after every stock, including skipped ones.

    Args:
        list: Sized sequence of stock-code strings.
        stock_url: Base URL to which each code is appended.
        path: Output file, opened in append mode with UTF-8 encoding.

    Raises:
        OSError: If the output file cannot be opened or written.
    """
    total = len(list)
    for cnt, stock in enumerate(list, start=1):
        html = getHtmlText(stock_url + stock)
        infodict = _parse_stock_page(html) if html else None
        if infodict is not None:
            # 'a': new content is appended after what is already there.
            with open(path, 'a', encoding='utf-8') as f:
                f.write(str(infodict) + '\n')
        # '\r' moves the cursor back to the start of the line so the
        # percentage is redrawn in place.
        print('\r当前进度:{:.2f}%'.format(cnt * 100 / total), end='')
def main():
    """Gather the stock codes, then crawl each stock's details to a file."""
    output_file = 'laohu_stock.txt'
    stock_info_url = 'https://www.laohu8.com/stock/'
    stock_list_url = 'http://quote.eastmoney.com/stock_list.html'

    # Filled in place by getstocklist, then consumed by getstockinfo.
    codes = []
    getstocklist(codes, stock_list_url)
    getstockinfo(codes, stock_info_url, output_file)
# Start the crawl. There is no __main__ guard, so this also runs on import.
main()
getstockinfo():
getstockinfo
laohu_stock.txt 部分截图
注意:
可以看到,爬取到的信息里没有最高价和最低价。这是因为程序直接抓取的是网页源代码,而源代码中并不包含最高、最低这两项;但在浏览器中正常打开网页时却能看到它们,具体原因还在了解。