1 不管用requests_html,还是requests获取网页的源码时,发现通过ajax动态加载的内容都获取不到,得通过分析动态加载的接口去重新请求数据,有时很不方便。

2 下面我们利用 selenium+phantomjs 来实现一次性获取网页上所有的内容;

1. 下载Phantomjs,下载地址:https://phantomjs.org/download.html 选择下载windows的还是linux的 
2. 下完之后直接解压就OK了,然后selenium的安装用pip就行了

代码:

import requests
from lxml import etree
from lxml import html
from html.parser import HTMLParser #导入html解析库
from selenium import webdriver
import time

def getHTMLText(url):
    """Fetch the fully rendered HTML of *url* via a headless PhantomJS browser.

    Unlike a plain ``requests.get``, PhantomJS executes the page's JavaScript,
    so content loaded by AJAX is present in the returned source.

    NOTE(review): PhantomJS is discontinued and its selenium support is
    deprecated; headless Chrome/Firefox is the modern replacement.

    Args:
        url: Address of the page to load.

    Returns:
        The page source (str) after JavaScript has had time to run.
    """
    # Absolute path to the phantomjs binary.
    driver = webdriver.PhantomJS(
        executable_path='E:\\pythontest\\phantomjs-2.1.1-windows\\bin\\phantomjs'
    )
    try:
        driver.get(url)   # load the page; JS runs inside PhantomJS
        time.sleep(2)     # crude wait so AJAX content can finish rendering
        return driver.page_source
    finally:
        # Fix: the original never quit the driver, leaking a PhantomJS
        # process on every call. (The original also slept 2s *before*
        # driver.get, which served no purpose and was removed.)
        driver.quit()

def getHtmlByXpath(html_str, xpath):
    """Parse *html_str* with lxml and return the list of nodes matching *xpath*."""
    document = etree.HTML(html_str)
    return document.xpath(xpath)

def w_file(filepath, contents):
    """Write *contents* to *filepath* encoded as GB18030, overwriting any existing file."""
    with open(filepath, 'w', encoding='gb18030') as out_file:
        out_file.write(contents)



def main():
    """Download a JS-rendered news page, save it, and extract the article div.

    Side effects: writes the raw page to E:\\pythontest\\wfile.txt and the
    unescaped article HTML to E:\\pythontest\\wfile3.txt.
    """
    url = 'https://m.fygdrs.com/h5/news.html?t=2&id=67062'  # page to scrape
    strhtml = getHTMLText(url)  # fully rendered HTML (AJAX content included)
    w_file('E:\\pythontest\\wfile.txt', strhtml)

    # Extract the article body node(s) by id.
    strDiv = getHtmlByXpath(strhtml, "//div[@id='Article-content']")
    if strDiv:
        str1 = html.tostring(strDiv[0])  # lxml serializes to bytes
        print(str1)
        # Fix: HTMLParser().unescape() was deprecated in Python 3.4 and
        # removed in 3.9. The stdlib replacement is html.unescape(), but the
        # module-level name `html` is shadowed by `from lxml import html`,
        # so import the function locally.
        from html import unescape
        str2 = unescape(str1.decode())
        print(str2)
        w_file('E:\\pythontest\\wfile3.txt', str2)

    print('ok')


# Run the scraper only when executed as a script, not when imported.
if __name__ == '__main__':
    main()

 

--- end ---