Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库。它能够通过你喜欢的转换器实现惯用的文档导航、查找、修改文档的方式。Beautiful Soup会帮你节省数小时甚至数天的工作时间。
Beautiful Soup也是解析网页内容最好的工具之一,解析内容大多是通过选择器来实现的。这两天刚好在做一些爬虫实验,遇到了一些瓶颈,干脆把Beautiful Soup从头再看一遍,后续会讲解一些爬虫实例。
代码示例
# Setup: build the sample document and parse it once. Every query in the
# rest of this script runs against the module-level `soup` object.
from bs4 import BeautifulSoup

# Sample markup (adapted from the Beautiful Soup documentation) with a few
# extra <div data-*> elements and a multi-class <p> for the attribute and
# class examples.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie1</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<div data-foo="abc">foo!</div>
<div data-foo="bcd">foo!</div>
<div data-data="bcd">data-data-value!</div>
<p class="body strikeout"></p>
<p class="story">...</p>
"""

# The 'lxml' parser must be installed separately from bs4.
soup = BeautifulSoup(html_doc, features='lxml')
print('soup=', soup)
print('-----------------------------find_all-----------------------------')
# find_all() collects every matching tag into a bs4.element.ResultSet.

# Match by tag name.
print('soup.find_all("title")=', soup.find_all("title"))
# -> [<title>The Dormouse's story</title>]

# Tag name plus a string as the second positional argument; Beautiful Soup
# matches that string against the tag's CSS class.
print('soup.find_all("p", "title")=', soup.find_all("p", "title"))
# -> the single <p class="title"> paragraph

# Every <a> tag in the document.
print('soup.find_all("a")=', soup.find_all("a"))
# -> the three "sister" links: link1 (Elsie1), link2 (Lacie), link3 (Tillie)

# Match by id attribute.
print('soup.find_all(id="link2")=', soup.find_all(id="link2"))
# -> the link2 (Lacie) anchor only

# Match by an arbitrary attribute/value pair. The attrs dict is needed here
# because "data-foo" is not a valid Python keyword-argument name.
print('soup.find_all(attrs={"data-foo": "abc"})=', soup.find_all(attrs={"data-foo": "abc"}))

# Tag name plus CSS class (class_ because `class` is a reserved word).
print('soup.find_all("a", class_="sister")=', soup.find_all("a", class_="sister"))

# For a multi-valued class attribute, class_ matches any one of its classes...
print('soup.find_all("p", class_="strikeout")=', soup.find_all("p", class_="strikeout"))
print('soup.find_all("p", class_="body")=', soup.find_all("p", class_="body"))
# ...or the exact attribute string as written in the markup.
print('soup.find_all("p", class_="body strikeout")=', soup.find_all("p", class_="body strikeout"))
# A CSS selector matches on several classes regardless of their order.
print('soup.select("p.strikeout.body")=', soup.select("p.strikeout.body"))

# Tag name plus the exact text the tag contains.
print('soup.find_all("a", string="Elsie1")=', soup.find_all("a", string="Elsie1"))
# `text` is the older name of the `string` argument, shown here only to
# demonstrate the equivalence. NOTE(review): recent Beautiful Soup releases
# treat `text` as deprecated - prefer `string` in new code.
print('soup.find_all("a", text="Elsie1")=', soup.find_all("a", text="Elsie1"))
print('--------------------------CSS selectors---------------------------')
# Selector syntax cheat sheet:
#   el#id                 element with a given id, e.g. div#logo
#   el.class              element with a given class, e.g. div.masthead
#   el[attr]              element carrying an attribute, e.g. a[href]
#   any combination       e.g. a[href].highlight
#   ancestor child        descendants at any depth, e.g. ".body p" finds every
#                         <p> under an element of class "body"
#   parent > child        direct children only, e.g. "div.content > p";
#                         "body > *" is every direct child of <body>
#   siblingA + siblingB   the sibling B immediately following A, e.g. div.head + div
#   siblingA ~ siblingX   every sibling X that follows A, e.g. h1 ~ p
#   el, el, el            selector list: elements matching any of the
#                         selectors, e.g. div.masthead, div.logo

# Selectors whose printed label wraps the selector in double quotes.
quoted_selectors = (
    # Find tags by name:
    "title",
    "p:nth-of-type(3)",         # the third <p> among its siblings
    # Find tags beneath other tags (any depth):
    "body a",
    "p a",
    "html head title",
    # Find tags directly beneath other tags:
    "body > a",                 # the sample's links all sit inside a <p>
    "p > a",
    "html > head > title",
    "p > a:nth-of-type(2)",
    "p > #link1",               # direct child selected by id
    # Find the siblings of tags:
    "#link1 ~ .sister",         # every later sibling with class "sister"
    "#link1 + .sister",         # only the immediately following sibling
    # Find tags by CSS class:
    ".sister",                  # any tag name, as long as the class matches
    "[class~=sister]",          # class attribute contains the word "sister"
    # Find tags by ID:
    "#link1",
    "a#link2",
    # Find tags that match any selector from a list of selectors:
    "#link1,#link2",
)
for css in quoted_selectors:
    print('soup.select("{}")='.format(css), soup.select(css))

# Selectors whose printed label shows the selector without surrounding quotes.
bare_selectors = (
    # Test for the existence of an attribute:
    'a[href]',
    # Find tags by attribute value:
    'a[href="http://example.com/elsie"]',   # exact value
    'a[href^="http://example.com/"]',       # value starts with the prefix
    'a[href$="tillie"]',                    # value ends with the suffix
    'a[href*=".com/el"]',                   # value contains the substring
)
for css in bare_selectors:
    print('soup.select({})='.format(css), soup.select(css))
# Attribute-presence filters: a value of True matches every tag that carries
# the attribute, whatever its value.
print('soup.find_all(attrs={"data-foo": True})=', soup.find_all(attrs={"data-foo": True}))
# -> [<div data-foo="abc">foo!</div>, <div data-foo="bcd">foo!</div>]
print('soup.find_all(attrs={"data-data": True})=', soup.find_all(attrs={"data-data": True}))
# -> [<div data-data="bcd">data-data-value!</div>]  (a result set, even for one match)

# find() takes the same filters but returns only the first match, not a list.
print('soup.find(attrs={"data-data": True})=', soup.find(attrs={"data-data": True}))
# -> <div data-data="bcd">data-data-value!</div>

# Index the returned tag like a dict to read an attribute value.
print('soup.find(attrs={"data-data": True})["data-data"]=', soup.find(attrs={"data-data": True})['data-data'])
# -> bcd

# find_all() versus find() with the same filter.
print('soup.find_all("a", class_="sister")=', soup.find_all("a", class_="sister"))
# -> all three "sister" links: link1 (Elsie1), link2 (Lacie), link3 (Tillie)
print('soup.find("a", class_="sister")=', soup.find("a", class_="sister"))
# -> only the first one, link1 (Elsie1)

# The two calls return different types.
print('type(soup.find_all("a", class_="sister"))=', type(soup.find_all("a", class_="sister")))  # <class 'bs4.element.ResultSet'>
print('type(soup.find("a", class_="sister"))=', type(soup.find("a", class_="sister")))  # <class 'bs4.element.Tag'>