Beautiful Soup 是一个可以从HTML或XML文件中提取数据的Python库.它能够通过你喜欢的转换器实现惯用的文档导航,查找,修改文档的方式.Beautiful Soup会帮你节省数小时甚至数天的工作时间。

Beautiful Soup也是解析网页内容最好的工具之一,解析内容大多是通过选择器来实现的。这两天刚好在做一些爬虫实验,遇到了一些瓶颈,干脆把Beautiful Soup从头再梳理一遍,后续会讲解一些爬虫实例。



代码示例

  1. #"""

  2. html_doc = """

  3. <html><head><title>The Dormouse's story</title></head>

  4. <body>

  5. <p class="title"><b>The Dormouse's story</b></p>

  6. <p class="story">Once upon a time there were three little sisters; and their names were

  7. <a href="http://example.com/elsie" class="sister" id="link1">Elsie1</a>,

  8. <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and

  9. <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;

  10. and they lived at the bottom of a well.</p>

  11. <div data-foo="abc">foo!</div>

  12. <div data-foo="bcd">foo!</div>

  13. <div data-data="bcd">data-data-value!</div>

  14. <p class="body strikeout"></p>

  15. <p class="story">...</p>

  16. """

  17. from bs4 import BeautifulSoup

  18. soup = BeautifulSoup(html_doc, 'lxml')

  19. print('soup=',soup)

  20. print('-----------------------------find_all-----------------------------')

  21. print('soup.find_all("title")=',soup.find_all("title")) #直接查找tag

  22. #[<title>The Dormouse's story</title>]

  23. print('soup.find_all("p", "title")=',soup.find_all("p", "title")) #直接查找tag和属性值

  24. #[<p class="title"><b>The Dormouse's story</b></p>]

  25. print('soup.find_all("a")=',soup.find_all("a")) #直接查找标签,返回<class 'bs4.element.ResultSet'>

  26. #[<a class="sister" href="http://example.com/elsie" id="link1">Elsie1</a>,

  27. #<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

  28. #<a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

  29. print('soup.find_all(id="link2")=',soup.find_all(id="link2")) #直接查找ID

  30. #[<a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>]

  31. print('soup.find_all(attrs={"data-foo": "abc"}=',soup.find_all(attrs={"data-foo": "abc"})) #直接查找属性和属性值

  32. print('soup.find_all("a", class_="sister")=',soup.find_all("a", class_="sister")) #直接查找tagclass

  33. print('soup.find_all("p", class_="strikeout")=',soup.find_all("p", class_="strikeout")) #same with last

  34. print('soup.find_all("p", class_="body")=',soup.find_all("p", class_="body")) #same with last

  35. print('soup.find_all("p", class_="body strikeout")=',soup.find_all("p", class_="body strikeout")) #same with last

  36. print('soup.select("p.strikeout.body")=',soup.select("p.strikeout.body")) #直接查找tagclass

  37. print('soup.find_all("a", string="Elsie1")=',soup.find_all("a", string="Elsie1")) #直接查找tag包含的text

  38. print('soup.find_all("a", text="Elsie1")=',soup.find_all("a", text="Elsie1")) #same with last(text是string参数的旧名称)

  39. print('--------------------------CSS selectors---------------------------')

  40. #el#id: 元素+ID,比如: div#logo

  41. #el.class: 元素+class,比如: div.masthead

  42. #el[attr]: 元素+属性,比如: a[href]

  43. #任意组合,比如:a[href].highlight

  44. #ancestor child: 查找某个元素下的后代元素,比如:可以用.body p 查找在class为"body"的元素下的所有p元素

  45. #parent > child: 查找某个父元素下的直接子元素,比如:可以用div.content > p 查找 p 元素,也可以用body > * 查找body标签下所有直接子元素

  46. #siblingA + siblingB: 查找紧跟在A元素之后的第一个同级元素B,比如:div.head + div

  47. #siblingA ~ siblingX: 查找A元素之后的所有同级X元素,比如:h1 ~ p

  48. #el, el, el:多个选择器组合,查找匹配任一选择器的元素(结果不重复),例如:div.masthead, div.logo

  49. #You can find tags:

  50. print('soup.select("title")=',soup.select("title")) #直接查找tag

  51. print('soup.select("p:nth-of-type(3)")=',soup.select("p:nth-of-type(3)")) #查找同级元素中第N个该类型的tag

  52. #Find tags beneath other tags:

  53. print('soup.select("body a")=',soup.select("body a")) #直接查找tag下所有的tag

  54. print('soup.select("p a")=',soup.select("p a")) #same withlast

  55. print('soup.select("html head title")=',soup.select("html head title")) #same withlast

  56. #Find tags directly beneath other tags:

  57. print('soup.select("body > a")=',soup.select("body > a")) #直接查找tag下直接的tag

  58. print('soup.select("p > a")=',soup.select("p > a")) #same withlast

  59. print('soup.select("html > head > title")=',soup.select("html > head > title")) #same withlast

  60. print('soup.select("p > a:nth-of-type(2)")=',soup.select("p > a:nth-of-type(2)")) #same withlast

  61. print('soup.select("p > #link1")=',soup.select("p > #link1")) #直接查找tag下的ID

  62. #Find the siblings of tags:

  63. print('soup.select("#link1 ~ .sister")=',soup.select("#link1 ~ .sister")) #查找A元素之后的所有同级X元素

  64. print('soup.select("#link1 + .sister")=',soup.select("#link1 + .sister")) #查找紧跟在A元素之后的第一个同级元素B

  65. #Find tags by CSS class:

  66. print('soup.select(".sister")=',soup.select(".sister")) #直接查找所有class内容,不匹配tag

  67. print('soup.select("[class~=sister]")=',soup.select("[class~=sister]")) #查找class属性中包含sister这个词的元素,效果同.sister

  68. #Find tags by ID:

  69. print('soup.select("#link1")=',soup.select("#link1")) #查找ID

  70. print('soup.select("a#link2")=',soup.select("a#link2")) #same with last

  71. #Find tags that match any selector from a list of selectors:

  72. print('soup.select("#link1,#link2")=',soup.select("#link1,#link2")) #same with last

  73. #Test for the existence of an attribute:

  74. print('soup.select(a[href])=',soup.select('a[href]')) #直接查找tag和属性

  75. #Find tags by attribute value:

  76. print('soup.select(a[href="http://example.com/elsie"])=',soup.select('a[href="http://example.com/elsie"]')) #直接查找tag和属性及属性值

  77. print('soup.select(a[href^="http://example.com/"])=',soup.select('a[href^="http://example.com/"]')) #利用属性名前缀来查找元素

  78. print('soup.select(a[href$="tillie"])=',soup.select('a[href$="tillie"]')) #利用属性后缀查找元素

  79. print('soup.select(a[href*=".com/el"])=',soup.select('a[href*=".com/el"]')) #利用属性模糊匹配查找元素

  80. print('soup.find_all(attrs={"data-foo": True}=',soup.find_all(attrs={"data-foo":True})) #直接查找带属性名称的元素

  81. #[<div data-foo="abc">foo!</div>, <div data-foo="bcd">foo!</div>]

  82. print('soup.find_all(attrs={"data-data": True}=',soup.find_all(attrs={"data-data":True})) #same with last,返回集合

  83. #[<div data-data="bcd">data-data-value!</div>]

  84. print('soup.find(attrs={"data-data": True})=',soup.find(attrs={"data-data":True})) #直接查找带属性名称的元素返回单条

  85. #<div data-data="bcd">data-data-value!</div>

  86. print('soup.find(attrs={"data-data": True})["data-data"]=',soup.find(attrs={"data-data":True})['data-data']) #直接查找带属性名称的元素返回单条,并返回属性值

  87. print('soup.find_all("a", class_="sister")=',soup.find_all("a", class_="sister"))

  88. #[<a class="sister" href="http://example.com/elsie" id="link1">Elsie1</a>,

  89. # <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,

  90. # <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>]

  91. print('soup.find("a", class_="sister")=',soup.find("a", class_="sister"))

  92. #<a class="sister" href="http://example.com/elsie" id="link1">Elsie1</a>

  93. print('type(soup.find_all("a", class_="sister"))=',type(soup.find_all("a", class_="sister"))) #<class 'bs4.element.ResultSet'>

  94. print('type(soup.find("a", class_="sister"))=',type(soup.find("a", class_="sister"))) #<class 'bs4.element.Tag'>