BeautifulSoup库
0、所有方法共用的HTML文本
from bs4 import BeautifulSoup
# Shared sample document: the first few demo functions all take this HTML
# text as their argument (the later demos define their own inline HTML).
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
1、根本用法
'''
根本用法demo1
'''
def demo01(html_doc):
    """Basic usage: parse html_doc with the lxml parser and inspect the result.

    The parser auto-completes any tags missing from the fragment.
    """
    parsed = BeautifulSoup(html_doc, "lxml")
    # Pretty-print the repaired, properly indented markup.
    print(parsed.prettify())
    # Text content of the <title> tag after parsing.
    print(parsed.title.string)
2、节点挑选器
'''
节点挑选器demo2
'''
def demo02(html_doc):
    """Node selectors: pick tags via attribute access on the soup object."""
    doc = BeautifulSoup(html_doc, 'lxml')
    # First <title> tag -> <title>The Dormouse's story</title>
    print(doc.title)
    # Selected nodes are Tag objects -> <class 'bs4.element.Tag'>
    print(type(doc.title))
    # .string yields the enclosed text -> The Dormouse's story
    print(doc.title.string)
    # The whole <head> element
    print(doc.head)
    # First <p> element in the document
    print(doc.p)
    print(type(doc.p))
    # Attribute access always returns the FIRST matching tag
    print(doc.a)
3、提取节点信息
'''
提取节点信息demo3
'''
def demo03(html_doc):
    """Extract node information: tag name, attributes and enclosed text."""
    doc = BeautifulSoup(html_doc, "lxml")
    # First <a>: <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
    first_link = doc.a
    # 1. Tag name -> a
    print(first_link.name)
    # 2. Attribute values; class is multi-valued in HTML, so it comes back
    #    as a list: ['sister'], while href is a plain string.
    print("class值为: ", first_link.attrs["class"])
    print("href值为: ", first_link.attrs["href"])
    # 3. Enclosed text -> Elsie
    print(first_link.string)
4、获取子节点信息
'''
获取子节点信息demo4
'''
def demo04(html_doc):
    """Chained attribute access walks down through child nodes."""
    doc = BeautifulSoup(html_doc, 'lxml')
    head = doc.head
    # 1. Whole <head> element -> <head><title>The Dormouse's story</title></head>
    print(head)
    # 2. <title> inside <head> -> <title>The Dormouse's story</title>
    print(head.title)
    # 3. Text below <head><title> -> The Dormouse's story
    print(head.title.string)
5、相关挑选
1、获取子节点–contents
'''
相关挑选demo05--01--下级节点
运用contents特点进行获取--获取子节点
介绍:
在做挑选的时候,有时候不能做到一步就获取到我想要的节点元素,需求选取某一个节点元素,
然后以这个节点为基准再选取它的子节点、父节点、兄弟节点等
'''
def demo05():
    """Associated selection, part 1: child nodes via the .contents attribute.

    Sometimes one selection step is not enough; you pick a node first and
    then navigate to its children / parents / siblings from there.
    """
    # First variant: the closing </p> sits on the same line as <b>.
    html_doc01 = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">...</p>
"""
    # Second variant: identical, except </p> moved onto its own line.
    html_doc02 = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b>
</p>
<p class="story">...</p>
"""
    first = BeautifulSoup(html_doc01, "lxml")
    # -> [<b>The Dormouse's story</b>]
    print(first.p.contents)
    second = BeautifulSoup(html_doc02, "lxml")
    # The line break before </p> becomes an extra '\n' text node:
    # -> [<b>The Dormouse's story</b>, '\n']
    print(second.p.contents)
2、获取子节点–children
'''
相关挑选demo06--02--下级节点
运用children特点进行获取--获取子节点
'''
def demo06():
    """Associated selection, part 2: child nodes via the .children attribute."""
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # .children is lazy -- printing it shows a list_iterator object
    print(soup.p.children)
    # Materialize it to see the alternating text nodes and <a> tags
    print(list(soup.p.children))
    # Or walk it one child at a time
    for child in soup.p.children:
        print(child)
3、获取后代节点–descendants
'''
相关挑选demo07--03--下级节点
运用descendants特点进行获取--获取后代节点(获取:子节点和孙节点的内容)
'''
def demo07():
    """Associated selection, part 3: all descendants via .descendants.

    Unlike .children, this also yields grandchildren -- e.g. the <span>
    inside the first <a>, plus the text nodes within both.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span>Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # A generator object, not a list
    print(soup.p.descendants)
    # Flattened depth-first view: each tag is followed by its inner nodes,
    # so <a>...<span>Elsie</span>Elsie</a> contributes the <a>, the <span>,
    # and both 'Elsie' text nodes.
    print(list(soup.p.descendants))
4、获取父节点–parent、先人节点–parents
'''
相关挑选demo08--01--上级节点
运用parent特点进行获取--获取父节点
运用parents特点进行获取--获取先人节点
'''
def demo08():
    """Associated selection, upward: parent (.parent) and ancestors (.parents)."""
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">
Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
<p>
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
</p>
</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # Parent of the first <p> is <body>: prints everything inside <body>,
    # including the nested <p> and <a> tags.
    print(soup.p.parent)
    # Parent of the first <a> is its enclosing <p>, text content included.
    print(soup.a.parent)
    print("=======================")
    # .parents is a generator over every ancestor up to the document root
    print(soup.a.parents)
    for depth, ancestor in enumerate(soup.a.parents):
        print(depth, ancestor)
5、获取兄弟节点
'''
相关挑选demo09--兄弟节点
# 能够运用的特点有:
1、next_sibling
2、previous_sibling
3、next_siblings
4、previous_siblings
'''
def demo09():
    """Associated selection, sideways: sibling navigation.

    Attributes covered:
      next_sibling / previous_sibling   -- single adjacent sibling
      next_siblings / previous_siblings -- generators over all siblings
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>hello
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
<a href="http://example.com/a" class="sister" id="link3">a</a>
<a href="http://example.com/b" class="sister" id="link3">b</a>
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # 1. next_sibling: the text node right after the first <a> -> hello
    print(soup.a.next_sibling)
    # 2. next_siblings: a generator over everything after the first <a>
    print(soup.a.next_siblings)
    # 3. previous_sibling: the text node right before the first <a>
    #    -> Once upon a time there were three little sisters; and their names were
    print(soup.a.previous_sibling)
    # 4. previous_siblings: a generator over everything before the first <a>
    print(soup.a.previous_siblings)
6、办法挑选器
1、find_all()
'''
办法挑选器 -- find_all() -- 以列表方式回来多个元素
find_all(name, attrs={}, recursive=True, string, limit)
# 1、name: 标签的称号--查找标签
# 2、attrs: 特点过滤器字典
# 3、recursive: 递归查找一个元素的后代元素们,默认为True
# 4、string:查找文本
# 5、limit: 查找成果的个数约束
'''
def demo10():
    """Method selector find_all(): returns every match as a list.

    Signature: find_all(name, attrs={}, recursive=True, string, limit)
      name      -- tag name to search for
      attrs     -- attribute filter dict
      recursive -- search descendants recursively (default True)
      string    -- match by text content
      limit     -- cap on the number of results

    Fixed: the first <a> now carries class="sister hi" (it used to be just
    "sister"), so the find_all(class_="hi") demo below actually finds it --
    matching the expected output shown in the comments.
    """
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister hi" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # 1. Basic use: all <a> tags.
    # Result: [
    #   <a class="sister hi" href="http://example.com/elsie" id="link1">Elsie</a>,
    #   <a class="sister" href="http://example.com/lacie" id="link2">Lacie</a>,
    #   <a class="sister" href="http://example.com/tillie" id="link3">Tillie</a>
    # ]
    print(soup.find_all("a"))
    # 2. Attribute search: elements whose class contains "sister".
    print(soup.find_all(attrs={"class": "sister"}))
    # Same thing via the class_ keyword (class is a Python reserved word).
    print(soup.find_all(class_="sister"))
    # Multi-valued class attributes match on any single class, so this
    # finds the first <a> (class="sister hi").
    print(soup.find_all(class_="hi"))
    # 3. Text search: text nodes equal to "Elsie".
    print(soup.find_all(string="Elsie"))
2、find()
'''
办法挑选器 -- find() -- 回来单个元素【一般是回来第一个元素作为成果】
'''
def demo11():
    """Method selector find(): returns only the FIRST matching element."""
    html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,
<a href="http://example.com/lacie" class="sister" id="link2"><span>Lacie</span></a> and
<a href="http://example.com/tillie" class="sister" id="link3"><span>Tillie</span></a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # -> <a class="sister" href="http://example.com/elsie" id="link1"><span>Elsie</span></a>
    print(soup.find("a"))
3、其他办法挑选器
'''
其他办法挑选器
find_parents(): 返回当前节点的所有祖先节点
find_parent(): 返回当前节点的父节点
find_next_siblings():返回当前节点后面的所有兄弟节点
find_previous_siblings():返回当前节点前面的所有兄弟节点
find_next_sibling():返回当前节点后面的相邻的那个兄弟节点
find_previous_sibling():返回当前节点前面的相邻的那个兄弟节点
'''
7、CSS挑选器–select()
'''
CSS挑选器 -- select()办法
'''
def demo12():
    """CSS selectors via the select() method."""
    html_doc = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    soup = BeautifulSoup(html_doc, "lxml")
    # 1. Class selector: nodes with class "panel-heading"
    # -> [<div class="panel-heading"> ... <h4>Hello World</h4> ... </div>]
    print(soup.select(".panel-heading"))
    # 2. Descendant selector: every <li> under any <ul> (all six)
    print(soup.select("ul li"))
    # 3. ID selector: the three <li> under the element with id "list-2"
    print(soup.select("#list-2 li"))
    # 4. Tag selector: both <ul> elements in full
    print(soup.select("ul"))
    # select() returns Tag objects -> <class 'bs4.element.Tag'>
    print(type(soup.select('ul')[0]))
阐明:
# 1、查询一切的后代节点
在 select(css)中的 css 有多个节点时,节点元素之间用空格分开,便是查找后代节点,
例如 soup.select(“div p”)是查找一切<div>节点下面的一切后代<p>节点。
# 2、只查直接的子节点,不查孙节点
节点元素之间用" > "分开(留意>的前后至少包括一个空格),便是查找直接子节点:
# 例如 soup.select(“div > p”)是查找一切<div>节点下面的一切直接子节点<p>,不包括孙节点。
# 3、查找某个节点同级其他某类节点
用" ~ "衔接两个节点表明查找前一个节点后边的一切同级其他兄弟节点(留意~号前后至少有一个空格),
例如 soup.select(“div ~ p”)查找<div>后边的一切同级其他<p>兄弟节点。
# 4、查找同等级某个节点后的第一个某类节点
用" + "衔接两个节点表明查找前一个节点后边的第一个同级其他兄弟节点(留意+号前后至少有一个空格):
例如 soup.select(“div + p”)查找<div>后边的第一个同级其他<p>兄弟节点。
8、嵌套挑选–select()
'''
嵌套挑选 -- select( )办法
'''
def demo13():
    """Nested selection: select() works on a Tag, not just on the soup."""
    html_doc = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    soup = BeautifulSoup(html_doc, 'lxml')
    # Each iteration prints that <ul>'s three <li> elements as a list:
    # [<li class="element">Foo</li>, <li class="element">Bar</li>, <li class="element">Jay</li>]
    for ul_tag in soup.select('ul'):
        print(ul_tag.select('li'))
9、获取特点
'''
获取特点(两种办法)
'''
def demo14():
    """Two ways to read a tag attribute: subscripting and the .attrs dict."""
    html_doc = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    soup = BeautifulSoup(html_doc, 'lxml')
    for ul_tag in soup.select('ul'):
        # tag['id'] and tag.attrs['id'] are equivalent
        print(ul_tag['id'])
        print(ul_tag.attrs['id'])
10、获取文本
'''
获取文本(两种方式)
'''
def demo15():
    """Two ways to read a tag's text: the .string property and get_text()."""
    html_doc = """
<div class="panel">
<div class="panel-heading">
<h4>Hello World</h4>
</div>
<div class="panel-body">
<ul class="list" id="list-1">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
<ul class="list list-samll" id="list-2">
<li class="element">Foo</li>
<li class="element">Bar</li>
<li class="element">Jay</li>
</ul>
</div>
</div>
</div>
"""
    soup = BeautifulSoup(html_doc, 'lxml')
    for item in soup.select('li'):
        print('String:', item.string)
        print('get text:', item.get_text())
参阅链接
1、Python爬虫:史上最详细的BeautifulSoup教程