python BeautifulSoup庫的常用操作
BeautifulSoup庫
0、所有方法都有的
from bs4 import BeautifulSoup

# Shared sample document passed to the first few demos (the later demos
# define their own HTML). <body> and <html> are left unclosed; demo01 relies
# on the parser to complete them.
# Each anchor carries a single href: demo03 reads the "href" attribute of the
# first <a>, so the attribute must be present.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>

<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>

<p class="story">...</p>
"""
1、基本用法
def demo01(html_doc):
    """Basic usage: parse *html_doc*, then print the tidied markup and title.

    Args:
        html_doc: HTML markup (str) to parse.
    """
    # Parsing with the "lxml" backend fills in tags missing from the input.
    soup = BeautifulSoup(html_doc, "lxml")

    # prettify() renders the repaired tree as indented markup.
    pretty_markup = soup.prettify()
    print(pretty_markup)

    # Text content of the <title> tag.
    title_text = soup.title.string
    print(title_text)
2、節點選擇器
def demo02(html_doc):
    """Node selectors: reach tags as attributes of the soup and print them.

    Args:
        html_doc: HTML markup (str) to parse.
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # Attribute access (soup.<tagname>) selects the first tag with that name.
    title_tag = soup.title
    print(title_tag)           # the whole <title> element
    print(type(title_tag))     # the Python type of a selected node
    print(title_tag.string)    # the text inside <title>

    print(soup.head)           # the whole <head> element

    first_paragraph = soup.p
    print(first_paragraph)         # first <p> in the document
    print(type(first_paragraph))   # same node type as for <title>

    # Only the first <a> is returned even when the document has several.
    print(soup.a)
3、提取節點信息
def demo03(html_doc):
    """Extract node information: name, attributes and text of the first <a>.

    Args:
        html_doc: HTML markup (str) to parse; expected to contain an <a> tag.
    """
    soup = BeautifulSoup(html_doc, "lxml")
    tag = soup.a  # first <a> element in the document

    # 1. Tag name.
    print(tag.name)

    # 2. Attribute values. tag.attrs is a dict; .get() is used so that an
    #    anchor lacking one of the attributes prints None instead of raising
    #    KeyError. "class" can hold several values, so it comes back as a list.
    print("class值為: ", tag.attrs.get("class"))
    print("href值為: ", tag.attrs.get("href"))

    # 3. Text content.
    print(tag.string)
4、獲取子節點信息
def demo04(html_doc):
    """Child-node access: chain tag names to walk from <head> down to its text.

    Args:
        html_doc: HTML markup (str) to parse.
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # 1. The <head> element.
    head_tag = soup.head
    print(head_tag)

    # 2. The <title> element nested inside <head>.
    print(soup.head.title)

    # 3. The text inside that <title>.
    print(soup.head.title.string)
5、關聯選擇
1、獲取子節點--contents
def demo05():
    """Related selection (1): list a node's direct children with ``.contents``.

    Often the wanted node cannot be selected in one step; instead a node is
    selected first and its children, parents or siblings are reached from it.
    This demo prints ``.contents`` of the first <p> for two documents that
    differ only in whether a line break precedes the closing </p>.
    """
    # First <p> on a single line: nothing sits between </b> and </p>.
    # The markup is flush-left so no indentation leaks into the text nodes.
    html_doc01 = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b></p>

<p class="story">...</p>
"""

    # Same document, but </p> is moved to its own line.
    html_doc02 = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title" name="Dormouse"><b>The Dormouse's story</b>
</p>

<p class="story">...</p>
"""

    # .contents is a list of the direct children of the tag.
    first_soup = BeautifulSoup(html_doc01, "lxml")
    print(first_soup.p.contents)

    # Here the line break before </p> shows up as an extra text child.
    second_soup = BeautifulSoup(html_doc02, "lxml")
    print(second_soup.p.contents)
2、獲取子節點--children
def demo06():
    """Related selection (2): iterate a node's direct children with ``.children``.

    Prints the ``.children`` object itself, then the children as a list, then
    each child on its own line.
    """
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="story">
        Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
        <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
        <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
        and they lived at the bottom of a well.
    </p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # .children is an iterator, so printing it shows the iterator object,
    # not the nodes.
    print(soup.p.children)

    # Materialised: the text runs and the three <a> tags, in document order.
    print(list(soup.p.children))

    for item in soup.p.children:
        print(item)
3、獲取子孫節點--descendants
def demo07():
    """Related selection (3): walk all nested nodes with ``.descendants``.

    Unlike ``.children``, ``.descendants`` also yields grandchildren, e.g. the
    <span> inside the first <a> and the text inside that <span>.
    """
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span>Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # .descendants is a generator; printing it shows the generator object.
    print(soup.p.descendants)

    # Materialised: every tag is followed by its own contents, so the first
    # <a> is followed by its <span>, the span's text, then the anchor's text.
    print(list(soup.p.descendants))
4、獲取父節點--parent、祖先節點--parents
def demo08():
    """Related selection (4): go upwards with ``.parent`` and ``.parents``.

    ``.parent`` is the directly enclosing node; ``.parents`` iterates over
    every ancestor, from the nearest one outwards.
    """
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="story">
        Once upon a time there were three little sisters; and their names were
        <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>
        <p>
            <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a>
        </p>
    </p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # Parent of the first <p>: per the original author's note this prints the
    # whole <body>, including the nested <p> and <a> elements.
    print(soup.p.parent)

    # Parent of the first <a>: the <p> that contains it (with the <a> itself).
    print(soup.a.parent)
    print("=======================")

    # .parents is a generator; printing it shows the generator object.
    print(soup.a.parents)

    # Index 0 is the nearest ancestor; later indexes move outwards.
    for i, parent in enumerate(soup.a.parents):
        print(i, parent)
5、獲取兄弟節點
def demo09():
    """Related selection (5): move sideways between sibling nodes.

    Attributes demonstrated on the first <a>:
        1. next_sibling      -- the node right after it
        2. previous_sibling  -- the node right before it
        3. next_siblings     -- generator over all following siblings
        4. previous_siblings -- generator over all preceding siblings
    """
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>hello
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>
    <a href="http://example.com/a" class="sister" id="link3">a</a>
    <a href="http://example.com/b" class="sister" id="link3">b</a>
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html_doc, "lxml")
    first_link = soup.a

    # 1. next_sibling: the text node that follows the first <a> ("hello"
    #    plus the whitespace up to the next tag).
    print(first_link.next_sibling)

    # 2. next_siblings: a generator; wrap it in list() to see the nodes.
    print(first_link.next_siblings)

    # 3. previous_sibling: the text that precedes the first <a>
    #    ("Once upon a time ... their names were").
    print(first_link.previous_sibling)

    # 4. previous_siblings: a generator; wrap it in list() to see the nodes.
    print(first_link.previous_siblings)
6、方法選擇器
1、find_all()
def demo10():
    """Method selectors: ``find_all()`` returns every match as a list.

    find_all(name=None, attrs={}, recursive=True, string=None, limit=None)
        name:      tag name to look for
        attrs:     dict of attribute filters
        recursive: search all descendants when True (the default)
        string:    match on text content instead of on tags
        limit:     maximum number of results to return
    """
    # The first anchor carries two classes ("sister hi") so that the
    # class_="hi" lookup below has something to match.
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister hi" id="link1">Elsie</a>,
    <a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
    <a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # 1. Basic use: every <a> tag in the document.
    print(soup.find_all("a"))

    # 2. Attribute lookup: elements whose class includes "sister".
    print(soup.find_all(attrs={"class": "sister"}))
    # Same query via the class_ keyword ("class" is a Python reserved word).
    print(soup.find_all(class_="sister"))
    # class is multi-valued, so a single class name matches any element that
    # lists it; here that is only the first anchor.
    print(soup.find_all(class_="hi"))

    # 3. Text lookup: text nodes equal to "Elsie".
    print(soup.find_all(string="Elsie"))
2、find()
def demo11():
    """Method selectors: ``find()`` returns a single element.

    Where ``find_all()`` returns a list of all matches, ``find()`` returns
    only the first one.
    """
    html_doc = """
    <html><head><title>The Dormouse's story</title></head>
    <body>
    <p class="title" name="Dormouse"><b>The Dormouse's story</b></p>
    <p class="story">Once upon a time there were three little sisters; and their names were
    <a href="http://example.com/elsie" class="sister" id="link1"><span>Elsie</span></a>,
    <a href="http://example.com/lacie" class="sister" id="link2"><span>Lacie</span></a> and
    <a href="http://example.com/tillie" class="sister" id="link3"><span>Tillie</span></a>;
    and they lived at the bottom of a well.</p>
    <p class="story">...</p>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # Prints the first <a> (id="link1") together with its nested <span>.
    print(soup.find("a"))
3、其他方法選擇器
'''
其他方法選擇器
find_parents(): 返回所有的祖先節點
find_parent(): 返回當前節點的父節點
find_next_siblings(): 返回當前節點后面的所有兄弟節點
find_previous_siblings(): 返回當前節點前面的所有兄弟節點
find_next_sibling(): 返回當前節點后面的第一個符合條件的兄弟節點
find_previous_sibling(): 返回當前節點前面的第一個符合條件的兄弟節點
'''
7、CSS選擇器--select()
def demo12():
    """CSS selectors: query the tree with ``select()``.

    ``select()`` takes a CSS selector string and returns a list of matching
    tags.
    """
    html_doc = """
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello World</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-samll" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
        </div>
        </div>
    </div>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # 1. By class: the node with class "panel-heading".
    heading_nodes = soup.select(".panel-heading")
    print(heading_nodes)

    # 2. Descendant selector: every <li> below any <ul> (six items here).
    print(soup.select("ul li"))

    # 3. By id plus descendant: the <li> items under id="list-2" only.
    print(soup.select("#list-2 li"))

    # 4. By tag name: both <ul> elements.
    ul_nodes = soup.select("ul")
    print(ul_nodes)

    # Each item of a select() result is a regular tag object.
    print(type(soup.select('ul')[0]))
說明:
# 1、查詢所有的子孫節點
在 select(css)中的 css 有多個節點時,節點元素之間用空格分開,就是查找子孫節點,
例如 soup.select("div p")是查找所有<div>節點下面的所有子孫<p>節點。
# 2、只查直接的子節點,不查孫節點
節點元素之間用" > "分開(注意>的前后至少包含一個空格),就是查找直接子節點:
例如 soup.select("div > p")是查找所有<div>節點下面的所有直接子節點<p>,不包含孫節點。
# 3、查找某個節點同級別的某類節點
用" ~ "連接兩個節點表示查找前一個節點后面的所有同級別的兄弟節點(注意~號前后至少有一個空格),
例如 soup.select("div ~ p")查找<div>后面的所有同級別的<p>兄弟節點。
# 4、查找同級別某個節點后緊鄰的某類節點
用" + "連接兩個節點表示查找緊接在前一個節點后面的那個同級別的兄弟節點(注意+號前后至少有一個空格):
例如 soup.select("div + p")查找緊接在<div>后面的同級別的<p>兄弟節點。
8、嵌套選擇--select()
def demo13():
    """Nested selection: call ``select()`` again on a tag returned by ``select()``.

    For each <ul> in the document, prints the list of <li> items found
    inside that particular <ul>.
    """
    html_doc = """
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello World</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-samll" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
        </div>
        </div>
    </div>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    # One printed list per <ul>; each holds that list's Foo/Bar/Jay items.
    for ul_tag in soup.select("ul"):
        items = ul_tag.select("li")
        print(items)
9、獲取屬性
def demo14():
    """Reading attributes: two equivalent ways to get a tag's attribute value.

    For each <ul>, prints its ``id`` once via subscript on the tag and once
    via the tag's ``attrs`` dict.
    """
    html_doc = """
    <div class="panel">
        <div class="panel-heading">
            <h4>Hello World</h4>
        </div>
        <div class="panel-body">
            <ul class="list" id="list-1">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
            <ul class="list list-samll" id="list-2">
                <li class="element">Foo</li>
                <li class="element">Bar</li>
                <li class="element">Jay</li>
            </ul>
        </div>
        </div>
    </div>
    """
    soup = BeautifulSoup(html_doc, "lxml")

    for ul_tag in soup.select("ul"):
        # Way 1: subscript the tag directly.
        print(ul_tag["id"])
        # Way 2: go through the attrs dict.
        print(ul_tag.attrs["id"])
以上就是python BeautifulSoup庫的常用操作的詳細內容,更多關於python BeautifulSoup的資料請關注腳本之家其它相關文章!
相關(guān)文章
Python中__new__與__init__方法的區別詳解
這篇文章主要介紹了Python中__new__與__init__方法的區別,是Python學習中的基礎知識,需要的朋友可以參考下
2015-05-05
python opencv實現旋轉矩形框裁減功能
這篇文章主要為大家詳細介紹了python opencv實現旋轉矩形框裁減功能,具有一定的參考價值,感興趣的小伙伴們可以參考一下
2018-07-07
pycharm使用Translation插件實現翻譯功能
PyCharm是一款很流行的Python編輯器,經常遇到在PyCharm中把中文翻譯成英文的需求,下面這篇文章主要給大家介紹了關於pycharm使用Translation插件實現翻譯功能的相關資料,需要的朋友可以參考下
2023-05-05
Python實現隨機森林回歸與各自變量重要性分析與排序
這篇文章主要為大家詳細介紹了在Python環境中,實現隨機森林(Random Forest,RF)回歸與各自變量重要性分析與排序的過程,感興趣的小伙伴可以了解一下
2023-02-02