基于python實現垂直爬蟲系統的方法詳解
html_downloader
from urllib import request


def download(url):
    """Fetch *url* and return the raw response body as bytes.

    Returns None when the URL is missing or the server does not answer
    with HTTP 200.
    """
    if url is None:
        return None  # original used a bare `return`; make the None explicit
    response = request.urlopen(url)
    try:
        # Anything other than 200 counts as a failed download.
        if response.getcode() != 200:
            return None
        return response.read()
    finally:
        response.close()  # original leaked the connection object
html_outputer
import html

# Records collected by the crawler; rendered to output.html at the end.
data_list = []


def collect_data(data):
    """Queue one crawled record (dict with url/title/datetime/visitcount keys)."""
    data_list.append(data)


def output_html():
    """Write every collected record into output.html as one HTML table."""
    # `with` + explicit encoding: the handle is closed even on error, and
    # Chinese titles are written correctly regardless of platform default.
    with open('output.html', 'w', encoding='utf-8') as fout:
        fout.write('<html>')
        fout.write('<body>')
        fout.write('<table>')
        for dataitem in data_list:
            fout.write('<tr>')
            # Escape scraped values so markup inside a title cannot break
            # (or inject into) the generated page.
            for key in ('url', 'title', 'datetime', 'visitcount'):
                fout.write('<td>%s</td>' % html.escape(str(dataitem[key])))
            fout.write('</tr>')
        fout.write('</table>')
        fout.write('</body>')
        fout.write('</html>')
html_parser
import re
from urllib.parse import urljoin

# Article links on this site look like /<yyyy>/<mmdd>/<slug>/page.htm.
# Compiled once instead of on every call.
_ARTICLE_HREF = re.compile(r"/\d+/\d+/\w+/page\.htm")


def get_new_urls(page_url, soup):
    """Return the set of absolute article URLs linked from *soup*."""
    new_urls = set()
    for link in soup.find_all('a', href=_ARTICLE_HREF):
        # Hrefs are site-relative; resolve them against the current page.
        new_urls.add(urljoin(page_url, link['href']))
    return new_urls


def get_new_data(page_url, soup):
    """Extract title / datetime / visitcount from an article page.

    Returns an empty dict when the page is not an article (no title node).
    Missing secondary nodes fall back to '' -- the original guarded only
    the title and raised AttributeError on pages without the other spans.
    """
    res_data = {}
    title_node = soup.find('h1', class_='arti-title')
    if title_node is None:
        return res_data
    res_data['title'] = title_node.get_text()
    datetime_node = soup.find('span', class_='arti-update')
    res_data['datetime'] = datetime_node.get_text() if datetime_node else ''
    visitcount_node = soup.find('span', class_='WP_VisitCount')
    res_data['visitcount'] = visitcount_node.get_text() if visitcount_node else ''
    res_data['url'] = page_url
    return res_data


def parse(page_url, html_cont):
    """Parse *html_cont* and return (new_urls, new_data).

    Always returns a 2-tuple: the original returned a bare None on bad
    input, which crashed callers that unpack two values.
    """
    if page_url is None or html_cont is None:
        return set(), {}
    # Imported lazily: the guard path and the soup-taking helpers do not
    # need bs4, so the module stays importable without it.
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html_cont, 'html.parser', from_encoding='utf-8')
    return get_new_urls(page_url, soup), get_new_data(page_url, soup)
spider_main
import urls_manager, html_downloader, \
    html_parser, html_outputer

# NOTE(review): the original paste contained this entire script TWICE
# (duplicate imports, duplicate craw definition, crawl executed twice
# under __main__); the duplicate copy is removed here.


def craw(root_url):
    """Crawl up to 10 pages starting from *root_url*, then render output.html."""
    count = 1
    urls_manager.add_new_url(root_url)
    # Crawl loop: stop when the frontier is empty or 10 pages are done.
    while urls_manager.has_new_url():
        new_url = urls_manager.get_new_url()
        print('craw %d : %s' % (count, new_url))
        html_cont = html_downloader.download(new_url)
        new_urls, new_data = html_parser.parse(new_url, html_cont)
        urls_manager.add_new_urls(new_urls)
        if new_data:
            html_outputer.collect_data(new_data)
        if count == 10:
            break
        count = count + 1
    html_outputer.output_html()


if __name__ == '__main__':
    root_url = 'http://news.zzuli.edu.cn/'
    craw(root_url)
test_64
import re

from bs4 import BeautifulSoup

# Example document from the BeautifulSoup documentation.  The href
# attributes were dropped in the original paste, which made every
# link['href'] lookup raise KeyError -- restored here so the demo runs.
html_doc = """
<html><head><title>The Dormouse's story</title></head>
<body>
<p class="title"><b>The Dormouse's story</b></p>
<p class="story">Once upon a time there were three little sisters; and their names were
<a href="http://example.com/elsie" class="sister" id="link1">Elsie</a>,
<a href="http://example.com/lacie" class="sister" id="link2">Lacie</a> and
<a href="http://example.com/tillie" class="sister" id="link3">Tillie</a>;
and they lived at the bottom of a well.</p>
<p class="story">...</p>
"""
soup = BeautifulSoup(html_doc, 'html.parser')

print('獲取所有鏈接')
for link in soup.find_all('a'):
    print(link.name, link['href'], link.get_text())

print('獲取lacie鏈接')
# Look the link up by its exact href: the pasted `soup.find('a', )`
# simply returned the FIRST anchor (Elsie), not Lacie.
link_node = soup.find('a', href='http://example.com/lacie')
print(link_node.name, link_node['href'], link_node.get_text())

print('正則匹配')
link_node = soup.find('a', href=re.compile(r'ill'))
print(link_node.name, link_node['href'], link_node.get_text())

print('獲取P段落文字')
p_node = soup.find('p', class_='title')
print(p_node.name, p_node.get_text())
urls_manager
# Crawl frontier: URLs still waiting to be fetched vs. URLs already
# handed out to the crawler.
new_urls = set()
old_urls = set()


def add_new_url(url):
    """Queue *url* unless it is None or already known (pending or visited)."""
    if url is None:
        return
    if url in new_urls or url in old_urls:
        return
    new_urls.add(url)


def add_new_urls(urls):
    """Queue every URL in *urls*; tolerates None and empty collections."""
    if not urls:
        return
    for candidate in urls:
        add_new_url(candidate)


def get_new_url():
    """Pop an arbitrary pending URL and record it as visited."""
    url = new_urls.pop()
    old_urls.add(url)
    return url


def has_new_url():
    """Return True while at least one URL is still pending."""
    return bool(new_urls)
總結(jié)
本篇文章就到這里了,希望能夠給你帶來幫助,也希望您能夠多多關注腳本之家的更多內容!
相關(guān)文章
matplotlib實現矩陣和圖像的可視化表示
這篇文章主要為大家詳細介紹了如何利用matplotlib實現矩陣和圖像的可視化表示,文中的示例代碼講解詳細,具有一定的學習價值,感興趣的小伙伴可以了解下 2024-03-03
如何使用 Pylint 來規范 Python 代碼風格(來自IBM)
本文通過詳細的理論介紹和簡單易懂的實例全面介紹了 Python 代碼分析工具 Pylint。相信讀者看完后一定可以輕松地將 Pylint 運用到自己的開發工程中 2018-04-04
pytorch 中forward 的用法與解釋說明
這篇文章主要介紹了pytorch 中forward 的用法與解釋說明,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧 2021-02-02
Python使用windows設置定時執行腳本
這篇文章主要介紹了Python使用windows設置定時執行腳本,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友可以參考下 2020-11-11
python中字符串比較使用is、==和cmp()總結
在Python中比較字符串最好是使用簡單邏輯操作符,今天為大家講解一下is、==和cmp()使用總結 2018-03-03