基于Python實現新聞爬取系統
新聞爬取系統(tǒng)
信息展示:tkinter
爬取及請求:requests、BeautifulSoup
設置新聞列表API
打開騰訊新聞網頁 -> 鼠標右鍵檢查/鍵盤F12鍵 -> 網絡 -> 刷新一下頁面
然后右鍵復制鏈接地址即是
程序運行效果
文件寫入內容
參考coding部分-兩個文件
注意設置本地文件路徑!!!!
數據爬取文件Myspider_news.py
import requests
from bs4 import BeautifulSoup


class MySpider:
    """Download a news-list JSON feed and individual news article pages.

    The most recently parsed list is kept in ``self.ulist`` as
    ``[title, publish_time, url]`` entries so it can be written to disk
    with :meth:`writeFile`.
    """

    def __init__(self):
        # Entries parsed by the last getJSONText() call.
        self.ulist = []

    def getResponse(self, url):
        """Fetch *url* with a browser-like User-Agent and return the response.

        Raises:
            requests.HTTPError: if the server answers with an error status
                (via ``raise_for_status``).
        """
        headers = {
            'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                          'Chrome/111.0.0.0 Safari/537.36 Edg/111.0.1661.54'}
        r = requests.get(url, timeout=30, headers=headers)
        r.raise_for_status()
        # Let requests guess the encoding from the body rather than trusting
        # the (possibly missing) charset header.
        r.encoding = r.apparent_encoding
        return r

    def getJSONText(self, r):
        """Parse the list response *r* into ``[title, publish_time, url]`` rows.

        The rows are also stored on ``self.ulist``.

        Raises:
            KeyError: if the JSON payload lacks ``data.list`` or an item lacks
                one of ``title`` / ``publish_time`` / ``url``.
        """
        news = r.json()['data']['list']
        ulist = [[n['title'], n['publish_time'], n['url']] for n in news]
        self.ulist = ulist
        return ulist

    def writeFile(self, file='data.txt'):
        """Write ``self.ulist`` to *file*, one ``index::title::time::url`` line per entry."""
        with open(file, "w", encoding='utf-8') as f:
            f.writelines(
                f"{i}::{item[0]}::{item[1]}::{item[2]}\n"
                for i, item in enumerate(self.ulist)
            )

    def getNewsContent(self, r):
        """Extract ``(title, body_text)`` from the article page response *r*.

        The title is the text of the first ``<h1>`` (empty string when the
        page has none); the body is the text of every
        ``div#ArticleContent > p.one-p`` paragraph, each followed by a newline.
        """
        soup = BeautifulSoup(r.text, 'lxml')
        paragraphs = soup.select('div#ArticleContent>p.one-p')
        headings = soup.select("h1")
        # Pages without an <h1> used to raise IndexError here.
        title = headings[0].get_text() if headings else ""
        data = "".join(p.get_text() + "\n" for p in paragraphs)
        return title, data
窗口展示文件MySpiderGui_news.py
from tkinter import *
from tkinter import messagebox
from Myspider_news import *


# NOTE(review): several pre-existing UI string literals below look like they
# carry text-conversion artifacts (pinyin in parentheses). They are kept
# byte-for-byte here; confirm the intended wording before changing them.
class MySpiderGUI_news:
    """Tkinter front end for MySpider.

    The main window lets the user fetch a news list (``btOK``), clear the
    listing (``btCancel``) and open one article by its list index
    (``btNews``). The fetched list is saved to a text file that
    ``displayNews`` reads back to find the article URL.
    """

    FULLWIDTH_SPACE = chr(12288)  # ideographic space, used as fill/pad character
    MAX_NEWS = 20                 # upper bound accepted for the requested row count
    TITLE_WIDTH = 15              # titles are cut/padded to this many characters
    ERROR_TITLE = "錯(cuò)誤"
    # Placeholder path from the original script: replace with a real local path.
    DEFAULT_DATA_FILE = r"G:\(你本地的文件路徑)test-file\data.txt"

    def __init__(self, data_file=DEFAULT_DATA_FILE):
        """Build the main window and run the Tk main loop (blocks until closed).

        Args:
            data_file: path the fetched news list is written to.
        """
        self.data_file = data_file
        # Set to ``data_file`` only once a list has actually been saved.
        self.file = ""

        self.window = Tk()
        self.window.title("新聞爬取")
        Label(self.window, text="騰訊新聞", font=("黑體", 26, 'bold')).pack()

        f1 = Frame(self.window)
        f1.pack(fill="both")
        Label(f1, text="請輸入網(wǎng)址:", font=('黑體', 12)).pack(side="left")
        self.url = StringVar()
        self.url.set("https://i.news.qq.com/trpc.qqnews_web.kv_srv.kv_srv_http_proxy/"
                     "list?sub_srv_id=24hours&srv_id=pc&offset=0&limit=20&strategy=1&ext="
                     "{%22pool%22:[%22top%22,%22hot%22],%22is_filter%22:7,%22check_type%22:true}")
        Entry(f1, textvariable=self.url).pack(side="left", fill="x", expand=1)
        # This caption was previously created as an Entry, which cannot show
        # static text; it has to be a Label.
        Label(f1, text="顯示數(shù)量: ", font=('黑體', 12)).pack(side="left")
        self.num = IntVar()
        Entry(f1, textvariable=self.num).pack(side="left")
        Button(f1, text="確定", command=self.btOK, padx=10).pack(side="left")
        Button(f1, text="清空", command=self.btCancel, padx=10).pack(side="left")

        f2 = Frame(self.window)
        f2.pack(fill="both", expand=1)
        self.text = self._scrolled_text(f2, width=60)
        Label(f2, text="新聞id :", font=('黑體', 12)).pack(side="left")
        self.news_id = IntVar()
        Entry(f2, textvariable=self.news_id).pack(side="left")
        Button(f2, text="顯示新聞", command=self.btNews, padx=10).pack(side="left")

        self.window.mainloop()

    @staticmethod
    def _scrolled_text(parent, **options):
        """Create, pack and return a non-wrapping Text with both scrollbars in *parent*."""
        xbar = Scrollbar(parent, orient="horizontal")
        ybar = Scrollbar(parent, orient="vertical")
        xbar.pack(side="bottom", fill=X)
        ybar.pack(side="right", fill=Y)
        text = Text(parent, wrap="none", xscrollcommand=xbar.set,
                    yscrollcommand=ybar.set, **options)
        # Bind the scrollbars to the widget created here, never to another one.
        xbar.config(command=text.xview)
        ybar.config(command=text.yview)
        text.pack(fill="both", expand=1)
        return text

    @staticmethod
    def _read_int(var):
        """Return the integer held by the Tk variable *var*, or None if it is not a number."""
        try:
            return var.get()
        except (TclError, ValueError):
            return None

    def _format_row(self, index, title, publish_time):
        """Format one listing line: index, fixed-width full-width title, date."""
        if len(title) > self.TITLE_WIDTH:
            cell = self.strB2Q(title[:self.TITLE_WIDTH]) + "..."
        else:
            cell = self.strB2Q(title)
            # The ellipsis is appended here as well (as in the original),
            # presumably so every title cell has the same width.
            cell = cell + "..." + self.FULLWIDTH_SPACE * (self.TITLE_WIDTH - len(cell))
        # Keep only the first 10 characters of the timestamp.
        return "{0:^5} {1:^18} {2:<10}".format(str(index), cell, str(publish_time)[:10])

    def btOK(self):
        """Fetch the news list, save it to ``data_file`` and list the requested rows."""
        self.text.delete("1.0", END)
        tplt = "{0:^5} {1:{3}^18} {2:<10}"
        self.text.insert('end', tplt.format("序 號", "新 聞", "時(shí) 間", self.FULLWIDTH_SPACE))
        self.text.insert('end', '\n')

        wanted = self._read_int(self.num)
        if wanted is None:
            messagebox.showerror(self.ERROR_TITLE, "請輸入有效的整數")
            return
        if wanted > self.MAX_NEWS:
            messagebox.showerror(self.ERROR_TITLE, "輸入的新聞數(shù)太多啦")
            return

        messagebox.showinfo("提示", "開始爬取。。。")
        spider = MySpider()
        try:
            ulist = spider.getJSONText(spider.getResponse(self.url.get()))
        except Exception as ex:
            # UI boundary: network, HTTP and JSON-shape errors all end up here.
            print("程序出錯(cuò):", ex)
            messagebox.showerror(self.ERROR_TITLE, str(ex))
            return

        try:
            spider.writeFile(self.data_file)
        except OSError as ex:
            # Saving is best effort: the list is still displayed, but articles
            # cannot be opened until the file can be written.
            print("程序出錯(cuò):", ex)
        else:
            self.file = self.data_file

        # Never index past the rows actually received.
        shown = max(0, min(wanted, len(ulist)))
        for i in range(shown):
            title, publish_time = ulist[i][0], ulist[i][1]
            self.text.insert('end', self._format_row(i, title, publish_time))
            self.text.insert('end', "\n")
        self.text.insert('end', "\n")
        self.text.insert('end', "共有記錄" + str(shown) + "條")
        self.text.insert('end', '\n')

    def btCancel(self):
        """Reset the requested count and clear the listing back to its header."""
        self.num.set(0)
        self.text.delete("1.0", END)
        tplt = "{0:^2} {1:{3}^18} {2:<10}"
        # Fill with the full-width space (12288); the original used 1288.
        self.text.insert("end", tplt.format("序號", "新聞", "時(shí)間", self.FULLWIDTH_SPACE))
        self.text.insert('end', '\n')

    def btNews(self):
        """Open a child window and show the article selected by ``news_id``."""
        # A Toplevel shares the running main loop; a second Tk() root with its
        # own mainloop() is not needed.
        root = Toplevel(self.window)
        root.title("顯示新聞")
        self.lbltitle = Label(root, text=" ", font=('黑體', 22, 'bold'))
        self.lbltitle.pack()
        f1 = Frame(root)
        f1.pack(fill="both", expand=1)
        self.news_text = self._scrolled_text(f1, width=60, height=10)
        Button(f1, text="關(guān)閉窗口", command=root.destroy, padx=10).pack()
        self.displayNews()

    def _lookup_news_url(self):
        """Return the URL stored on line ``news_id`` of the saved list, or None after reporting why not."""
        index = self._read_int(self.news_id)
        if index is None:
            messagebox.showerror(self.ERROR_TITLE, "請輸入有效的整數")
            return None
        if not self.file:
            messagebox.showerror(self.ERROR_TITLE, "請先點擊確定爬取新聞列表")
            return None
        try:
            with open(self.file, "r", encoding='utf-8') as f:
                lines = f.readlines()
        except OSError as ex:
            messagebox.showerror(self.ERROR_TITLE, str(ex))
            return None
        if not 0 <= index < len(lines):
            messagebox.showerror(self.ERROR_TITLE, "新聞id超出範圍")
            return None
        # Each line is "index::title::time::url"; the URL is the last field, so
        # split from the right (titles may contain "::") and drop the newline.
        return lines[index].rsplit("::", 1)[-1].strip()

    def displayNews(self):
        """Download the selected article and show its title and text in the news window."""
        news_url = self._lookup_news_url()
        if news_url is None:
            return
        title = ""
        content = ""
        newsSpider = MySpider()
        try:
            r = newsSpider.getResponse(news_url)
            title, content = newsSpider.getNewsContent(r)
        except Exception as ex:
            # UI boundary: keep the window usable and show empty fields.
            print("程序出錯(cuò): ", ex)
        self.lbltitle["text"] = title
        self.news_text.insert('end', "標(biāo)題: " + title)
        self.news_text.insert('end', "\n")
        self.news_text.insert('end', "內(nèi)容: ")
        self.news_text.insert('end', content)
        self.news_text.insert('end', "\n")

    def strB2Q(self, ustring):
        """Convert half-width ASCII characters in *ustring* to their full-width forms.

        A space (32) becomes the ideographic space (12288); every other
        printable ASCII character (33..126) is shifted by 65248. All other
        characters are returned unchanged.
        """
        converted = []
        for uchar in ustring:
            inside_code = ord(uchar)
            if inside_code == 32:
                inside_code = 12288
            elif 33 <= inside_code <= 126:
                inside_code += 65248
            converted.append(chr(inside_code))
        return "".join(converted)


if __name__ == "__main__":
    MySpiderGUI_news()
注意設置本地文件路徑!!!!
以上就是基于Python實現新聞爬取系統的詳細內容,更多關于Python新聞爬取的資料請關注腳本之家其它相關文章!
相關(guān)文章
Keras使用ImageNet上預訓練的模型方式
這篇文章主要介紹了Keras使用ImageNet上預訓練的模型方式,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧
2020-05-05
Python Pandas 對列/行進行選擇,增加,刪除操作
這篇文章主要介紹了Python Pandas 對列/行進行選擇,增加,刪除操作,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧
2020-05-05
Jupyter notebook 不自動彈出網頁的解決方案
這篇文章主要介紹了Jupyter notebook 不自動彈出網頁的解決方案,具有很好的參考價值,希望對大家有所幫助。如有錯誤或未考慮完全的地方,望不吝賜教
2021-05-05
python3 selenium自動化 下拉框定位的例子
今天小編就為大家分享一篇python3 selenium自動化 下拉框定位的例子,具有很好的參考價值,希望對大家有所幫助。一起跟隨小編過來看看吧
2019-08-08
python實現圖像的隨機增強變換
這篇文章主要為大家介紹了如何利用python制作一個小工具,可以實現圖像的隨機增強變換,可用于分類訓練數據的增強,有需要的可以參考下
2024-11-11
Python實現曲線的肘部點檢測詳解
肘部法則是經常使用的法則。很多時候,可以憑人工經驗去找最優拐點,但有時需要自動尋找拐點。本文為大家介紹了Python實現曲線的肘部點檢測的方法,希望對大家有所幫助
2023-02-02