A Python example of automatically generating a summary from an article's title and content
Updated: 2019-02-21 10:16:08 · Author: 周小董
This post shares a worked example of using Python to generate a summary from an article's title and body text automatically. It should make a useful reference; follow along below.
text.py
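Both methods pull their sample data from this local module, which the post references but does not reproduce. A minimal sketch of what it needs to contain, given the `from text import title, text` imports used below (the contents shown are placeholders, not the author's data):

# text.py -- minimal placeholder sketch; the variable names are fixed by the imports below
title = '示例文章標題'   # the article headline, a plain string
text = (
    '第一句示例正文。'
    '第二句示例正文,句子以中文標點結尾,便于分句。'
)  # the article body; cutSentence below splits on 。!?】;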
Method 1: generate a summary from the title and content

This is a small extractive summarizer: it splits the body into sentences, takes the union of the top term-frequency keywords from the body and the keywords from the title, scores each sentence by squared keyword density, then returns the highest-scoring sentences in their original order, with some post-processing cleanup.
# -*- coding: utf-8 -*-
import jieba, copy, re, codecs
from collections import Counter
from text import title, text

class Summary():
    #**** split the text into sentences ****
    def cutSentence(self, text):
        sents = []
        text = re.sub(r'\n+', '。', text)   # turn line breaks into periods (headings often lack one)
        text = text.replace('。。', '。')    # collapse the doubled periods that creates
        text = text.replace('?。', '。')
        text = text.replace('!。', '。')
        sentences = re.split(r'。|!|?|】|;', text)  # split on sentence-ending punctuation
        sentences = sentences[:-1]           # drop the empty string after the final period
        for sent in sentences:
            if len(sent) < 4:                # skip stray line breaks, one-character fragments, etc.
                continue
            sent = sent.strip(' ')
            sent = sent.lstrip('【')
            sents.append(sent)
        return sents

    #**** extract keyword features ****
    def getKeywords(self, title, sentences, n=10):
        words = []
        # segment every sentence and collect the tokens
        for sentence in sentences:
            for token in jieba.cut(sentence):
                words.append(token)
        # term frequency
        c = Counter(words)
        # remove stopwords (done after counting, for efficiency)
        self.delStopwords(c)
        # keyword features from the title (full-mode segmentation)
        words_title = [word for word in jieba.cut(title, cut_all=True)]
        self.delStopwords(words_title)
        # top N body keywords
        topN = c.most_common(n)
        words_topN = [w for w, cnt in topN if cnt > 1]         # drop words that appear only once
        words_topN = list(set(words_topN) | set(words_title))  # union of body and title keywords
        print(' '.join(words_topN))
        return words_topN

    #**** remove stopwords ****
    def delStopwords(self, container):
        stop_words = []
        with codecs.open('stopwords.txt', encoding='utf8') as sw_file:
            for line in sw_file.readlines():
                stop_words.append(line.strip())
        # the argument is either a plain list of words ...
        if type(container) is list:
            words = container
            for word in list(words):   # iterate over a copy: removing while iterating skips items
                if word in stop_words:
                    words.remove(word)
        # ... or a collections.Counter
        else:
            words = copy.deepcopy(list(container.keys()))
            for word in words:
                if word in stop_words:
                    del container[word]
        return words

    #**** pick the top N sentences ****
    def getTopNSentences(self, sentences, keywords, n=3):
        sents_score = {}
        len_sentences = len(sentences)
        # initialise the scores and compute min/avg/max sentence length
        len_avg = 0
        len_min = len(sentences[0])
        len_max = len(sentences[0])
        for sent in sentences:
            sents_score[sent] = 0
            l = len(sent)
            len_avg += l
            if len_min > l:
                len_min = l
            if len_max < l:
                len_max = l
        len_avg = len_avg / len_sentences
        # score each sentence
        for sent in sentences:
            # skip sentences that are too short or too long
            l = len(sent)
            if l < (len_min + len_avg) / 2 or l > (3 * len_max - 2 * len_avg) / 4:
                continue
            words = []
            for token in jieba.cut(sent):
                words.append(token)
            len_sent = len(words)
            if len_sent == 0:
                continue
            keywords_cnt = 0
            for word in words:
                if word in keywords:
                    keywords_cnt += 1
            # density-style score: (keyword hits)^2 / sentence length
            score = keywords_cnt * keywords_cnt * 1.0 / len_sent
            sents_score[sent] = score
            if sentences.index(sent) == 0:   # boost the opening sentence
                sents_score[sent] = 2 * score
        # sort by score, descending
        dict_list = sorted(sents_score.items(), key=lambda x: x[1], reverse=True)
        # take the top N
        sents_topN = [item[0] for item in dict_list[:n]]
        sents_topN = list(set(sents_topN))
        # shorter articles get shorter summaries
        if len_sentences <= 5:
            sents_topN = sents_topN[:1]
        elif len_sentences < 9:
            sents_topN = sents_topN[:2]
        return sents_topN

    #**** restore the original in-text order of the top N sentences ****
    def sents_sort(self, sents_topN, sentences):
        keysents = []
        for sent in sentences:
            if sent in sents_topN and sent not in keysents:
                keysents.append(sent)
        keysents = self.post_processing(keysents)
        return keysents

    def post_processing(self, keysents):
        # cut off the enumerated detail part of incomplete sentences
        detail_tags = [',一是', ':一是', ',第一,', ':第一,', ',首先,', ';首先,']
        for i in keysents:
            for tag in detail_tags:
                index = i.find(tag)
                if index != -1:
                    keysents[keysents.index(i)] = i[:index]
        # strip leading list numbering
        for i in keysents:
            regex = re.compile(r'^一、|^二、|^三、|^四、|^五、|^六、|^七、|^八、|^九、|^十、|^\d{1,2}、|^\d{1,2} ')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        # drop footnote-style sentences
        for i in list(keysents):   # iterate over a copy, since items are removed
            regex = re.compile(r'^注\d*:')
            if re.findall(regex, i):
                keysents.remove(i)
        # strip bracketed content at the start of a sentence
        for i in keysents:
            regex = re.compile(r'^\[.*\]')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        # strip the source attribution (the part before the first space)
        for i in keysents:
            regex = re.compile(r'^.{1,20} ')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        # strip trailing lead-in clauses (e.g. "銀行間債市小幅下跌,見下圖:")
        for i in keysents:
            regex = re.compile(r',[^,]+:$')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        return keysents

    def main(self, title, text):
        sentences = self.cutSentence(text)
        keywords = self.getKeywords(title, sentences, n=8)
        sents_topN = self.getTopNSentences(sentences, keywords, n=3)
        keysents = self.sents_sort(sents_topN, sentences)
        print(keysents)
        return keysents

if __name__ == '__main__':
    summary = Summary()
    summary.main(title, text)
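One dependency worth noting: delStopwords reads stopwords.txt from the working directory, one stopword per line (each line is strip()-ped), and the post does not ship this file. Any standard Chinese stopword list will do; here is a tiny hand-rolled placeholder, just to make the script runnable:

# hypothetical helper that writes a minimal stopwords.txt (one word per line);
# in practice, substitute a full Chinese stopword list
words = ['的', '了', '是', '在', '和', '也', '就']
with open('stopwords.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(words))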
Method 2: generate a summary from the content alone, using the extractive summarizer built into HanLP via the pyhanlp bridge
import pyhanlp
from text import text

# ask HanLP for the 3 highest-ranked sentences of the body text
summary = pyhanlp.HanLP.extractSummary(text, 3)
print(summary)
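Since pyhanlp bridges to the Java HanLP library through the JVM, extractSummary hands back a Java list of sentence strings rather than a native Python list, so print shows it in Java's bracketed form. A hedged follow-up, assuming you want plain Python strings joined back into one summary:

# convert the Java list returned by HanLP into Python strings and join them
sentences = [str(s) for s in summary]
print('。'.join(sentences) + '。')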
That is the whole of this example of automatically generating a summary from an article's title and content in Python. I hope it gives you a useful reference, and thank you for supporting 腳本之家.