A Python example of automatically generating a summary from an article's title and content
Updated: 2019-02-21 10:16:08 · Author: 周小董
This post shares a worked example of using Python to generate a summary from an article's title and body text automatically. It should make a useful reference; follow along below.
text.py
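Both methods pull their sample data from this local module, which the post references but does not reproduce. A minimal sketch of what it needs to contain, given the `from text import title, text` imports used below (the contents shown are placeholders, not the author's data):

# text.py -- minimal placeholder sketch; the variable names are fixed by the imports below
title = '示例文章標題'   # the article headline, a plain string
text = (
    '第一句示例正文。'
    '第二句示例正文,句子以中文標點結尾,便于分句。'
)  # the article body; cutSentence below splits on 。!?】;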
Method 1: generate a summary from the title and content

This is a small extractive summarizer: it splits the body into sentences, takes the union of the top term-frequency keywords from the body and the keywords from the title, scores each sentence by squared keyword density, then returns the highest-scoring sentences in their original order, with some post-processing cleanup.
# -*- coding: utf-8 -*-
import jieba, copy, re, codecs
from collections import Counter
from text import title, text

class Summary():
    #**** split the text into sentences ****
    def cutSentence(self, text):
        sents = []
        text = re.sub(r'\n+', '。', text)   # turn line breaks into periods (headings often lack one)
        text = text.replace('。。', '。')    # collapse the doubled periods that creates
        text = text.replace('?。', '。')
        text = text.replace('!。', '。')
        sentences = re.split(r'。|!|?|】|;', text)  # split on sentence-ending punctuation
        sentences = sentences[:-1]           # drop the empty string after the final period
        for sent in sentences:
            if len(sent) < 4:                # skip stray line breaks, one-character fragments, etc.
                continue
            sent = sent.strip(' ')
            sent = sent.lstrip('【')
            sents.append(sent)
        return sents

    #**** extract keyword features ****
    def getKeywords(self, title, sentences, n=10):
        words = []
        # segment every sentence and collect the tokens
        for sentence in sentences:
            for token in jieba.cut(sentence):
                words.append(token)
        # term frequency
        c = Counter(words)
        # remove stopwords (done after counting, for efficiency)
        self.delStopwords(c)
        # keyword features from the title (full-mode segmentation)
        words_title = [word for word in jieba.cut(title, cut_all=True)]
        self.delStopwords(words_title)
        # top N body keywords
        topN = c.most_common(n)
        words_topN = [w for w, cnt in topN if cnt > 1]         # drop words that appear only once
        words_topN = list(set(words_topN) | set(words_title))  # union of body and title keywords
        print(' '.join(words_topN))
        return words_topN

    #**** remove stopwords ****
    def delStopwords(self, container):
        stop_words = []
        with codecs.open('stopwords.txt', encoding='utf8') as sw_file:
            for line in sw_file.readlines():
                stop_words.append(line.strip())
        # the argument is either a plain list of words ...
        if type(container) is list:
            words = container
            for word in list(words):   # iterate over a copy: removing while iterating skips items
                if word in stop_words:
                    words.remove(word)
        # ... or a collections.Counter
        else:
            words = copy.deepcopy(list(container.keys()))
            for word in words:
                if word in stop_words:
                    del container[word]
        return words

    #**** pick the top N sentences ****
    def getTopNSentences(self, sentences, keywords, n=3):
        sents_score = {}
        len_sentences = len(sentences)
        # initialise the scores and compute min/avg/max sentence length
        len_avg = 0
        len_min = len(sentences[0])
        len_max = len(sentences[0])
        for sent in sentences:
            sents_score[sent] = 0
            l = len(sent)
            len_avg += l
            if len_min > l:
                len_min = l
            if len_max < l:
                len_max = l
        len_avg = len_avg / len_sentences
        # score each sentence
        for sent in sentences:
            # skip sentences that are too short or too long
            l = len(sent)
            if l < (len_min + len_avg) / 2 or l > (3 * len_max - 2 * len_avg) / 4:
                continue
            words = []
            for token in jieba.cut(sent):
                words.append(token)
            len_sent = len(words)
            if len_sent == 0:
                continue
            keywords_cnt = 0
            for word in words:
                if word in keywords:
                    keywords_cnt += 1
            # density-style score: (keyword hits)^2 / sentence length
            score = keywords_cnt * keywords_cnt * 1.0 / len_sent
            sents_score[sent] = score
            if sentences.index(sent) == 0:   # boost the opening sentence
                sents_score[sent] = 2 * score
        # sort by score, descending
        dict_list = sorted(sents_score.items(), key=lambda x: x[1], reverse=True)
        # take the top N
        sents_topN = [item[0] for item in dict_list[:n]]
        sents_topN = list(set(sents_topN))
        # shorter articles get shorter summaries
        if len_sentences <= 5:
            sents_topN = sents_topN[:1]
        elif len_sentences < 9:
            sents_topN = sents_topN[:2]
        return sents_topN

    #**** restore the original in-text order of the top N sentences ****
    def sents_sort(self, sents_topN, sentences):
        keysents = []
        for sent in sentences:
            if sent in sents_topN and sent not in keysents:
                keysents.append(sent)
        keysents = self.post_processing(keysents)
        return keysents

    def post_processing(self, keysents):
        # cut off the enumerated detail part of incomplete sentences
        detail_tags = [',一是', ':一是', ',第一,', ':第一,', ',首先,', ';首先,']
        for i in keysents:
            for tag in detail_tags:
                index = i.find(tag)
                if index != -1:
                    keysents[keysents.index(i)] = i[:index]
        # strip leading list numbering
        for i in keysents:
            regex = re.compile(r'^一、|^二、|^三、|^四、|^五、|^六、|^七、|^八、|^九、|^十、|^\d{1,2}、|^\d{1,2} ')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        # drop footnote-style sentences
        for i in list(keysents):   # iterate over a copy, since items are removed
            regex = re.compile(r'^注\d*:')
            if re.findall(regex, i):
                keysents.remove(i)
        # strip bracketed content at the start of a sentence
        for i in keysents:
            regex = re.compile(r'^\[.*\]')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        # strip the source attribution (the part before the first space)
        for i in keysents:
            regex = re.compile(r'^.{1,20} ')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        # strip trailing lead-in clauses (e.g. "銀行間債市小幅下跌,見下圖:")
        for i in keysents:
            regex = re.compile(r',[^,]+:$')
            if re.findall(regex, i):
                keysents[keysents.index(i)] = re.sub(regex, '', i)
        return keysents

    def main(self, title, text):
        sentences = self.cutSentence(text)
        keywords = self.getKeywords(title, sentences, n=8)
        sents_topN = self.getTopNSentences(sentences, keywords, n=3)
        keysents = self.sents_sort(sents_topN, sentences)
        print(keysents)
        return keysents

if __name__ == '__main__':
    summary = Summary()
    summary.main(title, text)
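One dependency worth noting: delStopwords reads stopwords.txt from the working directory, one stopword per line (each line is strip()-ped), and the post does not ship this file. Any standard Chinese stopword list will do; here is a tiny hand-rolled placeholder, just to make the script runnable:

# hypothetical helper that writes a minimal stopwords.txt (one word per line);
# in practice, substitute a full Chinese stopword list
words = ['的', '了', '是', '在', '和', '也', '就']
with open('stopwords.txt', 'w', encoding='utf8') as f:
    f.write('\n'.join(words))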
Method 2: generate a summary from the content alone, using the extractive summarizer built into HanLP via the pyhanlp bridge
import pyhanlp
from text import text

# ask HanLP for the 3 highest-ranked sentences of the body text
summary = pyhanlp.HanLP.extractSummary(text, 3)
print(summary)
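Since pyhanlp bridges to the Java HanLP library through the JVM, extractSummary hands back a Java list of sentence strings rather than a native Python list, so print shows it in Java's bracketed form. A hedged follow-up, assuming you want plain Python strings joined back into one summary:

# convert the Java list returned by HanLP into Python strings and join them
sentences = [str(s) for s in summary]
print('。'.join(sentences) + '。')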
That is the whole of this example of automatically generating a summary from an article's title and content in Python. I hope it gives you a useful reference, and thank you for supporting 腳本之家.