#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Get a summary of the TEXT-format document"""
def get_summary(text, count):
  u"""Return the first `count` characters of the plain-text `text`.

  Args:
    text: Text to summarize. Must be a unicode string (`unicode` on
      Python 2, `str` on Python 3).
    count: Number of leading characters to keep. It is used directly as
      the upper bound of a slice.

  Returns:
    The leading `count` characters of `text`, or all of `text` when it
    is shorter than `count`.

  Raises:
    AssertionError: If `text` is not a unicode string.

    >>> text = u'Welcome 這是一篇關于Python的文章'
    >>> get_summary(text, 12) == u'Welcome 這是一篇'
    True
  """
  # Python 3 has no `unicode` builtin; its `str` is the unicode text type.
  try:
    text_type = unicode
  except NameError:
    text_type = str
  assert isinstance(text, text_type), 'text must be a unicode string'
  return text[0:count]
if __name__ == '__main__':
  # Executed as a script: run the doctests embedded in this module.
  from doctest import testmod
  testmod()

三、HTML摘要

HTML文檔中包含大量標記符（如<h1>、<p>、<a>等等），這些字符都是標記指令，并且通常是成對出現的，簡單的文本截取會破壞HTML的文檔結構，進而導致摘要在瀏覽器中顯示不當。

在遵循HTML文檔結構的同時，又要對內容進行截取，就需要解析HTML文檔。在Python中，可以借助標準庫 HTMLParser 來完成。

一個最簡單的摘要提取功能，是忽略HTML標記符而只提取標記內部的原生文本。以下就是類似該功能的Python實現：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Get a raw summary of the HTML-format document"""
try:
  from HTMLParser import HTMLParser  # Python 2
except ImportError:
  from html.parser import HTMLParser  # Python 3
class SummaryHTMLParser(HTMLParser):
  """Collect a length-limited plain-text summary while parsing HTML.

  Markup is discarded; only character data is kept, with all whitespace
  removed, until `count` characters have been gathered.

    >>> text = u'<p>Hi guys:</p><p>This is a example using SummaryHTMLParser.</p>'
    >>> parser = SummaryHTMLParser(10)
    >>> parser.feed(text)
    >>> parser.get_summary(u'...') == u'<p>Higuys:Thi...</p>'
    True
  """

  # Text type accepted by `feed`: `unicode` on Python 2, `str` on Python 3
  # (which has no `unicode` builtin).
  try:
    _text_type = unicode
  except NameError:
    _text_type = str

  def __init__(self, count):
    """Create a parser that keeps at most `count` characters of text."""
    HTMLParser.__init__(self)
    self.count = count  # maximum summary length, in characters
    self.summary = u''  # raw (unescaped) text collected so far

  def feed(self, data):
    """Parse `data`, which must be a unicode string.

    Raises:
      AssertionError: If `data` is not a unicode string.
    """
    assert isinstance(data, self._text_type), 'data must be a unicode string'
    HTMLParser.feed(self, data)

  def handle_data(self, data):
    """Append character data to the summary until `count` is reached."""
    more = self.count - len(self.summary)
    if more > 0:
      # Whitespace is removed entirely, so it never counts toward the limit.
      data_without_whitespace = u''.join(data.split())
      self.summary += data_without_whitespace[0:more]

  def get_summary(self, suffix=u'', wrapper=u'p'):
    """Return the collected summary as an HTML fragment.

    Args:
      suffix: Text appended after the summary (e.g. an ellipsis). It is
        inserted verbatim.
      wrapper: Name of the element that wraps the summary.

    Returns:
      A unicode string of the form `<wrapper>summary + suffix</wrapper>`.
    """
    # The collected summary is plain text, so characters that are special
    # in HTML are escaped before it is embedded in markup. `&` goes first
    # so the entities produced by the later replacements are not re-escaped.
    escaped = self.summary.replace(u'&', u'&amp;')
    escaped = escaped.replace(u'<', u'&lt;').replace(u'>', u'&gt;')
    return u'<{0}>{1}{2}</{0}>'.format(wrapper, escaped, suffix)
if __name__ == '__main__':
  # Executed as a script: run the doctests embedded in this module.
  from doctest import testmod
  testmod()

HTMLParser（或者 BeautifulSoup 等等）更適合完成復雜的HTML摘要提取功能，對于上述簡單的HTML摘要提取功能，其實有更簡潔的實現方案（相比 SummaryHTMLParser 而言）：

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Get a raw summary of the HTML-format document"""
import re
def get_summary(text, count, suffix=u'', wrapper=u'p'):
  """Build a raw HTML summary by stripping tags with a regular expression.

  A simpler implementation (vs `SummaryHTMLParser`).

  Args:
    text: HTML source to summarize. Must be a unicode string (`unicode`
      on Python 2, `str` on Python 3).
    count: Number of leading characters of the stripped text to keep. It
      is used directly as the upper bound of a slice.
    suffix: Text appended after the summary (e.g. an ellipsis).
    wrapper: Name of the element that wraps the summary.

  Returns:
    A unicode string of the form `<wrapper>summary + suffix</wrapper>`,
    where the summary has all tags and all whitespace removed.

  Raises:
    AssertionError: If `text` is not a unicode string.

    >>> text = u'<p>Hi guys:</p><p>This is a example using SummaryHTMLParser.</p>'
    >>> get_summary(text, 10, u'...') == u'<p>Higuys:Thi...</p>'
    True
  """
  # Python 3 has no `unicode` builtin; its `str` is the unicode text type.
  try:
    text_type = unicode
  except NameError:
    text_type = str
  assert isinstance(text, text_type), 'text must be a unicode string'
  # key difference: use regex. `(?s)` makes `.` match newlines as well, so
  # a tag whose attributes wrap onto another line is still removed whole.
  summary = re.sub(r'(?s)<.*?>', u'', text)
  summary = u''.join(summary.split())[0:count]
  return u'<{0}>{1}{2}</{0}>'.format(wrapper, summary, suffix)
if __name__ == '__main__':
  # Executed as a script: run the doctests embedded in this module.
  from doctest import testmod
  testmod()

希望本文所述對大家的Python程序設計有所幫助。

您可能感興趣的文章: