C#實現將HTML轉換成純文本的方法
更新時間:2015年07月18日 12:15:30 作者:鑒客
這篇文章主要介紹了C#實現將HTML轉換成純文本的方法,基于自定義類實現文本轉換功能,具有一定參考借鑒價值,需要的朋友可以參考下
本文實例講述了C#實現將HTML轉換成純文本的方法。分享給大家供大家參考。具體如下:
使用方法:
複製代碼 代碼如下:
// Convert the HTML held in textBox1 and show the resulting plain text in textBox2.
HtmlToText convert = new HtmlToText();
textBox2.Text = convert.Convert(textBox1.Text);
C#代碼如下:
/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// Parsing state is kept in instance fields that <see cref="Convert"/> resets on
/// every call, so a single instance must not be used by several callers at once.
/// </remarks>
class HtmlToText
{
    // Static data tables

    // Maps a lower-case tag name (closing tags carry a leading '/') to the text
    // that is written to the output in place of that tag.
    protected static Dictionary<string, string> _tags;

    // Lower-case names of elements whose entire inner content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; <c>null</c> is treated as an empty string</param>
    /// <returns>Resulting plain text, with HTML entities decoded</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(Peek()))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? Peek() : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(Peek());
                MoveAhead();
            }
        }

        // Entities are decoded last, on the assembled text
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns information about that tag.
    /// </summary>
    /// <param name="selfClosing">Set to true if a '/' was found after the tag name</param>
    /// <returns>
    /// The lower-case tag name (with a leading '/' for closing tags), "!--" for an
    /// HTML comment, or an empty string if the current character is not '&lt;'.
    /// </returns>
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // A comment ends at "-->", not at the first '>', so skip it as a unit.
            // The search starts right after '!' so that "<!-->" is also recognised.
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                int commentEnd = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
                _pos = (commentEnd < 0) ? _html.Length : commentEnd + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }
            // Invariant lower-casing keeps table lookups independent of the current culture
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                if (Peek() == '"' || Peek() == '\'')
                {
                    EatQuotedValue();
                }
                else
                {
                    if (Peek() == '/')
                    {
                        selfClosing = true;
                    }
                    MoveAhead();
                }
            }

            // Skip the closing '>'
            MoveAhead();
        }
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose opening tag has just been
    /// parsed, up to and including its matching closing tag (or the end of input
    /// if there is none).
    /// </summary>
    /// <param name="tag">Lower-case name of the element being skipped</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (IsRawTextTag(tag))
        {
            // Script and style bodies are not markup: a bare '<' in them must not be
            // parsed as a tag, so look directly for the closing tag text instead.
            while (!EndOfText)
            {
                int close = _html.IndexOf("<" + endTag, _pos, StringComparison.OrdinalIgnoreCase);
                if (close < 0)
                {
                    _pos = _html.Length;
                    return;
                }

                _pos = close;
                bool ignored;
                if (ParseTag(out ignored) == endTag)
                {
                    return;
                }
                // Only a prefix matched (e.g. a longer tag name); keep searching
            }
            return;
        }

        // Only nested elements of the same name affect where this element ends
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() == '<')
            {
                // Consume a tag
                bool selfClosing;
                string inner = ParseTag(out selfClosing);

                if (inner == endTag)
                {
                    depth--;
                    if (depth == 0)
                    {
                        return;
                    }
                }
                else if (inner == tag)
                {
                    if (!selfClosing)
                    {
                        depth++;
                    }
                }
                else if (IsRawTextTag(inner))
                {
                    // Skip embedded script/style bodies without parsing them as markup
                    EatInnerContent(inner);
                }
            }
            else
            {
                MoveAhead();
            }
        }
    }

    // Returns true for elements whose content is raw text rather than markup
    private static bool IsRawTextTag(string tag)
    {
        return tag == "script" || tag == "style";
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ((char)0 when positioned at the end of the input)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value. The value also ends
    // at a line break, so an unterminated quote stops at the end of its line.
    protected void EatQuotedValue()
    {
        char c = Peek();
        if (c == '"' || c == '\'')
        {
            // Opening quote
            MoveAhead();

            // Find end of value
            _pos = _html.IndexOfAny(new char[] { c, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead(); // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;      // Completed output
        private StringBuilder _currLine;  // Line being assembled (non-preformatted mode)
        private int _emptyLines;          // Number of consecutive empty lines flushed
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get { return _preformatted; }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">String to write, one character at a time</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
            }
            else if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character (collapses runs of whitespace)
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            // After trimming, a line is empty exactly when it held no non-space characters
            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none at the very start
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}
希望本文所述對大家的C#程序設計有所幫助。
您可能感興趣的文章:
- ASP.net(c#) 生成html的幾種解決方案[思路]
- C#將html table 導出成excel實例
- C#下解析HTML的兩種方法介紹
- 使用C#獲取網頁HTML源碼的例子
- asp.net(C#) 動態添加非ASP的標準html控件(如添加Script標簽)
- C#導出生成excel文件的方法小結(xml,html方式)
- c#中過濾html的正則表達式
- C#正則表達式匹配HTML中的圖片路徑,圖片地址代碼
- C#實現下載網頁HTML源碼的方法
- C#獲取HTML文本的第一張圖片與截取內容摘要示例代碼
相關(guān)文章
詳細介紹C#之文件校驗工具的開發及問題
目前校驗文件使用最多的是MD值和SHA值,不外乎有些使用CRC,前段時間微軟發布了VisualStudio正式版,win鏡像,微軟官方給出的校驗方式都是校驗文件的SHA值。下面詳細介紹C#之文件校驗工具的開發及問題,需要的朋友可以參考下 2015-07-07 C#設計模式之建造者模式生成器模式示例詳解
這篇文章主要為大家介紹了C#設計模式之建造者模式生成器模式示例詳解,有需要的朋友可以借鑒參考下,希望能夠有所幫助,祝大家多多進步,早日升職加薪 2022-08-08 巧用Dictionary實現日志數據批量插入
這篇文章主要介紹了巧用Dictionary實現日志數據批量插入,本文給大家介紹的非常詳細,對大家的學習或工作具有一定的參考借鑒價值,需要的朋友可以參考下 2021-02-02 .NET/C# 使用Stopwatch測量運行時間
這篇文章主要介紹了.NET/C# 使用Stopwatch測量運行時間,文中通過示例代碼介紹的非常詳細,對大家的學習或者工作具有一定的參考學習價值,需要的朋友們下面隨著小編來一起學習學習吧 2020-01-01 C#實現讀取指定盤符硬盤序列號的方法
這篇文章主要介紹了C#實現讀取指定盤符硬盤序列號的方法,涉及C#針對硬件屬性的相關操作技巧,具有一定參考借鑒價值,需要的朋友可以參考下 2016-08-08