C#自寫的一個HTML解析類(類似XElement語法)
功能:
1、輕鬆獲取指定的HTML元素。
2、可以根據屬性、標簽進行篩選
3、返回的都是List強類型,無需轉換
用過XElement的都知道 用來解析XML非常的方便,但是對于HTML的格式多樣化實在是沒辦法兼容。
所以我就寫了這麼一個類似XElement的 XHtmlElement
用法:
// Read the HTML document from disk.
string htmlPath = Server.MapPath("~/file/test.htm");
string htmlSource = FileHelper.FileToString(htmlPath);
XHtmlElement document = new XHtmlElement(htmlSource);

// Direct <a> children of <body> whose class attribute equals "icon".
var bodyAnchors = document.Descendants("body").ChildDescendants("a");
var iconLinks = bodyAnchors
    .Where(anchor => anchor.Attributes.Any(attr => attr.Key == "class" && attr.Value == "icon"))
    .ToList();

// <a> elements that carry an href attribute.
var hrefLinks = document.Descendants("a")
    .Where(anchor => anchor.Attributes.ContainsKey("href"))
    .ToList();
// Write every href value to the response.
hrefLinks.ForEach(anchor => Response.Write(anchor.Attributes["href"]));

// The nearest level of the document that contains <img> elements.
var images = document.Descendants("img");
// The nearest <p> element together with the other <p> elements on its level.
var paragraphs = document.Descendants("p");
代碼:
using System;
using System.Collections.Generic;
using System.Linq;
using System.Web;
using System.Text;
using System.Text.RegularExpressions;
namespace SyntacticSugar
{
/// <summary>
/// Regex-based HTML fragment parser with an XElement-like query API.
/// Created 2015-04-23 by sunkaixuan (qq: 610262374).
/// </summary>
/// <remarks>
/// This is a best-effort parser built on regular expressions, not a full HTML parser:
/// end tags must be written exactly as <c>&lt;/name&gt;</c>, and a <c>&gt;</c> inside an
/// attribute value ends the begin tag early.
/// </remarks>
public class XHtmlElement
{
    // Name of the first tag in a fragment: a letter followed by letters/digits (so h1..h6 work),
    // directly after '<' (optionally followed by up to five spaces) and followed by whitespace, '>' or '/'.
    private const string TagNamePattern = @"(?<=<\s{0,5})[a-zA-Z][a-zA-Z0-9]*(?=[\s>/])";

    // name="value" or name='value'; group 1 = name, group 2 = double-quoted value, group 3 = single-quoted value.
    private const string AttributePattern = @"([a-zA-Z_:][\w:.\-]*)\s*=\s*(?:""([^""]*)""|'([^']*)')";

    // Delimiters of an IE conditional comment: <!--[if ...]> ... <![endif]-->
    private const string ConditionalCommentBeginPattern = @"\s{0,10}\<\!\-\-\[if";
    private const string ConditionalCommentEndPattern = @"\[endif\]\-\-\>";

    private const string UnparsableHtmlMessage = "SORRY,您的HTML格式不能解析!!!";

    private string _html;

    /// <summary>
    /// Creates a parser over the given HTML text.
    /// </summary>
    /// <param name="html">The HTML to query.</param>
    public XHtmlElement(string html)
    {
        _html = html;
    }

    /// <summary>
    /// Returns the nearest level of elements that contains a match: the root elements are
    /// checked first; if none matches, the children of each root element are searched depth-first
    /// and the first level that yields matches is returned.
    /// </summary>
    /// <param name="elementName">Tag name to look for (case-insensitive); null matches every element.</param>
    /// <returns>The matching elements of that level; an empty list when nothing matches.</returns>
    /// <exception cref="ArgumentNullException">The HTML passed to the constructor was null.</exception>
    /// <exception cref="FormatException">The HTML could not be split into elements.</exception>
    public List<HtmlInfo> Descendants(string elementName = null)
    {
        if (_html == null)
        {
            throw new ArgumentNullException("html", "html不能為空!");
        }

        List<HtmlInfo> roots = RootDescendants(_html);
        List<HtmlInfo> matches = FilterByTagName(roots, elementName);
        if (matches.Count == 0)
        {
            matches = GetDescendantsSource(roots, elementName);
        }
        return matches;
    }

    /// <summary>
    /// Splits a fragment into its first-level elements.
    /// </summary>
    /// <param name="html">Fragment to split; null means the HTML passed to the constructor.</param>
    /// <returns>One <see cref="HtmlInfo"/> per first-level element, in the order they were extracted.</returns>
    /// <exception cref="ArgumentNullException">Both <paramref name="html"/> and the constructor HTML are null.</exception>
    /// <exception cref="FormatException">The fragment could not be split into elements.</exception>
    public List<HtmlInfo> RootDescendants(string html = null)
    {
        /*
         * Algorithm:
         * 1. Take the first tag of the fragment and search for its end tag; every nested begin tag
         *    of the same name requires one more end tag before the element is complete.
         * 2. Remove that element from the fragment and repeat for the 2nd ... Nth element.
         */
        if (html == null)
        {
            html = _html;
        }
        if (html == null)
        {
            throw new ArgumentNullException("html", "html不能為空!");
        }

        List<string> elementStrings = new List<string>();
        GetElementsStringList(html, ref elementStrings);

        List<HtmlInfo> result = new List<HtmlInfo>(elementStrings.Count);
        foreach (string element in elementStrings)
        {
            HtmlInfo info = new HtmlInfo();
            info.OldFullHtml = element;
            info.SameLeveHtml = html;
            info.TagName = Regex.Match(element, TagNamePattern).Value;
            // Everything between the first '>' and the last '<' of the element.
            info.InnerHtml = Regex.Match(element, @"(?<=\>).+(?=<)", RegexOptions.Singleline).Value;
            // Singleline so that a begin tag spread over several lines is still captured whole.
            string beginTag = Regex.Match(element, "<.+?>", RegexOptions.Singleline).Value;
            info.Attributes = ParseAttributes(beginTag);
            result.Add(info);
        }
        return result;
    }

    #region private
    /// <summary>
    /// Keeps the elements whose tag name equals <paramref name="elementName"/> (case-insensitive);
    /// a null name keeps everything.
    /// </summary>
    private static List<HtmlInfo> FilterByTagName(IEnumerable<HtmlInfo> elements, string elementName)
    {
        return elements
            .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>
    /// Reads the attributes of a begin tag. Values are taken verbatim between the quotes, so a value
    /// may contain '='; when a name is repeated the first occurrence wins.
    /// </summary>
    private static Dictionary<string, string> ParseAttributes(string beginTag)
    {
        Dictionary<string, string> attributes = new Dictionary<string, string>();
        foreach (Match attribute in Regex.Matches(beginTag, AttributePattern))
        {
            string key = attribute.Groups[1].Value;
            if (attributes.ContainsKey(key))
            {
                continue;
            }
            Group doubleQuoted = attribute.Groups[2];
            attributes.Add(key, doubleQuoted.Success ? doubleQuoted.Value : attribute.Groups[3].Value);
        }
        return attributes;
    }

    /// <summary>
    /// Depth-first search below <paramref name="allList"/> for the first level that contains
    /// elements named <paramref name="elementName"/>.
    /// </summary>
    /// <returns>The matches of that level, or an empty list (never null).</returns>
    private List<HtmlInfo> GetDescendantsSource(List<HtmlInfo> allList, string elementName)
    {
        foreach (HtmlInfo parent in allList)
        {
            // Without a '<' there is no markup inside this element to descend into.
            if (string.IsNullOrEmpty(parent.InnerHtml) || !parent.InnerHtml.Contains("<"))
            {
                continue;
            }

            List<HtmlInfo> children = RootDescendants(parent.InnerHtml);
            List<HtmlInfo> matches = FilterByTagName(children, elementName);
            if (matches.Count == 0)
            {
                matches = GetDescendantsSource(children, elementName);
            }
            if (matches.Count > 0)
            {
                return matches;
            }
        }
        return new List<HtmlInfo>();
    }

    /// <summary>
    /// Extracts the first-level elements of <paramref name="html"/> one after another and appends
    /// their markup to <paramref name="eleList"/>.
    /// </summary>
    /// <exception cref="FormatException">No element text could be extracted for a detected tag.</exception>
    private void GetElementsStringList(string html, ref List<string> eleList)
    {
        const RegexOptions tagOptions = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        while (true)
        {
            string tagName = Regex.Match(html, TagNamePattern).Value;
            if (tagName.Length == 0)
            {
                return;
            }

            // \b stops "a" from matching "<abbr ...>" and similar longer tag names.
            string currentTagBeginReg = @"<\s{0,10}" + Regex.Escape(tagName) + @"\b.*?>";
            string currentTagEndReg = @"\<\/" + Regex.Escape(tagName) + @"\>";

            // Case 1: <a/>            self-closing
            // Case 2: <a></a>         paired
            // Case 3: <a>             no end tag (void element or malformed markup)
            // Case 4: <!--[if ..]> .. <![endif]-->   conditional comment
            string firstBeginTag = Regex.Match(html, currentTagBeginReg, tagOptions).Value;
            string eleHtml;
            if (firstBeginTag.EndsWith("/>", StringComparison.Ordinal))
            {
                // Decided on the first begin tag itself, so later "/>" in the fragment cannot interfere.
                eleHtml = firstBeginTag;
            }
            else if (!Regex.IsMatch(html, currentTagEndReg, RegexOptions.IgnoreCase))
            {
                eleHtml = Regex.IsMatch(html, ConditionalCommentBeginPattern)
                    ? GetElementString(html, ConditionalCommentBeginPattern, ConditionalCommentEndPattern, 1)
                    : firstBeginTag;
            }
            else
            {
                eleHtml = GetElementString(html, currentTagBeginReg, currentTagEndReg, 1);
            }

            if (eleHtml.Length == 0)
            {
                throw new FormatException(UnparsableHtmlMessage);
            }

            eleList.Add(eleHtml);
            // Remove only this occurrence; identical siblings must each be reported.
            int start = html.IndexOf(eleHtml, StringComparison.Ordinal);
            html = html.Remove(start, eleHtml.Length);
            html = Regex.Replace(html, @"<\!DOCTYPE.*?>", "");
            if (Regex.IsMatch(html, @"^\s*$"))
            {
                return;
            }
        }
    }

    /// <summary>
    /// Returns the text from the first begin tag up to the end tag that balances it, starting with
    /// <paramref name="i"/> end tags and adding one more until begin and end tags are equal in number.
    /// </summary>
    /// <returns>The balanced element text, or an empty string when no balanced span exists.</returns>
    private string GetElementString(string html, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        const RegexOptions options = RegexOptions.IgnoreCase | RegexOptions.Singleline;

        for (int endTagCount = i; ; endTagCount++)
        {
            string candidate = GetRegNextByNum(html, currentTagBeginReg, currentTagEndReg, endTagCount);
            if (candidate.Length == 0)
            {
                // Not enough end tags left in the fragment: give up.
                return candidate;
            }

            // Self-closing begin tags need no end tag, so they are left out of the balance.
            int opened = Regex.Matches(candidate, currentTagBeginReg, options)
                .Cast<Match>()
                .Count(m => !m.Value.EndsWith("/>", StringComparison.Ordinal));
            int closed = Regex.Matches(candidate, currentTagEndReg, options).Count;
            if (opened == closed)
            {
                return candidate;
            }
        }
    }

    /// <summary>
    /// Matches the first begin tag followed by exactly <paramref name="i"/> end tags (shortest spans).
    /// </summary>
    private string GetRegNextByNum(string val, string currentTagBeginReg, string currentTagEndReg, int i)
    {
        string pattern = currentTagBeginReg + @"((.*?)" + currentTagEndReg + "){" + i + "}?";
        return Regex.Match(val, pattern, RegexOptions.IgnoreCase | RegexOptions.Singleline).Value;
    }
    #endregion
}
/// <summary>
/// Extension methods that continue a query from a list of <see cref="HtmlInfo"/> elements.
/// Every method works on the FIRST element of the list only; a null or empty list yields an empty result.
/// </summary>
public static class XHtmlElementExtendsion
{
    /// <summary>
    /// Searches inside the first element for the nearest level that contains matching elements.
    /// </summary>
    /// <param name="htmlInfoList">Elements to continue from; only the first one is used.</param>
    /// <param name="elementName">Tag name to look for; null matches every element.</param>
    /// <returns>The matching elements, or an empty list.</returns>
    public static List<HtmlInfo> Descendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (html == null)
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        // Guard against a null result so that chained calls never dereference null.
        return xhe.Descendants(elementName) ?? new List<HtmlInfo>();
    }

    /// <summary>
    /// Returns the direct children of the first element.
    /// </summary>
    /// <param name="htmlInfoList">Elements to continue from; only the first one is used.</param>
    /// <param name="elementName">Tag name to keep (case-insensitive); null keeps every child.</param>
    /// <returns>The matching direct children, or an empty list.</returns>
    public static List<HtmlInfo> ChildDescendants(this IEnumerable<HtmlInfo> htmlInfoList, string elementName = null)
    {
        string html = FirstInnerHtml(htmlInfoList);
        if (html == null)
        {
            return new List<HtmlInfo>();
        }

        XHtmlElement xhe = new XHtmlElement(html);
        return xhe.RootDescendants(html)
            .Where(c => elementName == null || string.Equals(c.TagName, elementName, StringComparison.OrdinalIgnoreCase))
            .ToList();
    }

    /// <summary>
    /// Finds the element of <paramref name="fullHtml"/> that directly encloses the level the first
    /// element belongs to.
    /// </summary>
    /// <param name="htmlInfoList">Elements to continue from; only the first one is used.</param>
    /// <param name="fullHtml">The complete document the elements were taken from.</param>
    /// <returns>The enclosing element, or an empty list when none is found.</returns>
    /// <exception cref="ArgumentNullException"><paramref name="fullHtml"/> is null.</exception>
    public static List<HtmlInfo> ParentDescendant(this IEnumerable<HtmlInfo> htmlInfoList, string fullHtml)
    {
        if (fullHtml == null)
        {
            throw new ArgumentNullException("fullHtml");
        }

        HtmlInfo first = htmlInfoList == null ? null : htmlInfoList.FirstOrDefault();
        // string.Replace rejects an empty search string, so there is nothing to locate in that case.
        if (first == null || string.IsNullOrEmpty(first.SameLeveHtml))
        {
            return new List<HtmlInfo>();
        }

        // Swap the whole sibling level for a placeholder so the parent can be matched as
        // "<begin tag> placeholder </end tag>"; format "N" has no hyphens and is regex-safe.
        string placeholder = Guid.NewGuid().ToString("N");
        string markedHtml = fullHtml.Replace(first.SameLeveHtml, placeholder);
        string parentHtml = Regex.Match(markedHtml, @"<[^<]+?>[^<]*?" + placeholder + @".*?<\/.+?>").Value;
        if (parentHtml.Length == 0)
        {
            return new List<HtmlInfo>();
        }

        parentHtml = parentHtml.Replace(placeholder, first.SameLeveHtml);
        XHtmlElement xhe = new XHtmlElement(parentHtml);
        return xhe.RootDescendants();
    }

    /// <summary>
    /// Inner HTML of the first element, or null when the list is null/empty or the element has none.
    /// </summary>
    private static string FirstInnerHtml(IEnumerable<HtmlInfo> htmlInfoList)
    {
        if (htmlInfoList == null)
        {
            return null;
        }
        HtmlInfo first = htmlInfoList.FirstOrDefault();
        return first == null ? null : first.InnerHtml;
    }
}
/// <summary>
/// Describes one parsed HTML element.
/// </summary>
public class HtmlInfo
{
    /// <summary>
    /// Tag name of the element.
    /// </summary>
    public string TagName { get; set; }

    /// <summary>
    /// Attributes of the element, keyed by attribute name.
    /// </summary>
    public Dictionary<string, string> Attributes { get; set; }

    /// <summary>
    /// Markup between the begin tag and the end tag.
    /// </summary>
    public string InnerHtml { get; set; }

    /// <summary>
    /// Markup of the element exactly as it appeared in the parsed text.
    /// </summary>
    public string OldFullHtml { get; set; }

    /// <summary>
    /// The fragment this element was extracted from, i.e. the markup of its whole sibling level.
    /// </summary>
    public string SameLeveHtml { get; set; }

    /// <summary>
    /// Rebuilds the element as <c>&lt;tag attr="value"&gt;inner&lt;/tag&gt;</c> from
    /// <see cref="TagName"/>, <see cref="Attributes"/> and <see cref="InnerHtml"/>.
    /// Attribute values are written verbatim (no HTML encoding is applied).
    /// </summary>
    public string FullHtml
    {
        get
        {
            StringBuilder html = new StringBuilder();
            html.Append('<').Append(TagName);
            if (Attributes != null)
            {
                // Each attribute brings its own leading space, so a tag without
                // attributes is rendered as "<a>" rather than "<a >".
                foreach (KeyValuePair<string, string> attribute in Attributes)
                {
                    html.Append(' ').Append(attribute.Key).Append("=\"").Append(attribute.Value).Append('"');
                }
            }
            html.Append('>').Append(InnerHtml).Append("</").Append(TagName).Append('>');
            return html.ToString();
        }
    }
}
}
前臺HTML:
<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd"> <html xmlns="http://www.w3.org/1999/xhtml"> <head> <title></title> </head> <body> <a id="1">我是1</a> <a id="2" class="icon">icon</a> <img /> </body> </html>
- c#使用htmlagilitypack解析html格式字符串
- C#抓取網頁數據 解析標題描述圖片等信息 去除HTML標簽
- c#使用nsoup解析html亂碼解決方法分享 nsoup教程
- C#下解析HTML的兩種方法介紹
- C# 使用 WebBrowser 實現 HTML 轉圖片功能的示例代碼
- 利用C#代碼將html樣式文件與Word文檔互換的方法
- C#正則過濾HTML標簽并保留指定標簽的方法
- C#基于正則表達式抓取a標簽鏈接和innerhtml的方法
- 通過C#實現發送自定義的html格式郵件
- c# 使用HtmlAgilityPack解析Html
C#/VB.NET實現從PPT中提取圖片的示例代碼
C# 16進制與字符串、字節數組之間的轉換
C#中圖片.BYTE[]和base64string的轉換方法
C#調用dos窗口獲取相關信息的方法
淺談C#手機號換成111XXXX1111 這種顯示的解決思路
C#使用DateAndTime.DateDiff實現計算年齡

