日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > C# > 内容正文

C#

C#---HTML 转文本及HTML内容提取

發布時間:2025/3/21 C# 18 豆豆
生活随笔 收集整理的這篇文章主要介紹了 C#---HTML 转文本及HTML内容提取，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

using System;
using System.Collections.Generic;
using System.Text;
using System.Web;

// 1. Converting HTML directly to plain text.
//
// Usage:
//     HtmlToText convert = new HtmlToText();
//     textBox2.Text = convert.Convert(textBox1.Text);

/// <summary>
/// Converts HTML to plain text.
/// </summary>
class HtmlToText
{
    // Static data tables

    /// <summary>Maps a lower-case tag name to the text emitted when that tag is seen.</summary>
    protected static Dictionary<string, string> _tags;

    /// <summary>Lower-case names of elements whose whole inner content is discarded.</summary>
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;
    protected string _html;
    protected int _pos;

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string> { "script", "noscript", "style", "object" };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; <c>null</c> is treated as an empty string.</param>
    /// <returns>Resulting plain text with HTML entities decoded.</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            char c = Peek();
            if (c == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                if (tag == "body")
                {
                    // Discard content before <body>
                    _text.Clear();
                }
                else if (tag == "/body")
                {
                    // Discard content after </body>
                    _pos = _html.Length;
                }
                else if (tag == "pre")
                {
                    // Enter preformatted mode
                    _text.Preformatted = true;
                    EatWhitespaceToNextLine();
                }
                else if (tag == "/pre")
                {
                    // Exit preformatted mode
                    _text.Preformatted = false;
                }

                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Whitespace (treat all as space unless preformatted)
                _text.Write(_text.Preformatted ? c : ' ');
                MoveAhead();
            }
            else
            {
                // Other text
                _text.Write(c);
                MoveAhead();
            }
        }

        // Return result
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    /// <summary>
    /// Eats all characters that are part of the tag at the current position
    /// and returns the lower-case tag name (with a leading '/' for end tags).
    /// An HTML comment is consumed in full and reported as <c>"!--"</c>.
    /// </summary>
    /// <param name="selfClosing">Set to true when a '/' appears after the tag name.</param>
    /// <returns>The tag name, or an empty string if the current character is not '&lt;'.</returns>
    protected string ParseTag(out bool selfClosing)
    {
        selfClosing = false;
        if (Peek() != '<')
        {
            return String.Empty;
        }

        MoveAhead();

        // Comments are not tags: their body may contain '>' or quote characters,
        // so skip straight to the terminator instead of scanning for '>'.
        if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
        {
            // Searching from just after '!' also accepts the short forms <!--> and <!--->.
            int close = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
            _pos = (close < 0) ? _html.Length : close + 3;
            return "!--";
        }

        // Parse tag name
        EatWhitespace();
        int start = _pos;
        if (Peek() == '/')
        {
            MoveAhead();
        }

        while (!EndOfText && !Char.IsWhiteSpace(Peek()) && Peek() != '/' && Peek() != '>')
        {
            MoveAhead();
        }

        // Invariant casing keeps the table lookups culture-independent (e.g. Turkish 'I').
        string tag = _html.Substring(start, _pos - start).ToLowerInvariant();

        // Parse rest of tag
        while (!EndOfText && Peek() != '>')
        {
            if (Peek() == '"' || Peek() == '\'')
            {
                EatQuotedValue();
            }
            else
            {
                if (Peek() == '/')
                {
                    selfClosing = true;
                }

                MoveAhead();
            }
        }

        MoveAhead(); // Closing '>'
        return tag;
    }

    /// <summary>
    /// Consumes the inner content of the element whose start tag was just parsed,
    /// up to and including its matching end tag (or the end of input).
    /// </summary>
    /// <param name="tag">Lower-case name of the element being skipped.</param>
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        if (tag == "script" || tag == "style")
        {
            // Script and style bodies are raw text: '<' and quotes inside them are
            // not markup, so look for the literal end tag rather than parsing tags.
            string marker = "<" + endTag;
            while (!EndOfText)
            {
                int close = _html.IndexOf(marker, _pos, StringComparison.OrdinalIgnoreCase);
                if (close < 0)
                {
                    _pos = _html.Length;
                    return;
                }

                _pos = close;
                bool ignored;
                if (ParseTag(out ignored) == endTag)
                {
                    return;
                }
            }

            return;
        }

        // Other ignored elements may contain markup, including nested elements of
        // the same name, so track nesting depth instead of recursing.
        int depth = 1;
        while (!EndOfText)
        {
            if (Peek() != '<')
            {
                MoveAhead();
                continue;
            }

            bool selfClosing;
            string name = ParseTag(out selfClosing);
            if (name == endTag)
            {
                depth--;
                if (depth == 0)
                {
                    return;
                }
            }
            else if (name == tag && !selfClosing)
            {
                depth++;
            }
        }
    }

    /// <summary>Returns true if the current position is at the end of the string.</summary>
    protected bool EndOfText
    {
        get { return _pos >= _html.Length; }
    }

    /// <summary>Safely returns the character at the current position, or '\0' at the end.</summary>
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    /// <summary>Safely advances the current position to the next character.</summary>
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    /// <summary>Moves the current position to the next non-whitespace character.</summary>
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    /// <summary>
    /// Moves the current position to the next non-whitespace character or the
    /// start of the next line, whichever comes first.
    /// </summary>
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    /// <summary>
    /// Moves the current position past a quoted value. The value ends at the
    /// matching quote or at a line break, whichever comes first.
    /// </summary>
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote != '"' && quote != '\'')
        {
            return;
        }

        // Opening quote
        MoveAhead();

        // Find end of value
        int end = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
        if (end < 0)
        {
            _pos = _html.Length;
        }
        else
        {
            _pos = end;
            MoveAhead(); // Closing quote (or line break)
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;
        private StringBuilder _currLine;
        private int _emptyLines;
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }

                    _emptyLines = 0;
                }

                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write; <c>null</c> writes nothing.</param>
        public void Write(string s)
        {
            if (s == null)
            {
                return;
            }

            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
                return;
            }

            if (c == '\r')
            {
                // Ignore carriage returns. We'll process '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Collapse runs of whitespace into a single space character
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        /// <summary>
        /// Appends the current line to the output buffer. Leading blank lines are
        /// dropped and consecutive blank lines are reduced to one.
        /// </summary>
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }

            return _text.ToString();
        }
    }
}

// 2. Extracting the text content of an HTML document.
namespace HtmlStrip
{
    class MainClass
    {
        public static void Main(string[] args)
        {
            string str = "<div>abc</div><span>efg</span><br /><script>888</script><!--<PA>WW</PA-->oo";
            //System.IO.StreamReader rd=new System.IO.StreamReader ("/home/lx/test.html");
            //str=rd.ReadToEnd ();
            HtmlParser t = new HtmlParser(str);
            //t.KeepTag (new string[] { "br" }); // keep <br> tags instead of filtering them out
            Console.Write(t.Text());
        }
    }

    /// <summary>
    /// Strips markup from HTML, optionally keeping selected tags verbatim.
    /// </summary>
    class HtmlParser
    {
        // Tags whose following content is normally not wanted in the output
        private static readonly string[] specialTag = new string[] { "script", "style", "!--" };

        private readonly string htmlcode;                            // the HTML being analysed
        private readonly StringBuilder result = new StringBuilder(); // accumulated output
        private int seek;                                            // current read position in htmlcode
        private string[] keepTag;                                    // tags to copy to the output unchanged
        private bool _inTag;                                         // backing field of inTag
        private bool tagOpen;                                        // true between '<' and its closing '>'
        private bool needContent = true;                             // whether text is currently copied to the output
        private string tagName = "";                                 // name of the tag read most recently

        /// <summary>
        /// Whether the read position is inside a tag. Setting this to true reads the
        /// tag name starting at the current position: reading stops after the
        /// whitespace that follows the name, or just before '&gt;' (in which case the
        /// value reverts to false), or at the end of input.
        /// </summary>
        public bool inTag
        {
            get { return _inTag; }
            set
            {
                _inTag = value;
                if (value)
                {
                    ReadTagName();
                }
            }
        }

        /// <summary>
        /// Initializes the parser.
        /// </summary>
        /// <param name="html">The HTML to analyse; <c>null</c> is treated as an empty string.</param>
        public HtmlParser(string html)
        {
            htmlcode = html ?? String.Empty;
            KeepTag(new string[] { });
        }

        /// <summary>
        /// Sets which tags are kept in the output instead of being filtered out.
        /// </summary>
        /// <param name="tags">Tag names, compared case-insensitively; <c>null</c> keeps none.</param>
        public void KeepTag(string[] tags)
        {
            keepTag = tags ?? new string[] { };
        }

        /// <summary>
        /// Processes the remaining input and returns the extracted text.
        /// </summary>
        /// <returns>The text with markup, comments and script/style content removed.</returns>
        public string Text()
        {
            int tagStart = 0;
            while (seek < htmlcode.Length)
            {
                char c = htmlcode[seek++];
                if (c == '<')
                {
                    // A comment ends at "-->", not at the first '>', so skip it as a unit.
                    if (String.CompareOrdinal(htmlcode, seek, "!--", 0, 3) == 0)
                    {
                        // Searching from just after '!' also accepts <!--> and <!--->.
                        int close = htmlcode.IndexOf("-->", seek + 1, StringComparison.Ordinal);
                        seek = (close < 0) ? htmlcode.Length : close + 3;
                        continue;
                    }

                    tagStart = seek - 1;
                    tagOpen = true;
                    inTag = true;
                }
                else if (c == '>' && tagOpen)
                {
                    tagOpen = false;
                    inTag = false;
                    if (iskeepTag(tagName.Replace("/", "")))
                    {
                        // Copy the whole tag, from '<' through '>', unchanged.
                        result.Append(htmlcode, tagStart, seek - tagStart);
                    }
                    else
                    {
                        // Text after <script>/<style> is dropped until the next ordinary tag.
                        needContent = !IsSpecialTag(tagName);
                    }
                }
                else if (!tagOpen && needContent)
                {
                    // Plain text, including a '>' that does not close any tag.
                    result.Append(c);
                }
            }

            return result.ToString();
        }

        /// <summary>
        /// Reads the tag name at the current position into <c>tagName</c>.
        /// </summary>
        private void ReadTagName()
        {
            StringBuilder name = new StringBuilder();
            while (seek < htmlcode.Length)
            {
                char c = htmlcode[seek];
                if (c == '>')
                {
                    // Leave '>' unread so Text() handles the end of the tag.
                    _inTag = false;
                    break;
                }

                seek++;
                if (Char.IsWhiteSpace(c))
                {
                    // Whitespace before the name is skipped; after it, the name is complete.
                    if (name.Length > 0)
                    {
                        break;
                    }
                }
                else
                {
                    name.Append(c);
                }
            }

            tagName = name.ToString();
        }

        /// <summary>
        /// Determines whether the given tag name is one of the special tags.
        /// </summary>
        private static bool IsSpecialTag(string tag)
        {
            foreach (string special in specialTag)
            {
                if (String.Equals(tag, special, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }

        /// <summary>
        /// Determines whether the given tag is to be kept in the output.
        /// </summary>
        /// <param name="tag">Tag name without any '/'.</param>
        /// <returns>True if the tag matches an entry passed to <see cref="KeepTag"/>.</returns>
        private bool iskeepTag(string tag)
        {
            foreach (string kept in keepTag)
            {
                if (String.Equals(tag, kept, StringComparison.OrdinalIgnoreCase))
                {
                    return true;
                }
            }

            return false;
        }
    }
}

  ===========該文轉自=========

http://blog.csdn.net/cjh200102/article/details/6824895#

================================

轉載于:https://www.cnblogs.com/zjw520/archive/2013/04/11/3014848.html

總結

以上是生活随笔為你收集整理的C#---HTML 转文本及HTML内容提取的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。

主站蜘蛛池模板: 雪花飘电影在线观看免费高清 | av观看在线免费 | 91av视频 | 91视频www | 少妇精品亚洲一区二区成人 | 欧美呦交 | 伊人88| 啪啪av网站 | 少妇裸体淫交视频免费看高清 | 日本在线中文字幕专区 | 四虎国产在线观看 | 成人免费网站黄 | 伊人影片 | 手机av中文字幕 | 免费在线观看黄色av | 一区二区三区不卡视频在线观看 | 成人一区二区精品 | 男人猛进女人爽的大叫 | 2019年中文字幕| 久久三区| 夜夜操女人 | 岛国片在线播放 | av免费网站 | 中文在线免费看视频 | 亚洲国产精品一区二区久久hs | 三级黄色网 | 中文字幕一区二区三区乱码 | 韩国三级中文字幕 | 热久久国产精品 | 黄色第一网站 | 欧美自拍亚洲 | av影院在线观看 | 黑人干亚洲女人 | 不卡的日韩av | 日本毛片在线看 | 4438成人网| 天天干狠狠爱 | 亚洲美女性视频 | 色婷婷综合久久久久中文字幕 | 黄色资源在线播放 | 久久久电影 | 国产黄色大片免费看 | 性欧美欧美巨大69 | 日日碰狠狠添天天爽无码 | 中文字幕第22页 | 美女毛片在线观看 | 国产一区免费视频 | 老头糟蹋新婚少妇系列小说 | 爱爱的网站 | 欧美精品第1页 | 国产原创在线 | 天天干天天噜 | 欧美高清一区 | 无码精品a∨在线观看中文 福利片av | 黄色网址在线免费观看 | 日本激情视频一区二区三区 | 亚洲区小说区图片区qvod | www.伊人.com| 糖心av | 91在线视频网址 | 我要看18毛片 | 久久久剧场 | 精品不卡一区 | 久久国产精品久久久 | 国产日韩视频在线 | 久久99成人 | 98国产精品 | 橹图极品美女无圣光 | 懂色av,蜜臀av粉嫩av | 亚洲av无码一区二区三区网站 | 亚洲搞av| 中国成人毛片 | 免费的黄色av | 99久久精| 夜夜激情网 | 日本午夜一区二区 | 免费一级淫片aaa片毛片a级 | 国产一区二区在线看 | 久久久看| 亚洲精品久久久久久无码色欲四季 | 老色批永久免费网站www | 成人va在线观看 | 精品国产乱码久久久久久婷婷 | 午夜在线影院 | 91成人福利视频 | 欧美激情视频一区二区三区在线播放 | 中文字幕在线观看第二页 | 看全色黄大色黄大片女一次牛 | 国产精品果冻传媒 | 手机在线播放av | 国产精品成人av性教育 | 深夜福利视频在线 | 亚洲蜜臀av乱码久久精品蜜桃 | 亚洲欧美校园春色 | 麻豆传谋在线观看免费mv | 欧美a√| 国产视频一二区 | 久久久久久中文字幕 | 人成在线|