| 注册
请输入搜索内容

热门搜索

Java Linux MySQL PHP JavaScript Hibernate jQuery Nginx
jopen
10年前发布

C# 将 HTML 转成纯文本

/// <summary>
/// Converts HTML to plain text.
/// </summary>
/// <remarks>
/// The converter is a small hand-written scanner: markup is dropped, a fixed set of
/// block-level tags is turned into line breaks (or a tab for table cells), the content
/// of script/noscript/style/object elements is skipped, runs of whitespace are collapsed
/// outside of &lt;pre&gt;, and character entities are decoded at the very end.
/// Parsing state is kept in instance fields, so a single instance must not run
/// <see cref="Convert"/> concurrently from several threads.
/// </remarks>
class HtmlToText
{
    // Static data tables
    // Maps a lower-case tag name (with leading '/' for end tags) to the text emitted for it.
    protected static Dictionary<string, string> _tags;
    // Lower-case names of elements whose whole content is discarded.
    protected static HashSet<string> _ignoreTags;

    // Instance variables
    protected TextBuilder _text;   // Output accumulator
    protected string _html;        // Input being scanned (never null while converting)
    protected int _pos;            // Current read position within _html

    // Static constructor (one time only)
    static HtmlToText()
    {
        _tags = new Dictionary<string, string>
        {
            { "address", "\n" },
            { "blockquote", "\n" },
            { "div", "\n" },
            { "dl", "\n" },
            { "fieldset", "\n" },
            { "form", "\n" },
            { "h1", "\n" },
            { "/h1", "\n" },
            { "h2", "\n" },
            { "/h2", "\n" },
            { "h3", "\n" },
            { "/h3", "\n" },
            { "h4", "\n" },
            { "/h4", "\n" },
            { "h5", "\n" },
            { "/h5", "\n" },
            { "h6", "\n" },
            { "/h6", "\n" },
            { "p", "\n" },
            { "/p", "\n" },
            { "table", "\n" },
            { "/table", "\n" },
            { "ul", "\n" },
            { "/ul", "\n" },
            { "ol", "\n" },
            { "/ol", "\n" },
            { "/li", "\n" },
            { "br", "\n" },
            { "/td", "\t" },
            { "/tr", "\n" },
            { "/pre", "\n" }
        };

        _ignoreTags = new HashSet<string>
        {
            "script",
            "noscript",
            "style",
            "object"
        };
    }

    /// <summary>
    /// Converts the given HTML to plain text and returns the result.
    /// </summary>
    /// <param name="html">HTML to be converted; <c>null</c> is treated as an empty document</param>
    /// <returns>Resulting plain text</returns>
    public string Convert(string html)
    {
        // Initialize state variables
        _text = new TextBuilder();
        _html = html ?? String.Empty;
        _pos = 0;

        // Process input
        while (!EndOfText)
        {
            char c = Peek();
            if (c == '<')
            {
                // HTML tag
                bool selfClosing;
                string tag = ParseTag(out selfClosing);

                // Handle special tag cases
                switch (tag)
                {
                    case "body":
                        // Discard content before <body>
                        _text.Clear();
                        break;
                    case "/body":
                        // Discard content after </body>
                        _pos = _html.Length;
                        break;
                    case "pre":
                        // Enter preformatted mode
                        _text.Preformatted = true;
                        EatWhitespaceToNextLine();
                        break;
                    case "/pre":
                        // Exit preformatted mode
                        _text.Preformatted = false;
                        break;
                }

                // Emit the line break / tab associated with this tag, if any
                string value;
                if (_tags.TryGetValue(tag, out value))
                {
                    _text.Write(value);
                }

                // Skip everything inside elements whose content is not text
                if (_ignoreTags.Contains(tag))
                {
                    EatInnerContent(tag);
                }
            }
            else
            {
                // Outside <pre> every whitespace character is treated as a space;
                // all other characters are copied through unchanged
                if (Char.IsWhiteSpace(c) && !_text.Preformatted)
                {
                    c = ' ';
                }
                _text.Write(c);
                MoveAhead();
            }
        }

        // Entities are decoded last so that "&lt;" in text is never mistaken for markup
        return HttpUtility.HtmlDecode(_text.ToString());
    }

    // Eats all characters that are part of the current tag (or comment)
    // and returns the lower-case tag name, prefixed with '/' for end tags.
    // Comments are consumed through their closing "-->" and reported as "!--".
    // selfClosing is true only when the tag ends with "/>".
    protected string ParseTag(out bool selfClosing)
    {
        string tag = String.Empty;
        selfClosing = false;

        if (Peek() == '<')
        {
            MoveAhead();

            // A comment may legally contain '>' so it cannot be parsed like a tag
            if (String.CompareOrdinal(_html, _pos, "!--", 0, 3) == 0)
            {
                // Searching from _pos + 1 lets the degenerate "<!-->" and "<!--->" terminate
                int close = _html.IndexOf("-->", _pos + 1, StringComparison.Ordinal);
                _pos = (close < 0) ? _html.Length : close + 3;
                return "!--";
            }

            // Parse tag name
            EatWhitespace();
            int start = _pos;
            if (Peek() == '/')
            {
                MoveAhead();
            }
            while (!EndOfText && !Char.IsWhiteSpace(Peek()) &&
                Peek() != '/' && Peek() != '>')
            {
                MoveAhead();
            }
            // Invariant lower-casing: tag names must not depend on the current culture
            // (e.g. "DIV" would become "dıv" under tr-TR and miss the lookup tables)
            tag = _html.Substring(start, _pos - start).ToLowerInvariant();

            // Parse rest of tag
            while (!EndOfText && Peek() != '>')
            {
                char c = Peek();
                if (c == '"' || c == '\'')
                {
                    EatQuotedValue();
                    selfClosing = false;
                }
                else
                {
                    // Only a '/' that is the last non-blank character before '>'
                    // marks a self-closing tag; a '/' inside an unquoted attribute
                    // value (href=/x) does not
                    if (c == '/')
                    {
                        selfClosing = true;
                    }
                    else if (!Char.IsWhiteSpace(c))
                    {
                        selfClosing = false;
                    }
                    MoveAhead();
                }
            }
            MoveAhead();    // Closing '>'
        }
        return tag;
    }

    // Consumes inner content from the current tag, up to and including the
    // matching end tag (or the end of the input if there is none).
    // tag is the lower-case name of the element that was just opened.
    protected void EatInnerContent(string tag)
    {
        string endTag = "/" + tag;

        // script and style hold raw text, not markup: a '<' inside them is just a
        // character, and the element ends at the first "</script" / "</style"
        bool rawText = (tag == "script" || tag == "style");

        // Number of currently open elements named tag; only same-named elements
        // nest, other tags inside (param, embed, br, ...) need no end tag of their own
        int depth = 1;

        while (!EndOfText)
        {
            if (Peek() != '<' || (rawText && !IsTagAhead(endTag)))
            {
                MoveAhead();
                continue;
            }

            // Consume a tag
            bool selfClosing;
            string name = ParseTag(out selfClosing);
            if (name == endTag)
            {
                depth--;
                if (depth == 0)
                {
                    return;
                }
            }
            else if (name == tag && !selfClosing)
            {
                depth++;
            }
        }
    }

    // Returns true if the '<' at the current position is immediately followed
    // by the given tag name (compared case-insensitively) and a name delimiter
    protected bool IsTagAhead(string name)
    {
        int start = _pos + 1;
        int end = start + name.Length;
        if (end > _html.Length)
        {
            return false;
        }
        if (String.Compare(_html, start, name, 0, name.Length,
            StringComparison.OrdinalIgnoreCase) != 0)
        {
            return false;
        }
        if (end == _html.Length)
        {
            return true;
        }
        char next = _html[end];
        return Char.IsWhiteSpace(next) || next == '/' || next == '>';
    }

    // Returns true if the current position is at the end of
    // the string
    protected bool EndOfText
    {
        get { return (_pos >= _html.Length); }
    }

    // Safely returns the character at the current position
    // ('\0' once the end of the input has been reached)
    protected char Peek()
    {
        return (_pos < _html.Length) ? _html[_pos] : (char)0;
    }

    // Safely advances the current position to the next character
    protected void MoveAhead()
    {
        _pos = Math.Min(_pos + 1, _html.Length);
    }

    // Moves the current position to the next non-whitespace
    // character.
    protected void EatWhitespace()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            MoveAhead();
        }
    }

    // Moves the current position to the next non-whitespace
    // character or the start of the next line, whichever
    // comes first
    protected void EatWhitespaceToNextLine()
    {
        while (Char.IsWhiteSpace(Peek()))
        {
            char c = Peek();
            MoveAhead();
            if (c == '\n')
            {
                break;
            }
        }
    }

    // Moves the current position past a quoted value
    protected void EatQuotedValue()
    {
        char quote = Peek();
        if (quote == '"' || quote == '\'')
        {
            // Opening quote
            MoveAhead();
            // Find end of value; a line break also ends it so that an
            // unterminated quote cannot swallow the rest of the document
            _pos = _html.IndexOfAny(new char[] { quote, '\r', '\n' }, _pos);
            if (_pos < 0)
            {
                _pos = _html.Length;
            }
            else
            {
                MoveAhead();    // Closing quote
            }
        }
    }

    /// <summary>
    /// A StringBuilder class that helps eliminate excess whitespace.
    /// </summary>
    protected class TextBuilder
    {
        private StringBuilder _text;       // Completed output
        private StringBuilder _currLine;   // Line being assembled (normal mode only)
        private int _emptyLines;           // Consecutive empty lines seen so far
        private bool _preformatted;

        // Construction
        public TextBuilder()
        {
            _text = new StringBuilder();
            _currLine = new StringBuilder();
            _emptyLines = 0;
            _preformatted = false;
        }

        /// <summary>
        /// Normally, extra whitespace characters are discarded.
        /// If this property is set to true, they are passed
        /// through unchanged.
        /// </summary>
        public bool Preformatted
        {
            get
            {
                return _preformatted;
            }
            set
            {
                if (value)
                {
                    // Clear line buffer if changing to
                    // preformatted mode
                    if (_currLine.Length > 0)
                    {
                        FlushCurrLine();
                    }
                    _emptyLines = 0;
                }
                _preformatted = value;
            }
        }

        /// <summary>
        /// Clears all current text.
        /// </summary>
        public void Clear()
        {
            _text.Length = 0;
            _currLine.Length = 0;
            _emptyLines = 0;
        }

        /// <summary>
        /// Writes the given string to the output buffer.
        /// </summary>
        /// <param name="s">Text to write, processed one character at a time</param>
        public void Write(string s)
        {
            foreach (char c in s)
            {
                Write(c);
            }
        }

        /// <summary>
        /// Writes the given character to the output buffer.
        /// </summary>
        /// <param name="c">Character to write</param>
        public void Write(char c)
        {
            if (_preformatted)
            {
                // Write preformatted character
                _text.Append(c);
                return;
            }

            if (c == '\r')
            {
                // Ignore carriage returns. We'll process
                // '\n' if it comes next
            }
            else if (c == '\n')
            {
                // Flush current line
                FlushCurrLine();
            }
            else if (Char.IsWhiteSpace(c))
            {
                // Write single space character, collapsing runs of whitespace
                int len = _currLine.Length;
                if (len == 0 || !Char.IsWhiteSpace(_currLine[len - 1]))
                {
                    _currLine.Append(' ');
                }
            }
            else
            {
                // Add character to current line
                _currLine.Append(c);
            }
        }

        // Appends the current line to output buffer
        protected void FlushCurrLine()
        {
            // Get current line
            string line = _currLine.ToString().Trim();

            if (line.Length == 0)
            {
                // An empty line: keep at most one in a row, and none
                // at the very start of the output
                _emptyLines++;
                if (_emptyLines < 2 && _text.Length > 0)
                {
                    _text.AppendLine(line);
                }
            }
            else
            {
                // A non-empty line
                _emptyLines = 0;
                _text.AppendLine(line);
            }

            // Reset current line
            _currLine.Length = 0;
        }

        /// <summary>
        /// Returns the current output as a string.
        /// </summary>
        public override string ToString()
        {
            if (_currLine.Length > 0)
            {
                FlushCurrLine();
            }
            return _text.ToString();
        }
    }
}

使用方法
// Usage example: run the converter over the text of textBox1 and show the result in
// textBox2 (both are presumably text box controls declared by the calling form).
HtmlToText converter = new HtmlToText();
string plainText = converter.Convert(textBox1.Text);
textBox2.Text = plainText;