利用正则表达式去掉html代码

9/18/2006来源:ASP.NET技巧人气:15031

using System.Text.RegularExPRessions;//需要引用

  // 利用正则表达式去掉"<"和">"之间的内容
  private string StripHT(string strHtml)
  {
   Regex regex=new Regex("<.+?>",RegexOptions.IgnoreCase);
   string strOutput=regex.Replace(strHtml,"");
   return strOutput;
  }


//方法二(不知为什么此方法占用CPU100%)

public static string DropHTML(string strHtml)
  {
   string [] aryReg ={
          @"<script[^>]*?>.*?</script>",
          @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""''])(\\[""''tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
          @"([\r])[\s]+",
          @"&(quot|#34);",
          @"&(amp|#38);",
          @"&(lt|#60);",
          @"&(gt|#62);",
          @"&(nbsp|#160);",
          @"&(iexcl|#161);",
          @"&(cent|#162);",
          @"&(pound|#163);",
          @"&(copy|#169);",
          @"&#(\d+);",
          @"-->",
          @"<!--.*"        
         };

   string [] aryRep = {
           "",
           "",
           "",
           "\"",
           "&",
           "<",
           ">",
           " ",
           "\xa1",//chr(161),
           "\xa2",//chr(162),
           "\xa3",//chr(163),
           "\xa9",//chr(169),
           "",
           "\r",
           ""   
          };

   string newReg =aryReg[0];
   string strOutput=strHtml;
   for(int i = 0;i<aryReg.Length;i++)
   {
    Regex regex = new Regex(aryReg[i],RegexOptions.IgnoreCase );
    strOutput = regex.Replace(strOutput,aryRep[i]);
   }

   strOutput.Replace("<","");
   strOutput.Replace(">","");
   strOutput.Replace("\r","");
   return strOutput;
     
  }
http://www.cnblogs.com/wang123/archive/2006/09/16/505758.html