// Ordered rewrite rules applied by StripHTML: each entry is { regex pattern, replacement }.
// Order matters: tags are removed before entities are decoded, and "-->" is turned
// into a line break so that the final comment rule (which needs a trailing '\n') can match.
private static readonly string[][] StripHtmlRules =
{
    // Whole script elements. [\s\S] is used instead of '.' so bodies spanning lines match.
    new[] { @"<script[^>]*?>[\s\S]*?</script>", "" },
    // Any opening/closing tag, with optional quoted or bare attribute values.
    // NOTE(review): the nested quantifiers look prone to heavy backtracking on long
    // unterminated tags - consider a match timeout if input is untrusted.
    new[] { @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>", "" },
    // A line-break character followed by further whitespace.
    new[] { @"([\r\n])[\s]+", "" },
    new[] { @"&(quot|#34);", "\"" },
    new[] { @"&(amp|#38);", "&" },
    new[] { @"&(lt|#60);", "<" },
    new[] { @"&(gt|#62);", ">" },
    new[] { @"&(nbsp|#160);", " " },
    new[] { @"&(iexcl|#161);", "\u00A1" }, // chr(161)
    new[] { @"&(cent|#162);", "\u00A2" },  // chr(162)
    new[] { @"&(pound|#163);", "\u00A3" }, // chr(163)
    new[] { @"&(copy|#169);", "\u00A9" },  // chr(169)
    // Any remaining numeric entity is dropped.
    new[] { @"&#(\d+);", "" },
    new[] { @"-->", "\r\n" },
    // Comment opener through the end of that line.
    new[] { @"<!--.*\n", "" }
};

// Regex instances for StripHtmlRules, built once instead of on every call.
private static readonly Regex[] StripHtmlRegexes = BuildStripHtmlRegexes();

// Compiles the pattern of every rule in StripHtmlRules, case-insensitively.
private static Regex[] BuildStripHtmlRegexes()
{
    Regex[] compiled = new Regex[StripHtmlRules.Length];
    for (int i = 0; i < compiled.Length; i++)
    {
        compiled[i] = new Regex(StripHtmlRules[i][0], RegexOptions.IgnoreCase);
    }
    return compiled;
}

/// <summary>
/// Removes HTML markup from a string and decodes a small set of character entities.
/// </summary>
/// <param name="strHtml">Source text that may contain HTML markup.</param>
/// <returns>
/// The text with markup removed, or an empty string when
/// <paramref name="strHtml"/> is null or empty.
/// </returns>
/// <remarks>
/// After the regex rules run, every remaining angle bracket and CR-LF pair is removed.
/// This includes angle brackets produced by decoding the lt/gt entities.
/// </remarks>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    string strOutput = strHtml;
    for (int i = 0; i < StripHtmlRegexes.Length; i++)
    {
        strOutput = StripHtmlRegexes[i].Replace(strOutput, StripHtmlRules[i][1]);
    }

    // string.Replace returns a new string; the results must be assigned, otherwise
    // these clean-up steps have no effect.
    strOutput = strOutput.Replace("<", "")
                         .Replace(">", "")
                         .Replace("\r\n", "");

    return strOutput;
}
// Reposted from: http://goaler.xicp.net/ShowLog.asp?ID=477