提取HTML代码中文字的C#函数

/// <summary>
/// Removes HTML markup from a string and returns the remaining text.
/// </summary>
/// <remarks>
/// Script elements, tags and comments are deleted, a small set of character
/// entities is decoded, and any angle brackets or CR/LF pairs still present
/// afterwards are removed, so the result never contains an angle bracket.
/// Numeric entities other than the ones listed in the table are dropped.
/// </remarks>
/// <param name="strHtml">Source text that contains HTML markup.</param>
/// <returns>
/// The text with the markup removed; an empty string when
/// <paramref name="strHtml"/> is null or empty.
/// </returns>
public static string StripHTML(string strHtml)
{
    if (string.IsNullOrEmpty(strHtml))
    {
        return string.Empty;
    }

    // Patterns are applied in order; aryRep[i] is the replacement for aryReg[i].
    string[] aryReg =
    {
        // (?s) lets '.' cross line breaks so multi-line script bodies are removed too.
        @"(?s)<script[^>]*?>.*?</script>",
        // Any opening, closing or self-closing tag, including quoted attribute values.
        // NOTE(review): nested quantifiers here may backtrack heavily on malformed,
        // unterminated tags - consider a match timeout if the input is untrusted.
        @"<(\/\s*)?!?((\w+:)?\w+)(\w+(\s*=?\s*(([""'])(\\[""'tbnr]|[^\7])*?\7|\w+)|.{0})|\s)*?(\/\s*)?>",
        // A line break together with the whitespace that follows it.
        @"([\r\n])[\s]+",
        @"&(quot|#34);",
        @"&(amp|#38);",
        @"&(lt|#60);",
        @"&(gt|#62);",
        @"&(nbsp|#160);",
        @"&(iexcl|#161);",
        @"&(cent|#162);",
        @"&(pound|#163);",
        @"&(copy|#169);",
        // Every remaining numeric entity is discarded.
        @"&#(\d+);",
        // A comment terminator becomes a line break so that the next pattern,
        // which stops at '\n', can delete the comment.
        @"-->",
        @"<!--.*\n"
    };

    string[] aryRep =
    {
        "",
        "",
        "",
        "\"",
        "&",
        "<",
        ">",
        " ",
        "\u00A1", // chr(161)
        "\u00A2", // chr(162)
        "\u00A3", // chr(163)
        "\u00A9", // chr(169)
        "",
        "\r\n",
        ""
    };

    string strOutput = strHtml;
    for (int i = 0; i < aryReg.Length; i++)
    {
        strOutput = Regex.Replace(strOutput, aryReg[i], aryRep[i], RegexOptions.IgnoreCase);
    }

    // string.Replace returns a new string; the results must be assigned or the
    // calls have no effect. This removes stray brackets (including ones produced
    // by decoding &lt; / &gt;) and the line break left by an unmatched "-->".
    strOutput = strOutput.Replace("<", "");
    strOutput = strOutput.Replace(">", "");
    strOutput = strOutput.Replace("\r\n", "");

    return strOutput;
}
转自: http://goaler.xicp.net/ShowLog.asp?ID=477
Published At
Categories with Web编程
Tagged with
comments powered by Disqus