正则表达式过滤html
在项目中经常会用正则表达式过滤html,比如得到Body里面的内容,获取网页中的img、a标签,或者得到纯文本等等。下面的Demo实现了对Html的过滤。
主要用到的命名空间:
1、System.Text.RegularExpressions; // 正则表达式
2、System.IO;  // IO流(StreamReader)
3、System.Net; // 网络请求(WebRequest / WebResponse)
4、System.Text; // 文本编码(Encoding)和 StringBuilder
第一步:搭建简易前台页面
<%-- Input form for the demo. Control IDs and OnClick handler names must match the
     code-behind, which reads TextBox1, writes tbResult, and defines the handlers
     referenced below. --%>
<form id="form1" runat="server">
<div>
⽬标源地址:<asp:TextBox ID="TextBox1" runat="server"></asp:TextBox></div>
<br />
<%-- Multi-line box that receives the output of every button. --%>
<asp:TextBox runat="server" TextMode="MultiLine" Width="500px" Height="500px" ID="tbResult"></asp:TextBox>
<br />
<asp:Button ID="btnRetrieveAll" runat="server" Text="搜索整个Html源码" OnClick="btnRetrieveAll_Click" />
<asp:Button ID="btnRetrievePureTxt" runat="server" Text="搜索纯⽂本" OnClick="btnRetrievePureText_Click" />
<asp:Button ID="btnRetrieveLink" runat="server" Text="搜索链接标签" OnClick="btnRetrievelink_Click" />
<asp:Button ID="btnRetrieveImg" runat="server" Text="搜索图⽚标签"
OnClick="btnRetrieveImage_Click" />
<asp:Button ID="btnRetriveScript" runat="server" Text="搜索脚本"
OnClick="btnRetrieveSriptCode_Click" />
</form>
第二步:定义类级变量
// Text shown in the result box whenever a download is flagged as failed.
private const string MsgPageRetrieveFailed = "对不起,⽹页运⾏失败!";

// Address of the page to download; the click handlers pass it to GetWholeHtmlCode.
private string strUrl = string.Empty;

// Value most recently returned by GetWholeHtmlCode.
private string strWholeHtml = string.Empty;

// Starts as true; GetWholeHtmlCode sets it to false when the download fails.
private bool flgPageRetrieved = true;
第三步:根据目标源地址获取目标html源码
///<summary>
/// Downloads the page at <paramref name="url"/> with HttpWebRequest/HttpWebResponse
/// and returns the response body decoded as UTF-8.
/// Sets <c>flgPageRetrieved</c> to true on success and to false on any failure.
///</summary>
///<param name="url">Address of the page to download.</param>
///<returns>
/// The HTML source on success; a fixed failure message when the response status is
/// not 200 OK; or the exception message when the request throws.
///</returns>
public string GetWholeHtmlCode(string url)
{
    // Start from a clean state so an earlier failed call cannot leak into this result.
    flgPageRetrieved = true;

    try
    {
        // Use the caller-supplied address, not the strUrl field.
        HttpWebRequest wrqContent = (HttpWebRequest)WebRequest.Create(url);
        wrqContent.Timeout = 300000; // milliseconds (5 minutes)

        using (HttpWebResponse wrpContent = (HttpWebResponse)wrqContent.GetResponse())
        {
            if (wrpContent.StatusCode != HttpStatusCode.OK)
            {
                // Return immediately so the failure message is not overwritten by the body.
                flgPageRetrieved = false;
                return "对不起,⽹页运⾏失败";
            }

            using (StreamReader strReader = new StreamReader(wrpContent.GetResponseStream(), Encoding.GetEncoding("utf-8")))
            {
                return strReader.ReadToEnd();
            }
        }
    }
    catch (Exception ex)
    {
        // Callers test flgPageRetrieved; the message is returned for diagnostics only.
        flgPageRetrieved = false;
        return ex.Message;
    }
}
获取目标URL的html源码
/// <summary>
/// Downloads the page whose address is typed in <c>TextBox1</c> and shows its
/// complete HTML source in <c>tbResult</c>, or <c>MsgPageRetrieveFailed</c> when
/// the download is flagged as failed.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void btnRetrieveAll_Click(object sender, EventArgs e)
{
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);

    // GetWholeHtmlCode signals failure through flgPageRetrieved, not its return value.
    if (flgPageRetrieved)
    {
        tbResult.Text = strWholeHtml;
    }
    else
    {
        tbResult.Text = MsgPageRetrieveFailed;
    }
}
获取Html源码中的纯文本
///<summary>
/// Downloads the page whose address is typed in <c>TextBox1</c>, takes the first
/// &lt;body&gt;...&lt;/body&gt; section and shows its text with all tags removed in
/// <c>tbResult</c>. Shows <c>MsgPageRetrieveFailed</c> when the download fails.
///</summary>
///<param name="sender">Event source (not used).</param>
///<param name="e">Event data (not used).</param>
protected void btnRetrievePureText_Click(object sender, EventArgs e)
{
    // strUrl is initialised to an empty string, so the address has to be read from
    // the input box here (as btnRetrieveAll_Click does) before downloading.
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);
    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // [\s\S] matches any character including newlines without the per-character
    // alternation/capture of (\w|\W), which backtracks heavily on large pages.
    Match bodyMatch = Regex.Match(strWholeHtml, @"<body[^>]*>[\s\S]*?</body[^>]*>", RegexOptions.IgnoreCase);

    // Value is an empty string when the page has no <body> section.
    string bodyHtml = bodyMatch.Value;

    // Drop script and style blocks entirely: their content is code, not page text.
    string bodyWithoutCode = Regex.Replace(bodyHtml, @"<(script|style)\b[^>]*>[\s\S]*?</\1[^>]*>", string.Empty, RegexOptions.IgnoreCase);

    // Remove every remaining tag, leaving only the text between tags.
    tbResult.Text = Regex.Replace(bodyWithoutCode, @"<[^>]*>", string.Empty);
}
获取脚本代码
///<summary>
/// Downloads the page whose address is typed in <c>TextBox1</c> and shows the
/// content of every &lt;script&gt; element, one per CRLF-terminated entry, in
/// <c>tbResult</c>. Shows <c>MsgPageRetrieveFailed</c> when the download fails.
///</summary>
///<param name="sender">Event source (not used).</param>
///<param name="e">Event data (not used).</param>
protected void btnRetrieveSriptCode_Click(object sender, EventArgs e)
{
    // strUrl is initialised to an empty string, so the address has to be read from
    // the input box here (as btnRetrieveAll_Click does) before downloading.
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);
    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // Group 1 captures the script body itself. Taking the capture instead of
    // stripping "<...>" from the whole match keeps code such as "a < b && c > d"
    // intact.
    MatchCollection scriptMatches = Regex.Matches(strWholeHtml, @"<script\b[^>]*>([\s\S]*?)</script[^>]*>", RegexOptions.IgnoreCase);

    StringBuilder scriptText = new StringBuilder();
    foreach (Match scriptMatch in scriptMatches)
    {
        scriptText.Append(scriptMatch.Groups[1].Value).Append("\r\n");
    }

    tbResult.Text = scriptText.ToString();
}
获取图片img标签
///<summary>
/// Downloads the page whose address is typed in <c>TextBox1</c> and lists every
/// &lt;img ...&gt; tag found in its HTML, one per CRLF-terminated entry, in
/// <c>tbResult</c>. Shows <c>MsgPageRetrieveFailed</c> when the download fails.
///</summary>
///<param name="sender">Event source (not used).</param>
///<param name="e">Event data (not used).</param>
protected void btnRetrieveImage_Click(object sender, EventArgs e)
{
    // strUrl is initialised to an empty string, so the address has to be read from
    // the input box here (as btnRetrieveAll_Click does) before downloading.
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);
    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // (?is): case-insensitive, and '.' also matches newlines so tags that span
    // several lines are found. The inline flags make a RegexOptions argument redundant.
    MatchCollection imageMatches = Regex.Matches(strWholeHtml, @"(?is)<img.*?>");

    StringBuilder imageTags = new StringBuilder();
    foreach (Match imageMatch in imageMatches)
    {
        imageTags.Append(imageMatch.Value).Append("\r\n");
    }

    tbResult.Text = imageTags.ToString();
}
html链接
/// <summary>
/// Downloads the page whose address is typed in <c>TextBox1</c> and lists every
/// opening &lt;a ...&gt; tag found in its HTML, one per CRLF-terminated entry, in
/// <c>tbResult</c>. Shows <c>MsgPageRetrieveFailed</c> when the download fails.
/// </summary>
/// <param name="sender">Event source (not used).</param>
/// <param name="e">Event data (not used).</param>
protected void btnRetrievelink_Click(object sender, EventArgs e)
{
    strUrl = TextBox1.Text;
    strWholeHtml = this.GetWholeHtmlCode(strUrl);
    if (!flgPageRetrieved)
    {
        tbResult.Text = MsgPageRetrieveFailed;
        return;
    }

    // \s instead of a literal space: attributes may follow the tag name after a
    // tab or a newline. (?is) makes the match case-insensitive and lets '.' span
    // newlines, so a separate RegexOptions argument is not needed.
    MatchCollection linkMatches = Regex.Matches(strWholeHtml, @"(?is)<a\s.*?>");

    StringBuilder linkTags = new StringBuilder();
    foreach (Match linkMatch in linkMatches)
    {
        linkTags.Append(linkMatch.Value).Append("\r\n");
    }

    tbResult.Text = linkTags.ToString();
}
这个Demo能满足大多数过滤Html的需求。

版权声明:本站内容均来自互联网,仅供演示用,请勿用于商业和其他非法用途。如果侵犯了您的权益请与我们联系QQ:729038198,我们将在24小时内删除。