// A .NET class for fetching and analysing web pages.
//
// Main features:
//   1. Extract the plain text of a page, removing all HTML tags and JavaScript code.
//   2. Extract the links of a page, including href, frame and iframe targets.
//   3. Extract the page title (other tags can be handled the same way with a regex).
//   4. Simple form submission (login) with per-host cookie persistence.
//
// NOTE(review): this file was recovered from a PDF text extraction that dropped
// braces, quotes, generic arguments and everything between angle brackets inside
// the regular-expression literals. Every regex below is a reconstruction of the
// evident intent; confirm each pattern against the original listing if available.

using System;
using System.Data;
using System.Configuration;
using System.Net;
using System.IO;
using System.Text;
using System.Collections.Generic;
using System.Text.RegularExpressions;
using System.Threading;
using System.Web;

/// <summary>
/// Represents one downloaded web page and offers helpers to read its title,
/// plain text and links.
/// </summary>
public class WebPage
{
    #region Private members

    private const RegexOptions PatternOptions = RegexOptions.Multiline | RegexOptions.IgnoreCase;

    // User-Agent header sent with every request.
    private const string UserAgent = "Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0)";

    // Responses that declare a Content-Length above this value (4 MiB) are rejected.
    private const long MaxPageBytes = 1 << 22;

    // NOTE(review): reconstructed patterns, see the file header.
    // "[\s\S]*?" replaces the original "(\w|\W)*?"; both match any character lazily.
    private static readonly Regex AnchorRegex = new Regex(
        "<a[^><]+href=(\"|')?(?<url>([^>\"'\\s])+)(\"|')?[^>]*>(?<text>[\\s\\S]*?)</", PatternOptions);
    private static readonly Regex FrameRegex = new Regex(
        "<[i]*frame[^><]+src=(\"|')?(?<url>([^>\"'\\s])+)(\"|')?[^>]*>", PatternOptions);
    private static readonly Regex LinkTextNoiseRegex = new Regex(
        "(<[^>]+>)|(\\s)|(&nbsp;)|&|\"", PatternOptions);
    // NOTE(review): the three element names (script, style, select) are not visible
    // in the extracted text; they are presumed from the feature description.
    private static readonly Regex ScriptRegex = new Regex(
        "<script[^>]*>[\\s\\S]*?</script[^>]*>", PatternOptions);
    private static readonly Regex StyleRegex = new Regex(
        "<style[^>]*>[\\s\\S]*?</style[^>]*>", PatternOptions);
    private static readonly Regex SelectRegex = new Regex(
        "<select[^>]*>[\\s\\S]*?</select[^>]*>", PatternOptions);
    private static readonly Regex AnchorElementRegex = new Regex(
        "<a[^>]*>[\\s\\S]*?</a[^>]*>", PatternOptions);
    private static readonly Regex TagOrNbspRegex = new Regex(
        "(<[^>]+?>)|&nbsp;", PatternOptions);
    private static readonly Regex WhitespaceRegex = new Regex(
        "(\\s)+", PatternOptions);
    private static readonly Regex TitleRegex = new Regex(
        "<title[^>]*>(?<title>[\\s\\S]*?)</title[^>]*>", PatternOptions);
    // Reads the charset declared in the markup; accepts both
    // <meta charset="x"> and <meta ... content="text/html; charset=x">.
    private static readonly Regex MetaCharsetRegex = new Regex(
        "charset\\s*=\\s*[\"']?(?<cding>[\\w-]+)", RegexOptions.IgnoreCase);
    // A run of characters outside the Latin-1 range (for example Chinese text in a URL).
    private static readonly Regex NonLatin1Regex = new Regex("(?<h>[^\\x00-\\xff]+)");

    private Uri m_uri;                                   // address of the page (null when the URL was invalid)
    private List<Link> m_links = new List<Link>();       // links found on the page
    private string m_title = "";                         // page title
    private string m_html = "";                          // HTML source of the page
    private string m_outstr = "";                        // cached plain text of the page
    private bool m_outstrWithLink = true;                // whether m_outstr includes link text
    private bool m_good;                                 // whether the page could be loaded
    private int m_pagesize;                              // length of m_html, in characters
    private static readonly Dictionary<string, CookieContainer> webcookies =
        new Dictionary<string, CookieContainer>();       // cookie container per host
    private string m_post;                               // POST data for the login page
    private string m_loginurl;                           // login page of this page

    #endregion

    #region Private methods

    /// <summary>
    /// Parses the link information out of the page HTML (anchors first, then
    /// frame/iframe sources) and stores it in <c>m_links</c>.
    /// </summary>
    /// <returns>The list of links.</returns>
    private List<Link> getLinks()
    {
        if (m_links.Count == 0)
        {
            Regex[] regex = new Regex[] { AnchorRegex, FrameRegex };
            for (int i = 0; i < regex.Length; i++)
            {
                Match match = regex[i].Match(m_html);
                while (match.Success)
                {
                    try
                    {
                        // Resolve relative addresses against the page address.
                        string url = new Uri(m_uri, match.Groups["url"].Value).AbsoluteUri;
                        string text = "";
                        if (i == 0)
                        {
                            // Only anchors carry link text; frames do not.
                            text = LinkTextNoiseRegex.Replace(match.Groups["text"].Value, "");
                        }
                        m_links.Add(new Link(url, text));
                    }
                    catch (Exception ex)
                    {
                        // A malformed link is reported and skipped.
                        Console.WriteLine(ex.Message);
                    }
                    match = match.NextMatch();
                }
            }
        }
        return m_links;
    }

    /// <summary>
    /// Extracts the plain text from a piece of HTML and returns its first characters.
    /// The full text is cached in <c>m_outstr</c> and recomputed when the requested
    /// link mode differs from the cached one.
    /// </summary>
    /// <param name="instr">HTML source.</param>
    /// <param name="firstN">Maximum number of characters to return.</param>
    /// <param name="withLink">Whether the text inside links is kept.</param>
    /// <returns>The plain text, truncated to <paramref name="firstN"/> characters.</returns>
    private string getFirstNchar(string instr, int firstN, bool withLink)
    {
        if (m_outstr == "" || m_outstrWithLink != withLink)
        {
            string text = instr ?? "";
            text = ScriptRegex.Replace(text, "");
            text = StyleRegex.Replace(text, "");
            text = SelectRegex.Replace(text, "");
            if (!withLink)
            {
                text = AnchorElementRegex.Replace(text, "");
            }
            text = TagOrNbspRegex.Replace(text, "");
            text = WhitespaceRegex.Replace(text, " ");
            m_outstr = text;
            m_outstrWithLink = withLink;
        }
        return m_outstr.Length > firstN ? m_outstr.Substring(0, firstN) : m_outstr;
    }

    /// <summary>
    /// Returns the unsigned integer formed by the first four address bytes,
    /// most significant byte first. Callers pass IPv4 addresses only.
    /// </summary>
    /// <param name="x">IP address.</param>
    private uint getuintFromIP(IPAddress x)
    {
        byte[] bt = x.GetAddressBytes();
        // Shifts on uint avoid the signed overflow of "bt[0] * 256 * 256 * 256".
        return ((uint)bt[0] << 24) | ((uint)bt[1] << 16) | ((uint)bt[2] << 8) | bt[3];
    }

    /// <summary>Returns the first IPv4 address in the array, or null when there is none.</summary>
    private static IPAddress FirstIPv4(IPAddress[] addresses)
    {
        foreach (IPAddress address in addresses)
        {
            if (address.AddressFamily == System.Net.Sockets.AddressFamily.InterNetwork)
            {
                return address;
            }
        }
        return null;
    }

    /// <summary>
    /// Returns up to <paramref name="count"/> links whose URL (or text, when
    /// <paramref name="byText"/> is true) matches <paramref name="pattern"/>.
    /// </summary>
    private List<Link> FilterLinks(string pattern, int count, bool byText)
    {
        if (m_links.Count == 0)
        {
            getLinks();
        }
        List<Link> SpecialLinks = new List<Link>();
        if (m_links.Count == 0 || count <= 0)
        {
            return SpecialLinks;
        }
        // Built once instead of once per link.
        Regex filter = new Regex(pattern, PatternOptions);
        foreach (Link link in m_links)
        {
            if (SpecialLinks.Count >= count)
            {
                break;
            }
            if (filter.IsMatch(byText ? link.text : link.url))
            {
                SpecialLinks.Add(link);
            }
        }
        return SpecialLinks;
    }

    /// <summary>True when the string is null, empty or only white space.</summary>
    private static bool IsBlank(string value)
    {
        return value == null || value.Trim().Length == 0;
    }

    /// <summary>Percent-encodes one run of non-Latin-1 characters using GB2312.</summary>
    private static string EncodeNonLatin1(Match run)
    {
        return HttpUtility.UrlEncode(run.Value, Encoding.GetEncoding("GB2312"));
    }

    /// <summary>
    /// Unescapes the URL and percent-encodes every run of non-Latin-1 characters.
    /// A null URL is treated as an empty one.
    /// </summary>
    private static string NormalizeUrl(string _url)
    {
        if (_url == null)
        {
            return "";
        }
        try
        {
            _url = Uri.UnescapeDataString(_url);
        }
        catch (Exception)
        {
            // Unescaping is best effort; the URL is used as given when it fails.
        }
        return NonLatin1Regex.Replace(_url, new MatchEvaluator(EncodeNonLatin1));
    }

    /// <summary>Returns the cookie container to use for a host, registering it when needed.</summary>
    /// <param name="host">Host name of the page.</param>
    /// <param name="loginCookies">Cookies obtained from a login, or null to reuse/create the stored container.</param>
    private static CookieContainer GetCookieContainer(string host, CookieContainer loginCookies)
    {
        lock (webcookies)
        {
            if (loginCookies != null)
            {
                webcookies[host] = loginCookies;
                return loginCookies;
            }
            CookieContainer stored;
            if (!webcookies.TryGetValue(host, out stored))
            {
                stored = new CookieContainer();
                webcookies[host] = stored;
            }
            return stored;
        }
    }

    /// <summary>Returns whether a cookie container is already stored for the host.</summary>
    private static bool HasCookiesFor(string host)
    {
        lock (webcookies)
        {
            return webcookies.ContainsKey(host);
        }
    }

    /// <summary>Reads a stream to its end and returns the bytes.</summary>
    private static byte[] ReadAllBytes(Stream input)
    {
        using (MemoryStream buffer = new MemoryStream())
        {
            byte[] chunk = new byte[8192];
            int read;
            while ((read = input.Read(chunk, 0, chunk.Length)) > 0)
            {
                buffer.Write(chunk, 0, read);
            }
            return buffer.ToArray();
        }
    }

    /// <summary>Decodes bytes through a StreamReader so byte-order marks are honoured.</summary>
    private static string Decode(byte[] raw, Encoding encoding)
    {
        using (StreamReader reader = new StreamReader(new MemoryStream(raw), encoding))
        {
            return reader.ReadToEnd();
        }
    }

    /// <summary>Looks up an encoding by name and returns the fallback when the name is empty or unknown.</summary>
    private static Encoding ResolveEncoding(string name, Encoding fallback)
    {
        if (string.IsNullOrEmpty(name))
        {
            return fallback;
        }
        try
        {
            return Encoding.GetEncoding(name);
        }
        catch (ArgumentException)
        {
            return fallback;
        }
        catch (NotSupportedException)
        {
            return fallback;
        }
    }

    /// <summary>Returns the charset value of a Content-Type header, or "" when there is none.</summary>
    private static string GetHeaderCharset(string contentType)
    {
        int ix = contentType.IndexOf("charset=", StringComparison.OrdinalIgnoreCase);
        if (ix == -1)
        {
            return "";
        }
        string value = contentType.Substring(ix + "charset=".Length);
        int end = value.IndexOf(';');
        if (end != -1)
        {
            value = value.Substring(0, end);
        }
        return value.Trim(' ', '"', '\'');
    }

    /// <summary>
    /// Decodes the page bytes. The Content-Type charset wins; otherwise the charset
    /// declared in the markup is used, with the system default encoding as fallback.
    /// </summary>
    private static string DecodeHtml(byte[] raw, string contentType)
    {
        string headerCharset = GetHeaderCharset(contentType);
        if (headerCharset.Length > 0)
        {
            return Decode(raw, ResolveEncoding(headerCharset, Encoding.Default));
        }

        // No charset in the header: decode once with the default encoding just to
        // find the declaration in the markup, then decode the raw bytes again.
        string probe = Decode(raw, Encoding.Default);
        string declared = MetaCharsetRegex.Match(probe).Groups["cding"].Value;
        string decoded = Decode(raw, ResolveEncoding(declared, Encoding.Default));
        if (decoded.Split('?').Length > 100)
        {
            // Many '?' characters suggest the declared charset was wrong.
            decoded = probe;
        }
        return decoded;
    }

    /// <summary>
    /// Downloads the page and fills the fields. Any failure is written to the
    /// console and leaves the page flagged as not good; nothing is thrown.
    /// </summary>
    /// <param name="_url">Normalised page address.</param>
    /// <param name="loginCookies">Cookies from a login request, or null.</param>
    private void Fetch(string _url, CookieContainer loginCookies)
    {
        try
        {
            m_uri = new Uri(_url);
            m_good = true;
            if (_url.EndsWith(".rar", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".dat", StringComparison.OrdinalIgnoreCase)
                || _url.EndsWith(".msi", StringComparison.OrdinalIgnoreCase))
            {
                m_good = false;
                return;
            }

            HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri);
            rqst.AllowAutoRedirect = true;
            rqst.MaximumAutomaticRedirections = 3;
            rqst.UserAgent = UserAgent;
            rqst.KeepAlive = true;
            rqst.Timeout = 30000;
            rqst.CookieContainer = GetCookieContainer(m_uri.Host, loginCookies);

            using (HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse())
            {
                string contentType = rsps.ContentType ?? "";
                if (!contentType.StartsWith("text/", StringComparison.OrdinalIgnoreCase)
                    || rsps.ContentLength > MaxPageBytes)
                {
                    m_good = false;
                    return;
                }

                byte[] raw;
                using (Stream sm = rsps.GetResponseStream())
                {
                    raw = ReadAllBytes(sm);
                }
                m_html = DecodeHtml(raw, contentType);
                m_pagesize = m_html.Length;
                m_uri = rsps.ResponseUri;   // address after redirects
            }
        }
        catch (Exception ex)
        {
            // m_uri is still null when the URL itself could not be parsed.
            Console.WriteLine(ex.Message + (m_uri != null ? m_uri.ToString() : _url));
            m_good = false;
        }
    }

    #endregion

    #region Public methods

    /// <summary>
    /// Extracts the first characters of the page's plain text, including link text.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    public string getContext(int firstN)
    {
        return getFirstNchar(m_html, firstN, true);
    }

    /// <summary>
    /// Extracts the first characters of the page's plain text, excluding link text.
    /// </summary>
    /// <param name="firstN">Maximum number of characters.</param>
    public string getContextWithOutLink(int firstN)
    {
        return getFirstNchar(m_html, firstN, false);
    }

    /// <summary>
    /// Returns links of this page whose URL matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    public List<Link> getSpecialLinksByUrl(string pattern, int count)
    {
        return FilterLinks(pattern, count, false);
    }

    /// <summary>
    /// Returns links of this page whose text matches a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression.</param>
    /// <param name="count">Maximum number of links to return.</param>
    public List<Link> getSpecialLinksByText(string pattern, int count)
    {
        return FilterLinks(pattern, count, true);
    }

    /// <summary>
    /// Returns all links whose host resolves to an IPv4 address inside the given
    /// range (both ends inclusive). Hosts that cannot be resolved are skipped.
    /// </summary>
    /// <param name="_ip_start">First IP address of the range.</param>
    /// <param name="_ip_end">Last IP address of the range.</param>
    /// <exception cref="ArgumentException">A bound is not an IPv4 address.</exception>
    public List<Link> getSpecialLinksByIP(string _ip_start, string _ip_end)
    {
        IPAddress ip_start = IPAddress.Parse(_ip_start);
        IPAddress ip_end = IPAddress.Parse(_ip_end);
        if (ip_start.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork
            || ip_end.AddressFamily != System.Net.Sockets.AddressFamily.InterNetwork)
        {
            throw new ArgumentException("Only IPv4 ranges are supported.");
        }
        uint lowest = getuintFromIP(ip_start);
        uint highest = getuintFromIP(ip_end);

        if (m_links.Count == 0)
        {
            getLinks();
        }
        List<Link> SpecialLinks = new List<Link>();
        foreach (Link link in m_links)
        {
            IPAddress ip;
            try
            {
                ip = FirstIPv4(Dns.GetHostEntry(new Uri(link.url).Host).AddressList);
            }
            catch (Exception)
            {
                // Unresolvable hosts are deliberately skipped.
                continue;
            }
            if (ip == null)
            {
                continue;
            }
            uint value = getuintFromIP(ip);
            if (value >= lowest && value <= highest)
            {
                SpecialLinks.Add(link);
            }
        }
        return SpecialLinks;
    }

    /// <summary>
    /// Searches the page's plain text with a regular expression.
    /// </summary>
    /// <param name="pattern">Regular expression; its first capture group is the result.</param>
    /// <returns>The value of group 1 of the first match, or an empty string.</returns>
    public string getSpecialWords(string pattern)
    {
        Regex regex = new Regex(pattern, PatternOptions);
        Match mc = regex.Match(Context);
        if (mc.Success)
        {
            return mc.Groups[1].Value;
        }
        return string.Empty;
    }

    #endregion

    #region Constructors

    /// <summary>Downloads the page using the stored (or a new) cookie container of its host.</summary>
    private void Init(string _url)
    {
        Fetch(_url, null);
    }

    /// <summary>
    /// Loads a page. Failures do not throw; check <see cref="IsGood"/>.
    /// </summary>
    /// <param name="_url">Page address.</param>
    public WebPage(string _url)
    {
        Init(NormalizeUrl(_url));
    }

    /// <summary>
    /// Loads a page that requires a login. The POST data is sent to the login page
    /// first and the cookies received are used for the page request. When the login
    /// data is blank, cookies for the host already exist, or the login request
    /// fails, the page is loaded without logging in.
    /// </summary>
    /// <param name="_url">Page address.</param>
    /// <param name="_loginurl">Address of the login page.</param>
    /// <param name="_post">Form data to post to the login page.</param>
    public WebPage(string _url, string _loginurl, string _post)
    {
        _url = NormalizeUrl(_url);

        Uri target;
        if (IsBlank(_loginurl) || IsBlank(_post)
            || !Uri.TryCreate(_url, UriKind.Absolute, out target)
            || HasCookiesFor(target.Host))
        {
            Init(_url);
            return;
        }

        #region Login
        m_post = _post;
        m_loginurl = _loginurl;
        byte[] bytes = Encoding.Default.GetBytes(_post);
        CookieContainer myCookieContainer = new CookieContainer();
        try
        {
            HttpWebRequest myHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl);
            myHttpWebRequest.ContentType = "application/x-www-form-urlencoded";
            myHttpWebRequest.AllowAutoRedirect = false;
            myHttpWebRequest.UserAgent = UserAgent;
            myHttpWebRequest.Timeout = 60000;
            myHttpWebRequest.KeepAlive = true;
            myHttpWebRequest.ContentLength = bytes.Length;
            myHttpWebRequest.Method = "POST";
            // Cookies set by the login response are collected in this container.
            myHttpWebRequest.CookieContainer = myCookieContainer;
            using (Stream myRequestStream = myHttpWebRequest.GetRequestStream())
            {
                myRequestStream.Write(bytes, 0, bytes.Length);
            }
            using (HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse())
            {
                foreach (Cookie ck in myHttpWebResponse.Cookies)
                {
                    myCookieContainer.Add(ck);
                }
            }
        }
        catch (Exception ex)
        {
            // Login failed: report it and load the page without logging in.
            Console.WriteLine(ex.Message + _loginurl);
            Init(_url);
            return;
        }
        #endregion

        #region Request the page after login
        Fetch(_url, myCookieContainer);
        #endregion
    }

    #endregion

    #region Properties

    /// <summary>Address of this page, or an empty string when the URL was invalid. Read-only.</summary>
    public string URL
    {
        get { return m_uri != null ? m_uri.AbsoluteUri : string.Empty; }
    }

    /// <summary>Title of this page. Read-only.</summary>
    public string Title
    {
        get
        {
            if (m_title == "")
            {
                Match mc = TitleRegex.Match(m_html);
                if (mc.Success)
                {
                    m_title = mc.Groups["title"].Value.Trim();
                }
            }
            return m_title;
        }
    }

    /// <summary>All links of this page. Read-only.</summary>
    public List<Link> Links
    {
        get
        {
            if (m_links.Count == 0)
            {
                getLinks();
            }
            return m_links;
        }
    }

    /// <summary>The complete plain text of this page, including link text. Read-only.</summary>
    public string Context
    {
        get
        {
            // Makes sure the cached text is the with-links variant.
            getFirstNchar(m_html, 0, true);
            return m_outstr;
        }
    }

    /// <summary>Size of this page: the length of its HTML in characters.</summary>
    public int PageSize
    {
        get { return m_pagesize; }
    }

    /// <summary>All links of this page that point to the page's own host.</summary>
    public List<Link> InsiteLinks
    {
        get
        {
            if (m_uri == null)
            {
                return new List<Link>();
            }
            // The host is escaped so its dots are matched literally.
            return getSpecialLinksByUrl("^https?://" + Regex.Escape(m_uri.Host), Int16.MaxValue);
        }
    }

    /// <summary>Whether this page was loaded successfully.</summary>
    public bool IsGood
    {
        get { return m_good; }
    }

    /// <summary>Host of this page, or an empty string when the URL was invalid.</summary>
    public string Host
    {
        get { return m_uri != null ? m_uri.Host : string.Empty; }
    }

    /// <summary>POST data used for the login page of this page.</summary>
    public string PostStr
    {
        get { return m_post; }
    }

    /// <summary>Login page of this page.</summary>
    public string LoginURL
    {
        get { return m_loginurl; }
    }

    #endregion
}

/// <summary>
/// A link found on a page.
/// </summary>
public class Link
{
    public string url;   // address of the link
    public string text;  // text of the link

    /// <summary>Creates a link from its address and text.</summary>
    public Link(string _url, string _text)
    {
        url = _url;
        text = _text;
    }
}