书签分享收藏举报版权申诉 / 15

立即下载

当前位置：首页 > 教育专区 > 高考资料 > 2022年NET抓取和分析网页的类 .pdf

2022年NET抓取和分析网页的类 .pdf

上传人：Che****ry

文档编号：27266790

上传时间：2022-07-23

格式：PDF

页数：15

大小：90.32KB

( 4.5 )

《2022年NET抓取和分析网页的类 .pdf》由会员分享，可在线阅读，更多相关《2022年NET抓取和分析网页的类 .pdf（15页珍藏版）》请在淘文阁 - 分享文档赚钱的网站上搜索。

1、主要功能有：1、提取网页的纯文本，去除所有 HTML 标签和 JavaScript 代码；2、提取网页的链接，包括 href、frame 及 iframe；3、提取网页的 title 等（其它的标签可依此类推，正则是一样的）；4、可以实现简单的表单提交及 cookie 保存。第一部分：using System; using System.Data; using System.Configuration; using System.Net; using System.IO; using System.Text; using System.Collections.Generic; using System.Te

2、xt.RegularExpressions; using System.Threading; using System.Web; / / 网页类/ public class WebPage #region 私有成员 private Uri m_uri; /网址 private List m_links; /此网页上的链接 private string m_title; /此网页的标题 private string m_html; /此网页的 HTML代码 private string m_outstr; /此网页可输出的纯文本 private bool m_good; /此网页是否可用 pri

3、vate int m_pagesize; /此网页的大小 private static Dictionary webcookies = new Dictionary();/存放所有网页的Cookie private string m_post; /此网页的登陆页需要的POST数据 private string m_loginurl; /此网页的登陆页 #endregion #region 私有方法 / / 这私有方法从网页的HTML 代码中分析出链接信息 / 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - -

5、xOptions.IgnoreCase); for (int i = 0; i 2; i+) Match match = regexi.Match(m_html); while (match.Success) try string url = new Uri(m_uri, match.Groupsurl.Value).AbsoluteUri; string text = ; if (i = 0) text = new Regex(+)|(s)|( )|&|, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replace(match

6、.Groupstext.Value, ); Link link = new Link(url, text); m_links.Add(link); catch(Exception ex)Console.WriteLine(ex.Message); ; match = match.NextMatch(); return m_links; / / 此私有方法从一段HTML文本中提取出一定字数的纯文本 / / HTML代码 / 提取从头数多少个字 / 是否要链接里面的字 / 纯文本 private string getFirstNchar(string instr, int firstN, bool

7、 withLink) 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 2 页，共 15 页 - - - - - - - - - if (m_outstr = ) m_outstr = instr.Clone() as string; m_outstr = new Regex(?m)*(w|W)*?*, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, ); m_outstr = new Regex

8、(?m)*(w|W)*?*, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, ); m_outstr = new Regex(?m)*(w|W)*?*, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Replace(m_outstr, ); if (!withLink) m_outstr = new Regex(?m)*(w|W)*?*, RegexOptions.Multiline | RegexOptions.IgnoreCase).Replac

9、e(m_outstr, ); Regex objReg = new System.Text.RegularExpressions.Regex(+?)| , RegexOptions.Multiline | RegexOptions.IgnoreCase); m_outstr = objReg.Replace(m_outstr, ); Regex objReg2 = new System.Text.RegularExpressions.Regex(s )+, RegexOptions.Multiline | RegexOptions.IgnoreCase); m_outstr = ob

10、jReg2.Replace(m_outstr, ); return m_outstr.Length firstN ? m_outstr.Substring(0, firstN) : m_outstr; / / 此私有方法返回一个IP 地址对应的无符号整数 / / IP地址 / private uint getuintFromIP(IPAddress x) Byte bt = x.GetAddressBytes(); uint i = (uint)(bt0 * 256 * 256 * 256); i += (uint)(bt1 * 256 * 256); i += (uint)(bt2 * 25

11、6); i += (uint)(bt3); return i; #endregion #region 公有文法 / / 此公有方法提取网页中一定字数的纯文本，包括链接文字 / / 字数名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 3 页，共 15 页 - - - - - - - - - / public string getContext(int firstN) return getFirstNchar(m_html, firstN, true); / / 此公有方法提取网页中一

12、定字数的纯文本，不包括链接文字 / / / public string getContextWithOutLink(int firstN) return getFirstNchar(m_html, firstN, false); / / 此公有方法从本网页的链接中提取一定数量的链接，该链接的URL满足某正则式 / / 正则式 / 返回的链接的个数 / List public List getSpecialLinksByUrl(string pattern,int count) if(m_links.Count=0)getLinks(); List SpecialLinks = new List

13、(); List.Enumerator i; i = m_links.GetEnumerator(); int cnt = 0; while (i.MoveNext() & cntcount) if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Match(i.Current.url).Success) SpecialLinks.Add(i.Current); cnt+; return SpecialLinks; / / 此公有方法从本网页的链接中提取一定数量的链接，该链接的文字满足某正则式名师资料

14、总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 4 页，共 15 页 - - - - - - - - - / / 正则式 / 返回的链接的个数 / List public List getSpecialLinksByText(string pattern,int count) if (m_links.Count = 0) getLinks(); List SpecialLinks = new List(); List.Enumerator i; i = m_links.GetEnumerat

15、or(); int cnt = 0; while (i.MoveNext() & cnt count) if (new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ).Match(i.Current.text).Success) SpecialLinks.Add(i.Current); cnt+; return SpecialLinks; / / 此公有方法获得所有链接中在一定IP 范围的链接 / / 起始 IP / 终止 IP / public List getSpecialLinksByIP(string

16、_ip_start, string _ip_end) IPAddress ip_start = IPAddress.Parse(_ip_start); IPAddress ip_end = IPAddress.Parse(_ip_end); if (m_links.Count = 0) getLinks(); List SpecialLinks = new List(); List.Enumerator i; i = m_links.GetEnumerator(); while (i.MoveNext() IPAddress ip; try ip = Dns.GetHostEntry(new

17、Uri(i.Current.url).Host).AddressList0; 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 5 页，共 15 页 - - - - - - - - - catch continue; if(getuintFromIP(ip)=getuintFromIP(ip_start) & getuintFromIP(ip)=getuintFromIP(ip_end) SpecialLinks.Add(i.Current); return SpecialLinks;

18、 / / 这公有方法提取本网页的纯文本中满足某正则式的文字 / / 正则式 / 返回文字 public string getSpecialWords(string pattern) if (m_outstr = ) getContext(Int16.MaxValue); Regex regex = new Regex(pattern, RegexOptions.Multiline | RegexOptions.IgnoreCase ); Match mc=regex.Match(m_outstr); if (mc.Success) return mc.Groups1.Value; return

19、 string.Empty; #endregion #region 构造函数 private void Init(string _url) try m_uri = new Uri(_url); m_links = new List(); m_html = ; m_outstr = ; m_title = ; m_good = true; if (_url.EndsWith(.rar) | _url.EndsWith(.dat) | _url.EndsWith(.msi) m_good = false; 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - -

20、 - - - - - 名师精心整理 - - - - - - - 第 6 页，共 15 页 - - - - - - - - - return; HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri); rqst.AllowAutoRedirect = true; rqst.MaximumAutomaticRedirections = 3; rqst.UserAgent = Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0); rqst.KeepAlive = true; rq

21、st.Timeout = 30000; lock (WebPage.webcookies) if (WebPage.webcookies.ContainsKey(m_uri.Host) rqst.CookieContainer = WebPage.webcookiesm_uri.Host; else CookieContainer cc = new CookieContainer(); WebPage.webcookiesm_uri.Host = cc; rqst.CookieContainer = cc; HttpWebResponse rsps = (HttpWebResponse)rqs

22、t.GetResponse(); Stream sm = rsps.GetResponseStream(); if (!rsps.ContentType.ToLower().StartsWith(text/) | rsps.ContentLength 1 22) rsps.Close(); m_good = false; return; Encoding cding = System.Text.Encoding.Default; string contenttype=rsps.ContentType.ToLower(); int ix = contenttype.IndexOf(charset

23、=); if (ix != -1) try cding = System.Text.Encoding.GetEncoding(rsps.ContentType.Substring(ix + charset.Length + 1); catch cding = Encoding.Default; 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 7 页，共 15 页 - - - - - - - - - m_html = new StreamReader(sm, cding).ReadTo

24、End(); else m_html = new StreamReader(sm, cding).ReadToEnd(); Regex regex = new Regex(charset=(?=+)?,RegexOptions.IgnoreCase); string strcding = regex.Match(m_html).Groupscding.Value; try cding = Encoding.GetEncoding(strcding); catch cding = Encoding.Default; byte bytes=Encoding.Default.GetBytes(m_h

25、tml.ToCharArray(); m_html = cding.GetString(bytes); if (m_html.Split(?).Length 100) m_html=Encoding.Default.GetString(bytes); m_pagesize = m_html.Length; m_uri = rsps.ResponseUri; rsps.Close(); catch (Exception ex) Console.WriteLine(ex.Message+m_uri.ToString(); m_good = false; 第二部分public WebPage(str

26、ing _url) 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 8 页，共 15 页 - - - - - - - - - string uurl = ; try uurl = Uri.UnescapeDataString(_url); _url = uurl; catch ; Regex re = new Regex(?x00-xff+); Match mc = re.Match(_url); if (mc.Success) string han = mc.Groupsh.Val

27、ue; _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding(GB2312); Init(_url); public WebPage(string _url, string _loginurl, string _post) string uurl = ; try uurl = Uri.UnescapeDataString(_url); _url = uurl; catch ; Regex re = new Regex(?x00-xff+); Match mc = re.Match(

28、_url); if (mc.Success) string han = mc.Groupsh.Value; _url = _url.Replace(han, System.Web.HttpUtility.UrlEncode(han, Encoding.GetEncoding(GB2312); if (_loginurl.Trim() = | _post.Trim() = | WebPage.webcookies.ContainsKey(new Uri(_url).Host) Init(_url); else #region 登陆 string indata = _post; 名师资料总结 -

29、- -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 9 页，共 15 页 - - - - - - - - - m_post = _post; m_loginurl = _loginurl; byte bytes = Encoding.Default.GetBytes(_post); CookieContainer myCookieContainer = new CookieContainer(); try /新建一个CookieContainer来存放 Cookie 集合 HttpWebRequest m

30、yHttpWebRequest = (HttpWebRequest)WebRequest.Create(_loginurl); /新建一个HttpWebRequest myHttpWebRequest.ContentType = application/x-www-form-urlencoded; myHttpWebRequest.AllowAutoRedirect = false; myHttpWebRequest.UserAgent = Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0); myHttpWebRequest.Timeout

31、 = 60000; myHttpWebRequest.KeepAlive = true; myHttpWebRequest.ContentLength = bytes.Length; myHttpWebRequest.Method = POST; myHttpWebRequest.CookieContainer = myCookieContainer; /设置 HttpWebRequest 的 CookieContainer为刚才建立的那个myCookieContainer Stream myRequestStream = myHttpWebRequest.GetRequestStream()

32、; myRequestStream.Write(bytes, 0, bytes.Length); myRequestStream.Close(); HttpWebResponse myHttpWebResponse = (HttpWebResponse)myHttpWebRequest.GetResponse(); foreach (Cookie ck in myHttpWebResponse.Cookies) myCookieContainer.Add(ck); myHttpWebResponse.Close(); catch Init(_url); return; #endregion 名

33、师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 10 页，共 15 页 - - - - - - - - - #region 登陆后再访问页面 try m_uri = new Uri(_url); m_links = new List(); m_html = ; m_outstr = ; m_title = ; m_good = true; if (_url.EndsWith(.rar) | _url.EndsWith(.dat) | _url.EndsWith(.msi) m_good

34、 = false; return; HttpWebRequest rqst = (HttpWebRequest)WebRequest.Create(m_uri); rqst.AllowAutoRedirect = true; rqst.MaximumAutomaticRedirections = 3; rqst.UserAgent = Mozilla/4.0 (compatible; MSIE 5.01; Windows NT 5.0); rqst.KeepAlive = true; rqst.Timeout = 30000; rqst.CookieContainer = myCookieCo

35、ntainer; lock (WebPage.webcookies) WebPage.webcookiesm_uri.Host = myCookieContainer; HttpWebResponse rsps = (HttpWebResponse)rqst.GetResponse(); Stream sm = rsps.GetResponseStream(); if (!rsps.ContentType.ToLower().StartsWith(text/) | rsps.ContentLength 1 22) rsps.Close(); m_good = false; return; En

36、coding cding = System.Text.Encoding.Default; int ix = rsps.ContentType.ToLower().IndexOf(charset=); if (ix != -1) try cding = System.Text.Encoding.GetEncoding(rsps.ContentTy名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 11 页，共 15 页 - - - - - - - - - pe.Substring(ix +

37、 charset.Length + 1); catch cding = Encoding.Default; m_html = new StreamReader(sm, cding).ReadToEnd(); m_pagesize = m_html.Length; m_uri = rsps.ResponseUri; rsps.Close(); catch (Exception ex) Console.WriteLine(ex.Message+m_uri.ToString(); m_good = false; #endregion #endregion #region 属性 / / 通过此属性可获

38、得本网页的网址，只读 / public string URL get return m_uri.AbsoluteUri; / / 通过此属性可获得本网页的标题，只读名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 12 页，共 15 页 - - - - - - - - - / public string Title get if (m_title = ) Regex reg = new Regex(?m)*(?(?:w|W)*?)*, RegexOptions.Multiline |

39、RegexOptions.IgnoreCase ); Match mc = reg.Match(m_html); if (mc.Success) m_title= mc.Groupstitle.Value.Trim(); return m_title; / / 此属性获得本网页的所有链接信息，只读 / public List Links get if (m_links.Count = 0) getLinks(); return m_links; / / 此属性返回本网页的全部纯文本信息，只读 / public string Context get if (m_outstr = ) getCon

40、text(Int16.MaxValue); return m_outstr; / / 此属性获得本网页的大小名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 13 页，共 15 页 - - - - - - - - - / public int PageSize get return m_pagesize; / / 此属性获得本网页的所有站内链接 / public List InsiteLinks get return getSpecialLinksByUrl(http:/+m_uri.

41、Host,Int16.MaxValue); / / 此属性表示本网页是否可用 / public bool IsGood get return m_good; / / 此属性表示网页的所在的网站 / public string Host get return m_uri.Host; / / 此网页的登陆页所需的POST数据 / public string PostStr 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 14 页，共 15 页 - - - - - - - - - get

42、return m_post; / / 此网页的登陆页 / public string LoginURL get return m_loginurl; #endregion / / 链接类/ public class Link public string url; /链接网址 public string text; /链接文字 public Link(string _url, string _text) url = _url; text = _text; 名师资料总结 - - -精品资料欢迎下载 - - - - - - - - - - - - - - - - - - 名师精心整理 - - - - - - - 第 15 页，共 15 页 - - - - - - - - -

文档加载中……请稍候！
如果长时间未打开，您也可以点击刷新试试。

下载文档到电脑，查找使用更方便

4.3 金币

版权申诉 word格式文档无特别注明外均可编辑修改；预览文档经过压缩，下载后原文更清晰！ 立即下载

配套讲稿：如PPT文件的首页显示word图标，表示该PPT已包含配套word讲稿。双击word图标可打开word文档。
特殊限制：部分文档作品中含有的国旗、国徽等图片，仅作为作品整体效果示例展示，禁止商用。设计者仅对作品中独创性部分享有著作权。
关键词：2022年NET抓取和分析网页的类 2022 NET 抓取分析网页

淘文阁 - 分享文档赚钱的网站所有资源均是用户自行上传分享，仅供网友学习交流，未经上传用户书面授权，请勿作他用。

限制150内

关于本文

本文标题：2022年NET抓取和分析网页的类 .pdf
链接地址：https://www.taowenge.com/p-27266790.html