怎么样通过c#抓取网页分析?
怎么样通过c#抓取网页分析?
c#抓取网页分析 目的: 抓取网页,分析网页内容,进行处理获取信息。 例子: 抓km169上的adsl用户的费用信息,分析存储到本地数据库。 步骤:1、抓取。2、分析。3、存储。 |
|
||
1抓取
/// <summary>
/// Sends <paramref name="postData"/> to <paramref name="url"/> as a form-encoded
/// HTTP POST and returns the response body as text.
/// </summary>
/// <param name="url">Address of the page to request.</param>
/// <param name="postData">Request body; encoded with <c>Encoding.Default</c> before sending.</param>
/// <param name="err">Empty on success; a description of the failure otherwise.</param>
/// <returns>The response text, or an empty string when the request fails.</returns>
public string GetPage(string url, string postData, out string err)
{
    err = string.Empty;

    // The same encoding is used for the request body and for decoding the response.
    Encoding encoding = Encoding.Default;
    byte[] data = encoding.GetBytes(postData);

    try
    {
        // WebRequest.Create returns a non-HTTP request type for other schemes,
        // in which case the "as" cast yields null.
        HttpWebRequest request = WebRequest.Create(url) as HttpWebRequest;
        if (request == null)
        {
            err = "URL is not an HTTP/HTTPS address: " + url;
            return string.Empty;
        }

        request.CookieContainer = new CookieContainer();
        request.AllowAutoRedirect = true;
        request.Method = "POST";
        request.ContentType = "application/x-www-form-urlencoded";
        request.ContentLength = data.Length;

        // Write the body; the using block closes the stream even if Write throws.
        using (Stream outstream = request.GetRequestStream())
        {
            outstream.Write(data, 0, data.Length);
        }

        // Disposing the response, its stream and the reader releases the underlying
        // connection; leaving them open exhausts the per-host connection limit.
        using (HttpWebResponse response = (HttpWebResponse)request.GetResponse())
        using (Stream instream = response.GetResponseStream())
        using (StreamReader sr = new StreamReader(instream, encoding))
        {
            // The resulting page (HTML) source.
            return sr.ReadToEnd();
        }
    }
    catch (Exception ex)
    {
        err = ex.Message;
        return string.Empty;
    }
}
[[i] Last edited by 王暴徒 on 2006-2-13 at 13:49 [/i]] |
|
||
2、分析
/// <summary>
/// Fetches the page at KMADSLURL via GetPage and saves one Rec for every
/// <c>table_det("...", "...", ...);</c> call found in the returned text.
/// </summary>
/// <returns>Always null; each match is persisted through <c>Rec.Save()</c>.</returns>
public string Get()
{
    string str = GetPage(KMADSLURL, strReq, out err);

    // One argument of the table_det(...) call: optional newlines and whitespace,
    // then a double-quoted value captured as a group. [^"]* stops at the closing
    // quote, which is what separates one field from the next.
    const string Field = "\\n*\\s*\"([^\"]*)\"";

    // The call carries eleven comma-separated quoted arguments.
    const int FieldCount = 11;

    string pattern = "table_det\\(";
    for (int i = 0; i < FieldCount; i++)
    {
        if (i > 0)
        {
            pattern += ",";
        }
        pattern += Field;
    }
    pattern += "\\);";

    Regex rgx = new Regex(pattern, RegexOptions.Singleline);
    foreach (Match m in rgx.Matches(str))
    {
        Rec r = new Rec();
        r.str1 = m.Groups[1].Value;
        r.Save();
    }

    return null;
}
此处的关键在于正则表达式,利用匹配关系获得一条条记录,再用%1~%9分组,得到每个字段的内容,最后生成相应的记录即可(拼sql也可),这里用了个持久化的咚咚,下次详细说。 正则技巧:用^(间隔符号)来划分字段,:)不大好解释,大家自己体会下吧。 [[i] Last edited by 王暴徒 on 2006-2-13 at 13:58 [/i]] |
|
||||||||||||||||||||||||
我以前写了个多线程批量下载歌曲的程序,当时程序考虑的是挂接百度,同时又预留了扩展性,比如通过配置也可以获取雅虎的歌曲,这就要考虑到各网站网页的编码方式。和暴徒的一样,我也是用了HttpWebResponse类。通过对各种编码的网页在二进制下面的观察,发现前2个字节不同,所以转换成string的时候需要特殊处理,否则中文有乱码。
|
|
||
namespace Mp3Crazy
{
    using System;

    /// <summary>
    /// Records the byte position of a download: the request/response URLs, the
    /// local file name, the attachment name, and the offset and length of one chunk.
    /// </summary>
    public class DownLoadState
    {
        private string _FileName;
        private string _AttachmentName;
        private int _Position;
        private string _RequestURL;
        private string _ResponseURL;
        private int _Length;
        private byte[] _Data;
        private ThreadCallbackHandler _ThreadCallback;
        private HttpWebClient _hwc;

        /// <summary>Creates a state describing a chunk, without data or callback.</summary>
        internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length)
        {
            this._RequestURL = RequestURL;
            this._ResponseURL = ResponseURL;
            this._FileName = FileName;
            this._AttachmentName = AttachmentName;
            this._Position = Position;
            this._Length = Length;
        }

        /// <summary>Creates a state that also carries the bytes in <paramref name="Data"/>.</summary>
        internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, byte[] Data)
            : this(RequestURL, ResponseURL, FileName, AttachmentName, Position, Length)
        {
            this._Data = Data;
        }

        /// <summary>
        /// Creates a state whose chunk is downloaded by <paramref name="tch"/> when
        /// <see cref="StartDownloadFileChunk"/> is called.
        /// </summary>
        internal DownLoadState(string RequestURL, string ResponseURL, string FileName, string AttachmentName, int Position, int Length, ThreadCallbackHandler tch)
            : this(RequestURL, ResponseURL, FileName, AttachmentName, Position, Length)
        {
            this._ThreadCallback = tch;
        }

        public string FileName
        {
            get { return _FileName; }
        }

        public int Position
        {
            get { return _Position; }
        }

        public int Length
        {
            get { return _Length; }
        }

        public string AttachmentName
        {
            get { return _AttachmentName; }
        }

        public string RequestURL
        {
            get { return _RequestURL; }
        }

        public string ResponseURL
        {
            get { return _ResponseURL; }
        }

        public byte[] Data
        {
            get { return _Data; }
        }

        /// <summary>The client notified through OnThreadProcess once the chunk callback returns.</summary>
        public HttpWebClient httpWebClient
        {
            get { return this._hwc; }
            set { this._hwc = value; }
        }

        /// <summary>
        /// Invokes the chunk callback with the request URL, file name, position and
        /// length, then notifies the owning client. Does nothing when no callback
        /// was supplied.
        /// </summary>
        internal void StartDownloadFileChunk()
        {
            if (this._ThreadCallback == null)
            {
                return;
            }

            this._ThreadCallback(this._RequestURL, this._FileName, this._Position, this._Length);

            // httpWebClient is assigned separately from construction; guard so a
            // missing assignment does not throw after the chunk has been processed.
            if (this._hwc != null)
            {
                this._hwc.OnThreadProcess("");
            }
        }
    }
}
|
|
||
/*
 * .Net/C#: utility class implementing multi-threaded download with resume support.
 * Started from a Reflector decompile of System.Net.WebClient, with several members rewritten or added:
 * the DownLoad and Upload related methods;
 * added the DataReceive and ExceptionOccurrs events.
 */
namespace Mp3Crazy
{
    using System;
    using System.IO;
    using System.Net;
    using System.Text;
    using System.Security;
    using System.Threading;
    using System.Collections.Specialized;

    // Delegate whose signature matches the method executed by a worker thread.
    public delegate void ThreadCallbackHandler(string S, string s, int I, int i);

    // Actions that can be taken when handling an exception.
    public enum ExceptionActions
    {
        Throw,
        CancelAll,
        Ignore,
        Retry
    }

    /// <summary>
    /// Class supporting multi-threaded download with resume.
    /// </summary>
    public class HttpWebClient
    {
        public delegate void ExceptionEventHandler(HttpWebClient Sender, ExceptionEventArgs e);
        public event ExceptionEventHandler ExceptionOccurrs; // raised when an exception occurs
        public delegate void ThreadProcessEventHandler(HttpWebClient Sender, ThreadProcessEventArgs e);
        public event ThreadProcessEventHandler ThreadProcessEnd; // raised when multi-threaded processing has finished
        private int _FileLength,_getLength; // total size of the file being downloaded
        // Assigned to HttpWebRequest.Timeout by DownloadFile.
        public int TimeOut=20000;
        public int SongID=0;
        public bool UrlParsed;
        public string FileName;
        public bool Free=true;
        public int RetryTimes;
        public int TBlocks=1,curBlock;
        public int FileLength { get { return _FileLength; } }
        public int GetLength { get { return _getLength; } }
[[i] Last edited by Timothy on 2006-2-17 at 09:48 [/i]] |
|
||
/// <summary>
/// Downloads a file in chunks. The calling thread saves the first chunk from the
/// initial response; every further chunk is fetched on its own thread through
/// <see cref="DownloadFileChunk"/>.
/// </summary>
/// <param name="Address">URL address.</param>
/// <param name="FileName">Local path and file name to save to.</param>
/// <param name="ChunksCount">Number of chunks, i.e. number of threads.</param>
public void DownloadFile(string Address, string FileName, int ChunksCount)
{
    int p = 0; // position
    int s = 0; // chunk size
    _getLength = 0;
    string a = null; // attachment name
    HttpWebRequest hwrq;
    HttpWebResponse hwrp = null;
    try
    {
        hwrq = (HttpWebRequest) WebRequest.Create(this.GetUri(Address));
        hwrq.Timeout = TimeOut;
        // Credentials only take effect if assigned before the request is sent.
        hwrq.Credentials = this.m_credentials;
        hwrp = (HttpWebResponse) hwrq.GetResponse();

        // Clamp an unknown (-1) or oversized length to Int32.MaxValue (0x7FFFFFFF).
        long L = hwrp.ContentLength;
        L = ((L == -1) || (L > 0x7fffffff)) ? ((long) 0x7fffffff) : L;
        int l = (int) L;
        this._FileLength = l;

        // Range support is assumed; the Accept-Ranges header check is disabled.
        bool b = true;

        a = AttachmentNameOrDefault(hwrp.Headers["Content-Disposition"], FileName);

        int ss = s;
        if (b)
        {
            s = l / ChunksCount;
            if (s < 2 * 64 * 1024) // chunk size is at least 128 KB
            {
                s = 2 * 64 * 1024;
            }
            ss = s;

            int i = 0;
            while (l >= s)
            {
                l -= s;
                if (l < s)
                {
                    // Fold the remainder into the last chunk.
                    s += l;
                }
                // Chunk 0 is handled below with the response that is already open.
                if (i++ > 0)
                {
                    DownLoadState x = new DownLoadState(Address, hwrp.ResponseUri.AbsolutePath, FileName, a, p, s, new ThreadCallbackHandler(this.DownloadFileChunk));
                    x.httpWebClient = this;
                    // Multi-threaded download.
                    Thread t = new Thread(new ThreadStart(x.StartDownloadFileChunk));
                    t.Start();
                }
                p += s;
            }

            // NOTE(review): when the file is smaller than two chunks the remainder is
            // folded into s without starting a thread, yet s is reset to the first
            // chunk size here - confirm ResponseAsBytes still saves the whole file.
            s = ss;
            this.ResponseAsBytes(Address, hwrp, s, FileName);
            this.OnThreadProcess("");
        }
    }
    catch (Exception e)
    {
        ExceptionEventHandler handler = this.ExceptionOccurrs;
        if (handler != null)
        {
            string path = "";
            if (hwrp != null)
            {
                path = hwrp.ResponseUri.AbsolutePath;
            }
            DownLoadState x = new DownLoadState(Address, path, FileName, a, p, s);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            handler(this, eea);
        }
    }
}

/// <summary>
/// Raises <c>ThreadProcessEnd</c> with the given id, if anyone is subscribed.
/// </summary>
/// <param name="id">Value passed to the ThreadProcessEventArgs constructor.</param>
internal void OnThreadProcess(string id)
{
    // Copy to a local first: this method is called from worker threads, and a
    // subscriber could detach between the null check and the invocation.
    ThreadProcessEventHandler handler = ThreadProcessEnd;
    if (handler != null)
    {
        ThreadProcessEventArgs tpea = new ThreadProcessEventArgs(id);
        handler(this, tpea);
    }
}

/// <summary>
/// Downloads one chunk of a file; can be used to build custom multi-threaded,
/// resumable downloads.
/// </summary>
/// <param name="Address">URL address.</param>
/// <param name="FileName">Local path and file name to save to.</param>
/// <param name="FromPosition">Byte offset sent as the start of the HTTP range.</param>
/// <param name="Length">Chunk size.</param>
public void DownloadFileChunk(string Address, string FileName, int FromPosition, int Length)
{
    HttpWebResponse hwrp = null;
    string a = null; // attachment name
    try
    {
        HttpWebRequest hwrq = (HttpWebRequest) WebRequest.Create(this.GetUri(Address));
        hwrq.AddRange(FromPosition);
        hwrp = (HttpWebResponse) hwrq.GetResponse();
        a = AttachmentNameOrDefault(hwrp.Headers["Content-Disposition"], FileName);
        this.ResponseAsBytes(Address, hwrp, Length, FileName);
    }
    catch (Exception e)
    {
        ExceptionEventHandler handler = this.ExceptionOccurrs;
        if (handler != null)
        {
            // hwrp is still null when creating or sending the request failed.
            string path = "";
            if (hwrp != null)
            {
                path = hwrp.ResponseUri.AbsolutePath;
            }
            DownLoadState x = new DownLoadState(Address, path, FileName, a, FromPosition, Length);
            ExceptionEventArgs eea = new ExceptionEventArgs(e, x);
            handler(this, eea);
        }
    }
}

// Returns the text after the last "filename=" in a Content-Disposition header value,
// or fallback when the header is absent or does not contain that marker.
private static string AttachmentNameOrDefault(string contentDisposition, string fallback)
{
    const string Marker = "filename=";
    if (contentDisposition == null)
    {
        return fallback;
    }
    int idx = contentDisposition.LastIndexOf(Marker);
    if (idx < 0)
    {
        return fallback;
    }
    return contentDisposition.Substring(idx + Marker.Length);
}

internal void ResponseAsBytes(string RequestURL, WebResponse Response, long Length, string FileName) { string a = null; //AttachmentName int P = 0; //整个文件的位置指针 int num2 = 0; try { a = Response.Headers["Content-Disposition"]; //attachment if (a != null) { a = a.Substring(a.LastIndexOf("filename=") + 9); } int p = 0; //本块的位置指针 int num1=(int)Length; byte[] buffer1 = new byte[30000]; string s = Response.Headers["Content-Range"]; if (s != null) { s = s.Replace("bytes ", ""); s = s.Substring(0, s.IndexOf("-")); P = Convert.ToInt32(s); } Stream S = Response.GetResponseStream(); System.IO.FileStream sw = new System.IO.FileStream(FileName, System.IO.FileMode.OpenOrCreate, System.IO.FileAccess.ReadWrite, System.IO.FileShare.ReadWrite); //Console.WriteLine("P:{0}",P); do { num2 = S.Read(buffer1, 0,30000); if (num2 > 0) { sw.Position = P; &nb |