日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程资源 > 综合教程 >内容正文

综合教程

网站数据采集程序(爬虫)

發布時間:2024/4/24 综合教程 33 生活家
生活随笔 收集整理的這篇文章主要介紹了 网站数据采集程序(爬虫) 小編覺得挺不錯的,現在分享給大家,幫大家做個參考.

采集數據無非就是三步,抓取頁面,分析數據,入庫。

第一步:抓取頁面

抓取頁面也是在網上找的例子,主要是用到了2個方法

1,獲取網站內容;2,清除html標簽。具體看代碼:

/// <summary>
        /// Downloads the page at <paramref name="url"/> and returns its body as text.
        /// </summary>
        /// <param name="url">Address of the page to download.</param>
        /// <returns>The response body, or an empty string when the download fails for any reason.</returns>
        public string GetContentUrl(string url)
        {
            try
            {
                // Fixed pause before every request.
                System.Threading.Thread.Sleep(500);
                HttpWebRequest req = (HttpWebRequest)WebRequest.Create(url);
                // Send a browser-like User-Agent so anti-scraping filters do not reject the request.
                req.UserAgent = "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.0; .NET CLR 1.1.4322; .NET CLR 2.0.50215; CrazyCoder.cn;www.aligong.com)";
                req.ReadWriteTimeout = 30000;
                req.Timeout = 300000;
                req.Proxy = null;
                // Both the response and the reader are released even when reading throws;
                // previously the response stayed open if the reader failed.
                using (HttpWebResponse response = (HttpWebResponse)req.GetResponse())
                using (StreamReader sr = new StreamReader(response.GetResponseStream()))
                {
                    return sr.ReadToEnd();
                }
            }
            catch (Exception ex)
            {
                // Callers treat an empty string as "page unavailable", so keep that contract,
                // but leave a trace instead of hiding the failure completely.
                System.Diagnostics.Trace.TraceWarning("GetContentUrl failed for {0}: {1}", url, ex.Message);
                return string.Empty;
            }
        }

View Code

 /// <summary>
        /// Removes HTML tags and <c>&amp;nbsp;</c> entities from a fragment.
        /// </summary>
        /// <param name="ContentStr">HTML fragment; <c>null</c> is treated as empty.</param>
        /// <returns>The remaining text with leading and trailing whitespace trimmed.</returns>
        public string ClearLable(string ContentStr)
        {
            if (string.IsNullOrEmpty(ContentStr))
            {
                return string.Empty;
            }

            System.Text.StringBuilder text = new System.Text.StringBuilder(ContentStr.Length);
            int pos = 0;
            while (pos < ContentStr.Length)
            {
                int open = ContentStr.IndexOf('<', pos);
                // Pair every '<' with the first '>' AFTER it. Pairing it with the first '>' anywhere
                // in the string made Substring/Replace throw for text such as "1 > 0 <b>x</b>".
                int close = open < 0 ? -1 : ContentStr.IndexOf('>', open + 1);
                if (close < 0)
                {
                    break; // no complete tag left
                }
                text.Append(ContentStr, pos, open - pos);
                pos = close + 1;
            }
            // Copy whatever follows the last complete tag.
            text.Append(ContentStr, pos, ContentStr.Length - pos);
            return text.Replace("&nbsp;", "").ToString().Trim();
        }

View Code

第二步:分析數據

先從頁面html中截取目標代碼段,再用正則表達式取得所有匹配項,放入自己的集合里逐一分析。

 /// <summary>
        /// Downloads a listing page and returns the <c>&lt;li&gt;</c> fragments found in it.
        /// </summary>
        /// <param name="url">Address of the listing page.</param>
        /// <returns>
        /// The fragments produced by <c>DealHtmlContentList</c>; an empty list (never <c>null</c>)
        /// when the page content is empty, so callers can iterate the result without a null check.
        /// </returns>
        public List<String> GetListURl(string url)
        {
            string htmlContent = GetContentUrl(url);
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                return new List<String>();
            }
            return DealHtmlContentList(htmlContent);
        }
/// <summary>
        /// Extracts every <c>&lt;li&gt;…&lt;/li&gt;</c> fragment from the <c>house-lst</c> list of a listing page.
        /// </summary>
        /// <param name="htmlContent">HTML of the listing page.</param>
        /// <returns>The matched fragments in document order; empty when the list is not present.</returns>
        private List<String> DealHtmlContentList(string htmlContent)
        {
            List<string> listStr = new List<string>();
            string sLi = "<ul id=\"house-lst\" class=\"house-lst\">"; // opening tag of the listing block
            string eLi = "</ul>";

            // IndexOf reports "not found" as -1, so 0 is a valid position and must be accepted.
            int start = htmlContent.IndexOf(sLi, StringComparison.Ordinal);
            if (start < 0)
            {
                return listStr;
            }
            int end = htmlContent.IndexOf(eLi, start, StringComparison.Ordinal);
            if (end < 0)
            {
                return listStr;
            }

            // Everything from the opening <ul> up to (not including) its closing tag.
            string arryLi = htmlContent.Substring(start, end - start);
            Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
            foreach (Match mch in regli.Matches(arryLi))
            {
                listStr.Add(mch.Value);
            }
            return listStr;
        }

View Code

這是獲取網頁內容代碼,截取到列表頁集合那段html代碼。匹配正則變成集合返回。這只是列表頁的數據

 /// <summary>
        /// Downloads a detail page and builds the INSERT statement template for it.
        /// </summary>
        /// <param name="url">Address of the detail page.</param>
        /// <returns>
        /// The template produced by <c>DealHtmlContentDetail</c>, or an empty string when the page
        /// content is empty.
        /// </returns>
        public string GetListDetail(string url)
        {
            string htmlContent = GetContentUrl(url);
            if (string.IsNullOrWhiteSpace(htmlContent))
            {
                // Use the same "nothing found" value as DealHtmlContentDetail. A null result would
                // make the caller's StringBuilder.AppendFormat throw ArgumentNullException.
                return string.Empty;
            }
            return DealHtmlContentDetail(htmlContent);
        }
 9 
/// <summary>
        /// Builds an INSERT statement template for table LJHostInfo from the first
        /// <c>&lt;ol&gt;</c> list of a detail page.
        /// </summary>
        /// <param name="htmlContent">HTML of the detail page.</param>
        /// <returns>
        /// A composite format string in which <c>{0}</c> and <c>{1}</c> stand for the Title and
        /// AveragePrice values, or an empty string when the page has no <c>&lt;ol&gt;…&lt;/ol&gt;</c> section.
        /// </returns>
        private string DealHtmlContentDetail(string htmlContent)
        {
            string sDiv = "<ol>";
            string eDiv = "</ol>";

            // IndexOf reports "not found" as -1, so a match at position 0 is valid.
            int start = htmlContent.IndexOf(sDiv, StringComparison.Ordinal);
            if (start < 0)
            {
                return string.Empty;
            }
            int end = htmlContent.IndexOf(eDiv, start, StringComparison.Ordinal);
            if (end < 0)
            {
                return string.Empty;
            }
            string arryDiv = htmlContent.Substring(start, end - start);

            Regex regli = new Regex("<li>(.*?)</li>", RegexOptions.Singleline);
            Regex reglable = new Regex("<label>(.*?)</label>", RegexOptions.Singleline);
            Regex regspan = new Regex("<span class=\"other\">(.*?)</span>", RegexOptions.Singleline);

            // Column names and their value expressions, kept in step with each other.
            List<string> columns = new List<string> { "Title", "AveragePrice" };
            List<string> values = new List<string> { "'{0}'", "'{1}'" };

            foreach (Match mch in regli.Matches(arryDiv))
            {
                Match mlable = reglable.Match(mch.Value);
                if (!mlable.Success)
                {
                    continue;
                }
                Match mspan = regspan.Match(mch.Value);
                string value = ClearLable(mspan.Value);

                string column = null;
                // Some list items carry a second label/span pair that maps to an extra column.
                string pairedColumn = null;
                switch (ClearLable(mlable.Value))
                {
                    case "建筑年代:":
                        column = "BuildYear";
                        break;
                    case "建筑類型:":
                        column = "BuildType";
                        break;
                    case "物業費用:":
                        column = "PropertyPrice";
                        break;
                    case "物業公司:":
                        column = "PropertyCompany";
                        break;
                    case "開發商:":
                        column = "Developers";
                        break;
                    case "樓棟總數:":
                        column = "FloorNum";
                        pairedColumn = "Rate";
                        break;
                    case "房屋總數:":
                        column = "HousesNum";
                        pairedColumn = "GreenRates";
                        break;
                    case "所屬學區:":
                        column = "SchoolAddress";
                        break;
                    case "附近門店:":
                        column = "NearbyAddress";
                        // The store info is the whole item's text minus the label, with spaces removed.
                        value = ClearLable(mch.Value).Replace("附近門店:", "").Trim().Replace(" ", "");
                        break;
                }
                if (column == null)
                {
                    continue;
                }

                columns.Add(column);
                values.Add("'" + EscapeTemplateValue(value) + "'");
                if (pairedColumn != null && mlable.NextMatch().Success)
                {
                    columns.Add(pairedColumn);
                    values.Add("'" + EscapeTemplateValue(ClearLable(mspan.NextMatch().Value)) + "'");
                }
            }

            return "INSERT INTO LJHostInfo(" + string.Join(",", columns) + ") VALUES(" + string.Join(",", values) + ")";
        }

        /// <summary>
        /// Makes scraped text safe to embed in the template: doubles single quotes so the value
        /// cannot end the SQL string literal, and doubles braces because the caller passes the
        /// template to AppendFormat, where a lone brace would be read as a placeholder.
        /// </summary>
        private static string EscapeTemplateValue(string value)
        {
            return value.Replace("'", "''").Replace("{", "{{").Replace("}", "}}");
        }

View Code

需要注意的是:匹配到的數據要先去掉html標簽,再拼入sql語句;同一個li里如果還有第二組匹配(容積率、綠化率),就再追加一列。

第三步:多線程任務類

 /// <summary>
    /// Processes the items of one listing page: for every item it downloads the detail page,
    /// appends the resulting INSERT statement to the form's SQL buffer and reports progress.
    /// </summary>
    public class ThreadWorker
    {
        // Private lock target; locking on `this` would let outside code contend for the same lock.
        private readonly object syncRoot = new object();
        private ClumbForm cForm;
        private List<String> list;
        private string siteUrl = "@$#@$#@$#@$@#$#@$#@$#@$";// real site address masked by the author (^_^)
        private LianJiaCaiJi caiji=new LianJiaCaiJi();

        /// <summary>
        /// Creates a worker for one listing page.
        /// </summary>
        /// <param name="cf">Form that receives the progress messages and collects the SQL.</param>
        /// <param name="_list">Item fragments of the page; <c>null</c> is treated as an empty page.</param>
        public ThreadWorker(ClumbForm cf, List<String> _list)
        {
            cForm = cf;
            list = _list ?? new List<String>();
        }

        /// <summary>
        /// Runs the task: handles every item that contains an <c>&lt;h2&gt;</c> heading.
        /// </summary>
        public void StartWorker()
        {
            Regex regh2 = new Regex("<h2>(.*?)</h2>", RegexOptions.Singleline);
            Regex regspan = new Regex("<span class=\"num\">(.*?)</span>", RegexOptions.Singleline);
            foreach (var item in list)
            {
                Match m = regh2.Match(item);
                if (!m.Success)
                {
                    continue;
                }
                lock (syncRoot)
                {
                    Match ms = regspan.Match(item);
                    string title = GetQuotationContent(m.Value, "title");
                    string price = ms.Success ? caiji.ClearLable(ms.Value) : "0.00";
                    string sqlTemplate = caiji.GetListDetail(siteUrl + GetQuotationContent(m.Value, "href"));
                    if (string.IsNullOrEmpty(sqlTemplate))
                    {
                        // No usable detail page: report it instead of letting AppendFormat throw
                        // on a null template or silently counting the item as done.
                        cForm.ShowMsg("未取得詳情數據,已跳過:" + title);
                        continue;
                    }
                    cForm.TotalCount += 1;
                    // Title and price are inserted between single quotes in the template,
                    // so quotes inside them must be doubled.
                    cForm.SBINSERTSQL.AppendFormat(sqlTemplate, EscapeSqlLiteral(title), EscapeSqlLiteral(price));
                    cForm.ShowMsg("已完成:" + title + "小區,價格:"+ price + " 完成時間:" + System.DateTime.Now.ToString());
                    cForm.ShowLableMsg(cForm.TotalCount+"");
                }
            }
            cForm.ShowMsg("已完成第:" + cForm.TotalCount + "頁數據采集, 完成時間:" + System.DateTime.Now.ToString());
        }

        /// <summary>
        /// Returns the double-quoted value that follows an attribute name.
        /// </summary>
        /// <param name="content">Markup to search.</param>
        /// <param name="tag">Attribute name, e.g. <c>href</c> or <c>title</c>.</param>
        /// <returns>The text between the quotes, or an empty string when the attribute or its closing quote is missing.</returns>
        private string GetQuotationContent(string content,string tag) {
            // Test the raw IndexOf result: after adding an offset the "not found" value -1
            // would no longer be negative and the check could never fire.
            int tagPos = content.IndexOf(tag, StringComparison.Ordinal);
            if (tagPos < 0) {
                return "";
            }
            // Skip the attribute name and the two characters (=") in front of the value.
            int valueStart = tagPos + tag.Length + 2;
            if (valueStart > content.Length) {
                return "";
            }
            int valueEnd = content.IndexOf('"', valueStart);
            if (valueEnd < 0) {
                return "";
            }
            return content.Substring(valueStart, valueEnd - valueStart);
        }

        /// <summary>
        /// Doubles single quotes so the text can sit inside a quoted SQL string literal.
        /// </summary>
        private static string EscapeSqlLiteral(string value) {
            return value.Replace("'", "''");
        }

    }

View Code

然后是任務執行

 /// <summary>
        /// Click handler of the "collect" button: validates the page range and starts the
        /// crawl on a thread-pool thread.
        /// </summary>
        private void btnCaiJi_Click(object sender, EventArgs e)
        {
            // Reset the UI state.
            listBoxMessage.Items.Clear();
            IsComplete = false;

            // TryParse rejects empty, blank and non-numeric input alike, so bad input shows
            // the prompt instead of throwing FormatException.
            int pageStart;
            int pageEnd;
            if (!int.TryParse(txtPageStart.Text, out pageStart) || !int.TryParse(txtPageEnd.Text, out pageEnd))
            {
                MessageBox.Show("請輸入采集頁數!");
                return;
            }
            if (pageStart > 100)
            {
                MessageBox.Show("采集頁數只能在100以內!");
                return;
            }
            ShowMsg("開始時間:" + System.DateTime.Now.ToString() + " 處理中請等待....");
            _cts = new CancellationTokenSource();
            // Capture plain values here on the UI thread; the worker must not read the text box.
            CancellationToken token = _cts.Token;
            ThreadPool.QueueUserWorkItem(state => CountTo(pageStart, token));

        }
21 
/// <summary>
        /// Crawls the listing pages one after another, from <paramref name="countTo"/> up to
        /// and including the page number entered in txtPageEnd.
        /// </summary>
        /// <param name="countTo">Number of the first page; incremented until the end page is passed.</param>
        /// <param name="ct">Token checked before each page; once cancelled the loop stops.</param>
        private void CountTo(int countTo, CancellationToken ct)
        {
            // Read the end page once, on the UI thread, because this method runs on a pool thread.
            // Unparsable input yields int.MinValue, so the loop simply does not run.
            int lastPage = (int)this.Invoke(new Func<int>(() =>
            {
                int parsed;
                return int.TryParse(txtPageEnd.Text, out parsed) ? parsed : int.MinValue;
            }));

            for (; countTo <= lastPage; countTo++)
            {
                // Check before downloading so a cancelled run does not fetch one more page.
                if (ct.IsCancellationRequested)
                {
                    break;
                }
                var pageItems = caiji.GetListURl(string.Format(url, countTo));
                if (pageItems == null)
                {
                    continue; // page could not be downloaded
                }
                // ThreadWorker's constructor takes the form and the item list only.
                tw = new ThreadWorker(this, pageItems);
                // Invoke runs the worker on the thread that created the form's controls.
                this.Invoke(new Action(tw.StartWorker));
                Thread.Sleep(200);
            }
            IsComplete = true;
            ShowMsg("結束時間:" + System.DateTime.Now.ToString() + " 采集完成,總條數:"+TotalCount);
        }
43 
/// <summary>
        /// Appends a status line to the message list and scrolls to it; while the crawl is
        /// running the two action buttons stay disabled.
        /// </summary>
        /// <param name="msg">Text to display.</param>
        public void ShowMsg(string msg)
        {
            try
            {
                if (listBoxMessage.InvokeRequired)
                {
                    // Called from a worker thread: run this method again on the UI thread.
                    this.Invoke(new GetMsgDelegate(ShowMsg), new object[] { msg });
                    return;
                }

                listBoxMessage.Items.Add(msg);
                // Select by index so the list scrolls to the new line. Assigning SelectedItem
                // selects the first entry with equal text, which may be an older message.
                listBoxMessage.SelectedIndex = listBoxMessage.Items.Count - 1;
                btnCaiJi.Enabled = IsComplete;
                btnExceSql.Enabled = IsComplete;
            }
            catch (Exception ex)
            {
                // Status display is best effort (e.g. the form may be closing), so do not
                // rethrow, but record the problem instead of discarding it.
                System.Diagnostics.Trace.TraceWarning("ShowMsg failed: {0}", ex.Message);
            }
        }

View Code

執行時界面

總結

以上是生活随笔為你收集整理的网站数据采集程序(爬虫)的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。