用C#编写爬虫在marinetraffic下载船只图片
近期在做船只识别方面的事情,需要大量的正样本来训练adaboost分类器。
于是到marinetraffic这个网站上下载船只图片。写个爬虫来自动下载显然非常方便。
网站特点
在介绍爬虫之前,首先了解一下marinetraffic这个网站的一些特点:
1. 会定期检测爬虫行为。如果认为有爬虫在大量下载图片,
会把该连接加入黑名单,之后几天都没办法下载。
2. 船只图片资源差异大。有的船只有1000多张图,有的船只没有一张图,我们需要的是很多船只的很多张图。所以需要对下载的船只按优先级排序。
3. 用来训练分类器的正样本要求检测对象的分辨率一样。而从marinetraffic网站下载图片时可以设置下载的图片的宽度,网站根据长宽比生成对应的高度。所以,不同图片高度不一样,需要自己后期处理。
解决方案
后期处理:从图片中抠出分辨率一样的船只
爬虫源代码
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Linq;
using System.Net;
using System.Runtime.Serialization.Formatters.Binary;
using System.Text;
using System.Text.RegularExpressions;
using System.Threading;
using System.Threading.Tasks;

namespace 船僅僅圖像爬蟲
{
    /// <summary>
    /// Console crawler that collects ship ids from the marinetraffic.com ship index
    /// and then downloads every photo listed on each ship's photo page.
    /// </summary>
    class Program
    {
        // Directory that receives the serialized id list and all downloaded images.
        private const string OutputDirectory = @"C:\Users\dragonfive\Desktop\爬蟲(chóng)獲得船僅僅圖片\第三批\";

        // User-Agent value sent with every request.
        private const string UserAgent = "blah";

        // One shared generator for the throttling delays instead of a new instance per request.
        private static readonly Random DelayRandom = new Random();

        /// <summary>
        /// Applies the default credentials and the User-Agent header to <paramref name="client"/>.
        /// Called before every request, as the original code did.
        /// NOTE(review): presumably needed because WebClient may not keep the header across requests - confirm.
        /// </summary>
        /// <param name="client">The client about to issue a request.</param>
        private static void PrepareClient(WebClient client)
        {
            client.Credentials = CredentialCache.DefaultCredentials;
            client.Headers["User-Agent"] = UserAgent;
        }

        /// <summary>
        /// Prints <paramref name="notice"/>, sleeps for a random 10 to 19 seconds, then ends the console line.
        /// The pause throttles requests so the site is less likely to flag the crawler.
        /// </summary>
        /// <param name="notice">Progress text written before sleeping.</param>
        private static void PauseBetweenRequests(string notice)
        {
            Console.Write(notice);
            int seconds = DelayRandom.Next(10, 20); // upper bound is exclusive: 10..19
            Thread.Sleep(seconds * 1000);
            Console.WriteLine();
        }

        /// <summary>
        /// Returns, in document order, every substring of <paramref name="html"/> that sits between an
        /// occurrence of <paramref name="label"/> and the next occurrence of <paramref name="terminator"/>.
        /// </summary>
        /// <param name="html">Page text to scan.</param>
        /// <param name="label">Marker that precedes each value.</param>
        /// <param name="terminator">Marker that ends each value.</param>
        /// <returns>The extracted values; empty when the label never occurs.</returns>
        private static List<string> ExtractValues(string html, string label, string terminator)
        {
            var values = new List<string>();
            int labelIndex = html.IndexOf(label, StringComparison.Ordinal);
            while (labelIndex != -1)
            {
                int valueStart = labelIndex + label.Length;
                int valueEnd = html.IndexOf(terminator, valueStart, StringComparison.Ordinal);
                if (valueEnd == -1)
                {
                    // Unterminated value at the end of the page: nothing more can be extracted.
                    // Without this guard Substring would receive a negative length and throw.
                    break;
                }

                values.Add(html.Substring(valueStart, valueEnd - valueStart));
                labelIndex = html.IndexOf(label, labelIndex + 1, StringComparison.Ordinal);
            }

            return values;
        }

        /// <summary>
        /// Walks index pages 1 through 99 of the ship list (sorted by photo count, descending),
        /// appends every ship id not yet present to <paramref name="shipid_list"/>, and finally
        /// serializes the list to "0_100page_shipid.txt" in the output directory.
        /// </summary>
        /// <param name="shipid_list">List that receives the newly discovered ship ids.</param>
        static void download_all_shipid(List<string> shipid_list)
        {
            try
            {
                // Set used only for membership tests; the list keeps the discovery order.
                var knownIds = new HashSet<string>(shipid_list);

                using (WebClient client = new WebClient())
                {
                    for (int pageNum = 1; pageNum < 100; pageNum++)
                    {
                        Console.WriteLine("開(kāi)始分析第" + pageNum + "張網(wǎng)頁(yè)");
                        PrepareClient(client);
                        try
                        {
                            byte[] pageData = client.DownloadData(@"http://www.marinetraffic.com/en/ais/index/ships/all/page:" + pageNum + "/sort:COUNT_PHOTOS/direction:desc/per_page:50");
                            string pageHtml = Encoding.UTF8.GetString(pageData); // the page is decoded as UTF-8

                            foreach (string shipid in ExtractValues(pageHtml, "shipid:", "/"))
                            {
                                if (knownIds.Add(shipid))
                                {
                                    Console.WriteLine("新增id:" + shipid);
                                    shipid_list.Add(shipid);
                                }
                            }

                            Console.WriteLine("完畢第" + pageNum + "頁(yè)分析");
                        }
                        catch (WebException webEx)
                        {
                            // A failed page is reported and skipped; the crawl continues with the next page.
                            Console.WriteLine(webEx.Message.ToString());
                        }

                        PauseBetweenRequests("繞開(kāi)站點(diǎn)爬蟲(chóng)行為檢測(cè)中......");
                    }
                }

                Console.WriteLine("分析結(jié)束");

                // Make sure the target directory exists so the crawl result is not lost on save.
                Directory.CreateDirectory(OutputDirectory);
                string file = OutputDirectory + "0_100page_shipid.txt";

                // FileMode.Create truncates an existing file, so no stale bytes from an earlier run remain.
                using (FileStream fsWriter = new FileStream(file, FileMode.Create, FileAccess.Write))
                {
                    // NOTE(review): BinaryFormatter is obsolete and unsafe for deserializing untrusted data.
                    // It is kept so the on-disk format stays readable by existing tooling; consider
                    // switching to a plain text file with one id per line.
                    BinaryFormatter bf = new BinaryFormatter();
                    bf.Serialize(fsWriter, shipid_list);
                }
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message.ToString());
            }
        }

        /// <summary>
        /// Downloads every image referenced by a data-original attribute on the first photo page
        /// (100 entries per page) of the given ship and stores each one as
        /// "{ship_id}_{index}.jpg" in the output directory.
        /// </summary>
        /// <param name="ship_id">The marinetraffic ship id whose photos are downloaded.</param>
        static void download_jpg(string ship_id)
        {
            try
            {
                Console.WriteLine("開(kāi)始下載shipid為:"+ship_id+"的圖片");

                using (WebClient client = new WebClient())
                {
                    PrepareClient(client);
                    byte[] pageData = client.DownloadData(@"http://www.marinetraffic.com/en/photos/of/ships/shipid:" + ship_id + @"/per_page:100/page:1");
                    string pageHtml = Encoding.UTF8.GetString(pageData); // the page is decoded as UTF-8
                    Console.WriteLine("元網(wǎng)頁(yè)已下載");

                    Directory.CreateDirectory(OutputDirectory);

                    int attempt = 0; // index used in the file name; advances for every image url
                    int saved = 0;   // number of images actually written to disk

                    foreach (string url in ExtractValues(pageHtml, "data-original='", "'"))
                    {
                        PauseBetweenRequests("繞過(guò)站點(diǎn)爬蟲(chóng)行為檢測(cè)中......");
                        try
                        {
                            Console.WriteLine(url);
                            PrepareClient(client);
                            byte[] jpgdata = client.DownloadData(url);

                            // WriteAllBytes overwrites an existing file completely, so a shorter image
                            // never keeps trailing bytes from an older, larger file.
                            File.WriteAllBytes(OutputDirectory + ship_id + "_" + attempt + ".jpg", jpgdata);

                            saved++;
                            Console.WriteLine("成功下載第" + attempt + "張圖片");
                        }
                        catch (WebException webEx)
                        {
                            // A failed image is reported and skipped; success is not claimed for it.
                            Console.WriteLine("被捕獲了嗎?");
                            Console.WriteLine(webEx.Message.ToString());
                        }

                        attempt++;
                    }

                    Console.WriteLine("*****************************************");
                    Console.WriteLine("下載"+saved+"張ship_id為"+ship_id+"的圖片");
                    Console.WriteLine("*****************************************");
                }
            }
            catch (WebException webEx)
            {
                Console.WriteLine(webEx.Message.ToString());
            }
        }

        /// <summary>
        /// Entry point: collects the ship ids, downloads the photos of every collected ship,
        /// then waits for Enter so the console window stays open.
        /// </summary>
        /// <param name="args">Command-line arguments (unused).</param>
        static void Main(string[] args)
        {
            List<string> shipid_list = new List<string>();

            // To resume an earlier run instead of crawling again, load the list that
            // download_all_shipid saved (BinaryFormatter.Deserialize on "0_100page_shipid.txt")
            // and remove the ids whose photos were already downloaded.
            download_all_shipid(shipid_list);

            foreach (var ship_id in shipid_list)
            {
                download_jpg(ship_id);
            }

            Console.ReadLine(); // keep the console window open
        }
    }
}
// Reposted from: https://www.cnblogs.com/yutingliuyl/p/6941828.html
总结
以上是生活随笔为你收集整理的用C#编写爬虫在marinetraffic下载船只图片的全部内容,希望文章能够帮你解决所遇到的问题。
- 上一篇: java项目红叉_完美解决Eclipse
- 下一篇: C#的四舍五入函数