.Net开源的跨平台爬虫框架 DotnetSpider
項目詳細介紹

DotnetSpider 是開源的 .NET 跨平臺數據采集爬蟲框架。一個爬蟲需要 Scheduler、Downloader、Processor、Pipeline 四個部分。
/// <summary>
/// Sample entry point: builds a single-threaded spider from
/// <see cref="MyPageProcessor"/>, a <c>QueueDuplicateRemovedScheduler</c> and
/// <see cref="MyPipeline"/>, registers four start URLs and starts the crawl.
/// </summary>
public static void Main()
{
    Core.Spider spider = Core.Spider
        .Create(new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
        .AddPipeline(new MyPipeline())
        .SetThreadNum(1);

    var site = new Site { EncodingName = "UTF-8" };
    for (int i = 1; i < 5; ++i)
    {
        // Each iteration must register a different URL; the trailing "p_{i}" segment is
        // presumably the page index (NOTE(review): confirm against the target site).
        // Adding the identical URL four times would most likely be collapsed to one
        // request by the duplicate-removing scheduler.
        site.AddStartUrl($"http://www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{i}.html");
    }

    spider.Site = site;
    spider.Start();
}

/// <summary>
/// Pipeline that prints every <see cref="YoukuVideo"/> stored under the
/// "VideoResult" key of the result items.
/// </summary>
private class MyPipeline : IPipeline
{
    /// <summary>Writes one "name:click" line to the console per extracted video.</summary>
    /// <param name="resultItems">Result container filled by the page processor.</param>
    /// <param name="spider">The spider that produced the results (unused here).</param>
    public void Process(ResultItems resultItems, ISpider spider)
    {
        foreach (YoukuVideo entry in resultItems.Results["VideoResult"])
        {
            Console.WriteLine($"{entry.Name}:{entry.Click}");
        }

        // This is the place to persist the results, e.g. to a database.
    }

    /// <summary>Nothing to release; the pipeline holds no resources.</summary>
    public void Dispose()
    {
    }
}

/// <summary>
/// Page processor that extracts one <see cref="YoukuVideo"/> per
/// <c>div.yk-col3</c> element and publishes the list under "VideoResult".
/// </summary>
private class MyPageProcessor : IPageProcessor
{
    /// <summary>Extracts the video entries from <paramref name="page"/>.</summary>
    /// <param name="page">The downloaded page to parse.</param>
    public void Process(Page page)
    {
        var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-col3']")).Nodes();
        List<YoukuVideo> results = new List<YoukuVideo>();
        foreach (var videoElement in totalVideoElements)
        {
            var video = new YoukuVideo();
            video.Name = videoElement.Select(Selectors.XPath("/div[4]/div[1]/a")).Value;
            // Click is declared as string, so the extracted text is stored as-is;
            // the former int.Parse(...) produced an int that cannot be assigned to it.
            video.Click = videoElement.Select(Selectors.Css("p-num")).Value.ToString();
            results.Add(video);
        }
        page.AddResultItem("VideoResult", results);
    }

    /// <summary>Site settings for this processor; a new instance with SleepTime = 0 on every read.</summary>
    public Site Site => new Site { SleepTime = 0 };
}

/// <summary>One extracted video entry.</summary>
public class YoukuVideo
{
    /// <summary>Text taken from the video's title link.</summary>
    public string Name { get; set; }

    /// <summary>Text taken from the "p-num" element, kept as a string.</summary>
    public string Click { get; set; }
}
?
??
添加 config 文件:
在你的項目中添加 app.conf,內容如下:
redisServer:your redis server
redisPassword:your redis password
?
添加爬蟲上下文類:
/// <summary>
/// Spider context definition for crawling JD product list pages into
/// <see cref="Product"/> entities.
/// </summary>
public class JdSkuSpider : ISpiderContext
{
    /// <summary>
    /// Sets the log task id and builds the spider context (site encoding, Redis
    /// scheduler, start URL with its extra values, MySQL pipeline, HTTP downloader).
    /// </summary>
    /// <returns>A builder wrapping the context and the <see cref="Product"/> entity type.</returns>
    public SpiderContextBuilder GetBuilder()
    {
        Log.TaskId = "JD SKU Weekly";
        SpiderContext context = new SpiderContext
        {
            SpiderName = "JD SKU " + DateTimeUtils.MONDAY_RUN_ID,
            CachedSize = 1,
            ThreadNum = 8,
            Site = new Site
            {
                EncodingName = "UTF-8"
            },
            Scheduler = new RedisScheduler()
            {
                Host = "redis",
                Port = 6379,
                // Placeholder: fill in the Redis password for your environment.
                Password = ""
            },
            // Each start URL carries a dictionary of extra values ("name", "cat3");
            // Product reads the same keys through its ExtractType.Enviroment properties.
            StartUrls = new Dictionary<string, Dictionary<string, object>>
            {
                {
                    "http://list.jd.com/list.html?cat=9987,653,655&page=1&go=0&JL=6_0_0&ms=5",
                    new Dictionary<string, object> { { "name", "手機" }, { "cat3", "9987" } }
                },
            },
            Pipeline = new MysqlPipeline()
            {
                // Placeholder: fill in the MySQL connection string for your environment.
                ConnectString = ""
            },
            Downloader = new HttpDownloader()
        };
        return new SpiderContextBuilder(context, typeof(Product));
    }

    /// <summary>
    /// Entity describing one product item on a JD list page. The attributes declare
    /// the target schema/table, the URL pattern for further pages, the XPath that
    /// selects each item, and the primary index column.
    /// </summary>
    [Schema("jd", "sku_v2", Suffix = TableSuffix.Monday)]
    [TargetUrl(new[] { @"page=[0-9]+" }, "//*[@id=\"J_bottomPage\"]")]
    [TypeExtractBy(Expression = "//div[contains(@class,'j-sku-item')]", Multi = true)]
    [Indexes(Primary = "sku")]
    public class Product : ISpiderEntity
    {
        /// <summary>Category name, taken from the "name" value attached to the start URL.</summary>
        [StoredAs("category", DataType.String, 20)]
        [PropertyExtractBy(Expression = "name", Type = ExtractType.Enviroment)]
        public string CategoryName { get; set; }

        /// <summary>Category id, taken from the "cat3" value attached to the start URL.</summary>
        [StoredAs("cat3", DataType.String, 20)]
        [PropertyExtractBy(Expression = "cat3", Type = ExtractType.Enviroment)]
        public int CategoryId { get; set; }

        /// <summary>Product link, read from the href of the item's first div anchor.</summary>
        [StoredAs("url", DataType.Text)]
        [PropertyExtractBy(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        /// <summary>SKU identifier, read from the item's data-sku attribute.</summary>
        [StoredAs("sku", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@data-sku")]
        public string Sku { get; set; }

        /// <summary>Comment count, read from the anchor inside the "p-commit" div.</summary>
        [StoredAs("commentscount", DataType.String, 20)]
        [PropertyExtractBy(Expression = "./div[@class='p-commit']/strong/a")]
        public long CommentsCount { get; set; }

        /// <summary>Shop name, read from the first anchor inside the "p-shop hide" div.</summary>
        [StoredAs("shopname", DataType.String, 100)]
        [PropertyExtractBy(Expression = "./div[@class='p-shop hide']/span[1]/a[1]")]
        public string ShopName { get; set; }

        /// <summary>Product name, read from the em element inside the "p-name" div.</summary>
        [StoredAs("name", DataType.String, 50)]
        [PropertyExtractBy(Expression = "./div[@class='p-name']/a/em")]
        public string Name { get; set; }

        /// <summary>Shop id column; no extraction expression is declared for it here.</summary>
        [StoredAs("shopid", DataType.String, 25)]
        public string ShopId { get; set; }

        /// <summary>Vendor id, read from the item's venderid attribute.</summary>
        [StoredAs("venderid", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@venderid")]
        public string VenderId { get; set; }

        /// <summary>JD self-operated shop id, read from the item's jdzy_shop_id attribute.</summary>
        [StoredAs("jdzy_shop_id", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@jdzy_shop_id")]
        public string JdzyShopId { get; set; }

        /// <summary>
        /// Creation timestamp. The getter always returns the current local time.
        /// NOTE(review): the property has no setter, so how the "now" extraction
        /// attribute interacts with it depends on the framework - confirm.
        /// </summary>
        [StoredAs("cdate", DataType.Time)]
        [PropertyExtractBy(Expression = "now", Type = ExtractType.Enviroment)]
        public DateTime CDate => DateTime.Now;
    }
}
總結
以上是生活随笔為你收集整理的.Net开源的跨平台爬虫框架 DotnetSpider的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 一年瘦到50斤:女子减肥致多脏器衰竭住进
- 下一篇: WebApi系列(从.Net 到 .Ne