日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程语言 > asp.net >内容正文

asp.net

.Net开源的跨平台爬虫框架 DotnetSpider

發布時間:2023/12/10 asp.net 34 豆豆
生活随笔收集整理的這篇文章主要介紹了《.Net开源的跨平台爬虫框架 DotnetSpider》,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

項目詳細介紹

?

DotnetSpider是開源的.NET跨平臺數據采集爬蟲框架。需要 Scheduler、Downloader、Processor、Pipeline 四部分。

/// <summary>
/// Entry point of the sample: wires a page processor, a duplicate-removing
/// scheduler and a pipeline into a single-threaded spider, registers the
/// start URLs and starts the crawl.
/// </summary>
public static void Main()
{
    Core.Spider spider = Core.Spider
        .Create(new MyPageProcessor(), new QueueDuplicateRemovedScheduler())
        .AddPipeline(new MyPipeline())
        .SetThreadNum(1);

    var site = new Site() { EncodingName = "UTF-8" };

    // Register one start URL per loop iteration (i = 1..4). The loop counter is
    // substituted into the trailing "p_N" segment so that each iteration adds a
    // distinct URL rather than the same one four times.
    // NOTE(review): "p_N" is presumably the listing page number - confirm against the site.
    for (int i = 1; i < 5; ++i)
    {
        site.AddStartUrl($"http://www.youku.com/v_olist/c_97_g__a__sg__mt__lg__q__s_1_r_0_u_0_pt_0_av_0_ag_0_sg__pr__h__d_1_p_{i}.html");
    }

    spider.Site = site;
    spider.Start();
}

?

/// <summary>
/// Pipeline that writes every extracted <see cref="YoukuVideo"/> to the console.
/// </summary>
private class MyPipeline : IPipeline
{
    /// <summary>
    /// Reads the collection stored under the "VideoResult" key and prints one
    /// "name:click" line per video.
    /// </summary>
    /// <param name="resultItems">Results produced for the processed page.</param>
    /// <param name="spider">The spider that produced the results (not used here).</param>
    public void Process(ResultItems resultItems, ISpider spider)
    {
        var videos = resultItems.Results["VideoResult"];

        foreach (YoukuVideo video in videos)
        {
            Console.WriteLine($"{video.Name}:{video.Click}");
        }

        // Persisting the videos (for example to a database) could be added here.
    }

    /// <summary>
    /// No resources are held, so there is nothing to release.
    /// </summary>
    public void Dispose()
    {
    }
}

?

/// <summary>
/// Page processor that builds one <see cref="YoukuVideo"/> for every
/// <c>div</c> whose class is <c>yk-col3</c> and publishes the list under the
/// "VideoResult" key.
/// </summary>
private class MyPageProcessor : IPageProcessor
{
    /// <summary>
    /// Extracts the videos from <paramref name="page"/> and adds them to its result items.
    /// </summary>
    /// <param name="page">The downloaded page to extract from.</param>
    public void Process(Page page)
    {
        var totalVideoElements = page.Selectable.SelectList(Selectors.XPath("//div[@class='yk-col3']")).Nodes();
        List<YoukuVideo> results = new List<YoukuVideo>();

        foreach (var videoElement in totalVideoElements)
        {
            var video = new YoukuVideo();
            video.Name = videoElement.Select(Selectors.XPath("/div[4]/div[1]/a")).Value;

            // YoukuVideo.Click is declared as a string, so the extracted text is stored
            // as-is instead of going through int.Parse (an int cannot be assigned to it).
            // The null-conditional keeps a missing element from throwing on ToString().
            // NOTE(review): "p-num" has no leading '.'; if it is meant as a CSS class
            // selector it should probably be ".p-num" - confirm against the page markup.
            video.Click = videoElement.Select(Selectors.Css("p-num")).Value?.ToString();

            results.Add(video);
        }

        page.AddResultItem("VideoResult", results);
    }

    /// <summary>
    /// Site settings for this processor (SleepTime = 0). Created once so that every
    /// read returns the same instance rather than a fresh object per access.
    /// </summary>
    public Site Site { get; } = new Site { SleepTime = 0 };
}

?

/// <summary>
/// Simple data holder for one video entry produced by the page processor.
/// </summary>
public class YoukuVideo
{
    /// <summary>Name of the video.</summary>
    public string Name { get; set; }

    /// <summary>Click value of the video, kept as text.</summary>
    public string Click { get; set; }
}

?

??

添加 config 文件:將 app.conf 添加到你的項目中。

redisServer:your redis server
redisPassword:your redis password

?

添加爬蟲上下文類:

/// <summary>
/// Spider context for the JD SKU crawl. Declares the site, scheduler, start URL,
/// pipeline and downloader, and binds them to the <see cref="Product"/> entity.
/// </summary>
public class JdSkuSpider : ISpiderContext
{
    /// <summary>
    /// Sets the log task id, builds the crawl configuration and pairs it with the
    /// entity type to extract.
    /// </summary>
    /// <returns>A builder created from the configured context and <c>typeof(Product)</c>.</returns>
    public SpiderContextBuilder GetBuilder()
    {
        Log.TaskId = "JD SKU Weekly";

        SpiderContext context = new SpiderContext
        {
            SpiderName = "JD SKU " + DateTimeUtils.MONDAY_RUN_ID,
            CachedSize = 1,
            ThreadNum = 8,
            Site = new Site
            {
                EncodingName = "UTF-8"
            },
            Scheduler = new RedisScheduler()
            {
                Host = "redis",
                Port = 6379,
                // Left empty in this sample; supply the real password before running.
                Password = ""
            },
            // Each start URL carries a dictionary of extra values ("name", "cat3");
            // the Product properties marked ExtractType.Enviroment use the same keys.
            StartUrls = new Dictionary<string, Dictionary<string, object>> {
                { "http://list.jd.com/list.html?cat=9987,653,655&page=1&go=0&JL=6_0_0&ms=5", new Dictionary<string, object> { { "name", "手機" }, { "cat3", "9987" } } },
            },
            Pipeline = new MysqlPipeline()
            {
                // Left empty in this sample; supply the real connection string before running.
                ConnectString = ""
            },
            Downloader = new HttpDownloader()
        };

        return new SpiderContextBuilder(context, typeof(Product));
    }

    /// <summary>
    /// Entity describing one SKU item on a listing page. The attributes declare
    /// where each value is extracted from and which column it is stored in.
    /// </summary>
    [Schema("jd", "sku_v2", Suffix = TableSuffix.Monday)]
    [TargetUrl(new[] { @"page=[0-9]+" }, "//*[@id=\"J_bottomPage\"]")]
    [TypeExtractBy(Expression = "//div[contains(@class,'j-sku-item')]", Multi = true)]
    [Indexes(Primary = "sku")]
    public class Product : ISpiderEntity
    {
        /// <summary>Category name, taken from the "name" value attached to the start URL.</summary>
        [StoredAs("category", DataType.String, 20)]
        [PropertyExtractBy(Expression = "name", Type = ExtractType.Enviroment)]
        public string CategoryName { get; set; }

        /// <summary>Category id, taken from the "cat3" value attached to the start URL.</summary>
        // NOTE(review): declared as int but stored as DataType.String and supplied as
        // the string "9987" - confirm the framework converts between the two.
        [StoredAs("cat3", DataType.String, 20)]
        [PropertyExtractBy(Expression = "cat3", Type = ExtractType.Enviroment)]
        public int CategoryId { get; set; }

        /// <summary>Value of the href attribute of the link under the first child div.</summary>
        [StoredAs("url", DataType.Text)]
        [PropertyExtractBy(Expression = "./div[1]/a/@href")]
        public string Url { get; set; }

        /// <summary>Value of the item's data-sku attribute; also the primary index.</summary>
        [StoredAs("sku", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@data-sku")]
        public string Sku { get; set; }

        /// <summary>Content of the link inside the "p-commit" div.</summary>
        // NOTE(review): declared as long but stored as DataType.String; a non-numeric
        // link text would presumably fail conversion - confirm.
        [StoredAs("commentscount", DataType.String, 20)]
        [PropertyExtractBy(Expression = "./div[@class='p-commit']/strong/a")]
        public long CommentsCount { get; set; }

        /// <summary>Content of the first link inside the "p-shop hide" div.</summary>
        [StoredAs("shopname", DataType.String, 100)]
        [PropertyExtractBy(Expression = "./div[@class='p-shop hide']/span[1]/a[1]")]
        public string ShopName { get; set; }

        /// <summary>Content of the em element inside the "p-name" div.</summary>
        [StoredAs("name", DataType.String, 50)]
        [PropertyExtractBy(Expression = "./div[@class='p-name']/a/em")]
        public string Name { get; set; }

        /// <summary>Shop id column.</summary>
        // NOTE(review): no PropertyExtractBy is declared here, so nothing in this
        // class shows how the value gets populated - confirm whether that is intended.
        [StoredAs("shopid", DataType.String, 25)]
        public string ShopId { get; set; }

        /// <summary>Value of the item's venderid attribute.</summary>
        [StoredAs("venderid", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@venderid")]
        public string VenderId { get; set; }

        /// <summary>Value of the item's jdzy_shop_id attribute.</summary>
        [StoredAs("jdzy_shop_id", DataType.String, 25)]
        [PropertyExtractBy(Expression = "./@jdzy_shop_id")]
        public string JdzyShopId { get; set; }

        /// <summary>Creation time; every read returns the current local time.</summary>
        // NOTE(review): the property has no setter, so the "now" extractor presumably
        // cannot assign it and the value depends on when it is read - confirm.
        [StoredAs("cdate", DataType.Time)]
        [PropertyExtractBy(Expression = "now", Type = ExtractType.Enviroment)]
        public DateTime CDate => DateTime.Now;
    }
}

總結

以上是生活随笔為你收集整理的.Net开源的跨平台爬虫框架 DotnetSpider的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。