日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 运维知识 > linux >内容正文

linux

Linux C++ 简单爬虫

發布時間:2023/11/30 linux 33 豆豆
生活随笔收集整理的這篇文章主要介紹了 Linux C++ 简单爬虫，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

?

轉載:http://blog.csdn.net/orthocenterchocolate/article/details/38665937

方便易用,傳入URL,返回對應頁面的內容

[cpp] view plain copy
  #include <arpa/inet.h>
  #include <netdb.h>
  #include <netinet/in.h>
  #include <stdlib.h>
  #include <string.h>
  #include <sys/socket.h>
  #include <sys/time.h>
  #include <sys/types.h>
  #include <unistd.h>

  #include <iostream>
  #include <string>
  • using?namespace?std;??
  • ??
  // Splits a URL into the host name and the page path to request.
  //
  // url      - URL with or without a leading "http://" or "https://" scheme.
  // hostUrl  - receives the text between the scheme and the first '/' or '?'.
  // pagePath - receives the rest of the URL starting at that delimiter, or "/"
  //            when the URL has neither a path nor a query string.
  //
  // The scheme is removed only when it is a true prefix of the URL, so an
  // "http://" that appears later (for example inside a query parameter) is left
  // untouched. A ":port" suffix, if present, stays part of hostUrl.
  void parseHostAndPagePath(const std::string url, std::string &hostUrl, std::string &pagePath){
      hostUrl = url;
      pagePath = "/";

      // compare() clamps the length to the string size, so short inputs are safe.
      if (hostUrl.compare(0, 7, "http://") == 0) {
          hostUrl.erase(0, 7);
      } else if (hostUrl.compare(0, 8, "https://") == 0) {
          hostUrl.erase(0, 8);
      }

      // The host ends at the first path or query delimiter.
      const std::string::size_type pos = hostUrl.find_first_of("/?");
      if (pos != std::string::npos) {
          pagePath = hostUrl.substr(pos);
          // "host?x=1" has a query but no path: the request target must still
          // begin with '/'.
          if (pagePath[0] != '/') {
              pagePath.insert(0, "/");
          }
          hostUrl.erase(pos);
      }
  }
  • ??
  • string?getPageContent(const?string?url){??
  • ????struct?hostent?*host;??
  • ????string?hostUrl,?pagePath;??
  • ????parseHostAndPagePath(url,?hostUrl,?pagePath);??
  • ????if(0==(host=gethostbyname(hostUrl.c_str()))){??
  • ????????cout<<"gethostbyname?error\n"<<endl;??
  • ????????exit(1);??
  • ????}??
  • ??
  • ????struct?sockaddr_in?pin;??
  • ????int?port=80;??
  • ????bzero(&pin,sizeof(pin));??
  • ????pin.sin_family=AF_INET;??
  • ????pin.sin_port=htons(port);??
  • ????pin.sin_addr.s_addr=((struct?in_addr*)(host->h_addr))->s_addr;??
  • ????int?isock;??
  • ????if((isock?=?socket(AF_INET,?SOCK_STREAM,?0))==-1){??
  • ????????cout<<"open?socket?error\n"<<endl;??
  • ????????exit(1);??
  • ????}??
  • ??
  • ????string?requestHeader;??
  • ????requestHeader="GET?"+pagePath+"?HTTP/1.1\r\n";??
  • ????requestHeader+="Host:?"+hostUrl+"\r\n";??
  • ????requestHeader+="Accept:?*/*\r\n";??
  • ????requestHeader+="User-Agent:?Mozilla/4.0(compatible)\r\n";??
  • ????requestHeader+="connection:Keep-Alive\r\n";??
  • ????requestHeader+="\r\n";??
  • ??
  • ????if(connect(isock,?(const?sockaddr*)&pin,?sizeof(pin))==-1){??
  • ????????cout<<"connect?error\n"<<endl;??
  • ????????exit(1);??
  • ????}??
  • ????if(send(isock,?requestHeader.c_str(),?requestHeader.size(),?0)==-1){??
  • ????????cout<<"send?error\n"<<endl;??
  • ????????exit(1);??
  • ????}??
  • ??
  • ????struct?timeval?timeout={1,0};??
  • ????setsockopt(isock,?SOL_SOCKET,?SO_RCVTIMEO,?(char?*)&timeout,?sizeof(struct?timeval));??
  • ????char?c;??
  • ????bool?flag=true;??
  • ????while(recv(isock,?&c,?1,?0)>0){??
  • ????????if('\r'==c){??
  • ????????????continue;??
  • ????????}else?if('\n'==c){??
  • ????????????if(false==flag)??
  • ????????????????break;??
  • ????????????flag=false;??
  • ????????}else{??
  • ????????????flag=true;??
  • ????????}??
  • ????}??
  • ??
  • ????int?len,?BUFFER_SIZE=512;??
  • ????char?buffer[BUFFER_SIZE];??
  • ????string?pageContent="";??
  • ????while((len?=?recv(isock,?buffer,?BUFFER_SIZE-1,?0))>0){??
  • ????????buffer[len]='\0';??
  • ????????pageContent+=buffer;??
  • ????}??
  • ??
  • ????return?pageContent;??
  • }??
  • ??
  • int?main()??
  • {??
  • ????cout<<getPageContent("http://www.hao123.com")<<endl;??
  • ????return?0;??
  • }??

  • 總結

    以上是生活随笔為你收集整理的Linux C++ 简单爬虫的全部內容,希望文章能夠幫你解決所遇到的問題。

    如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。