當前位置：首頁 >

用C++解析HTTP下载下来的HTML文档

發布時間：2025/3/15 41 豆豆

生活随笔收集整理的這篇文章主要介紹了用C++解析HTTP下载下来的HTML文档，小編覺得挺不錯的，現在分享給大家，幫大家做個參考。

最近跟朋友一起寫了一個批量網站查詢工具 BlueCatTools，其中，需要用C++解析HTTP下載下來的HTML文檔。

懂的人不用我多說，不懂的我也沒能力說到你懂，看代碼吧。

BlueCatTools 百度收錄批量查詢工具

//--caller.cpp-- // to run the program, you should make sure that, there is a "NIKE新浪競技風暴_新浪網(wǎng).htm" in your working directory. // The program run time can be saved about a half if you give a better implementation of the "ofile <<" stament; #include "HtmlParser.h" #include <ctime> #include <iomanip> using namespace std; void main() {clock_t start = clock();map<string, link_info> LinkInfo;multimap<float, link_info, greater<float> > Sorted;string FileName = "NIKE新浪競技風暴_新浪網(wǎng).htm";HtmlParser(FileName, LinkInfo);string Result;for(map<string, link_info>::iterator miter = LinkInfo.begin(); miter != LinkInfo.end(); miter++){Sorted.insert(make_pair(miter->second.Value, miter->second));}ofstream ofile;ofile.open("a.txt");for(multimap<float, link_info, greater<float> >::iterator miter = Sorted.begin(); miter != Sorted.end(); miter++){ofile << miter->first << "\t"<<setw(50) << left << miter->second.Title << "\t"<< miter->second.Link << endl;}ofile.close();cout << clock() - start << endl; }//--HtmlParser.h--/ #pragma once #include <cstdio> #include <iostream> #include <fstream> #include <string> #include <map> using namespace std; struct link_info {float Value;string Link;string Title; }; const int BUFFERSIZE = 10000; const int LOOKUP = 100; const int ASIZE = 300; //max length assumed of <a tag, string RepairTitle(string& Title) {string Result = "";for(string::iterator siter = Title.begin(); siter != Title.end(); siter++){unsigned char ch = *siter;if(ch == 0x0d || ch == 0x0a || ch == ' ' || ch == '\t'){if(*Result.rbegin() != '_')Result.push_back('_');}else Result.push_back(ch);}return Result; }bool HtmlParser(const string& FileName, map<string, link_info>& LinkInfo) {int i = 2000;FILE *fp;size_t ReadIn;char Dst[ASIZE];char buffer[BUFFERSIZE + 1];string Modified_Line;fp = fopen(FileName.c_str(), "rb");while(fp){ReadIn = fread(buffer, 1, BUFFERSIZE, fp);fseek(fp, - LOOKUP, SEEK_CUR);if(ReadIn == LOOKUP) break;buffer[ReadIn] = 0;Modified_Line.clear();char *p = buffer 
;while(*p){unsigned ch = *p;if(ch >= 'A' && ch <= 'Z') Modified_Line.push_back(ch + 32);else Modified_Line.push_back(ch);p++;}string::size_type pos0;string::size_type pos1 = 0;while((pos0 = Modified_Line.find("<a", pos1)) != string::npos){string Atag, LAtag;pos1 = Modified_Line.find("</a", pos0);if(pos1 != string::npos){ if(pos1 - pos0 + 4 >= ASIZE) //make sure that Atag.size() < Asizecontinue;memset(Dst, 0, ASIZE);Atag = strncpy(Dst, buffer + pos0, pos1 - pos0 + 4); LAtag = Modified_Line.substr(pos0, pos1 - pos0 + 4);link_info tmpLink;{string::size_type pos0, pos1;pos1 = LAtag.find("</a");while(LAtag[pos1 - 1] == '>'){pos1 = LAtag.find_last_of("<", pos1 - 1);if(pos1 == 0) break;}pos0 = LAtag.find_last_of(">", pos1);string tmpstr = Atag.substr(pos0 + 1, pos1 - pos0 - 1);tmpLink.Title = RepairTitle(tmpstr);; }{string::size_type pos0, pos1;pos0 = LAtag.find("href",0);pos0 = LAtag.find_first_not_of("=\"\' ",pos0 + 4); // ",', ,=pos1 = LAtag.find_first_of("\"\' >", pos0 + 1); // ",', ,>tmpLink.Link = Atag.substr(pos0, pos1 - pos0); }tmpLink.Value = (i--) * 0.0005;if(tmpLink.Title.size() > 3 && tmpLink.Link.size() > 3) //filter: the filename.size() at least 3LinkInfo.insert(make_pair(tmpLink.Link, tmpLink)); //filter: the Link must be unique}}}return true; }

總結

以上是生活随笔為你收集整理的用C++解析HTTP下载下来的HTML文档的全部內容，希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯，歡迎將生活随笔推薦給好友。

上一篇：近找到了一个免费的python教程，两周
下一篇：魔法师突然出现的C++

日韩av黄I国产麻豆传媒I国产91av视频在线观看I日韩一区二区三区在线看I美女国产在线I麻豆视频国产在线观看I成人黄色短片

用C++解析HTTP下载下来的HTML文档

BlueCatTools 百度收錄批量查詢工具

總結