當前位置:
首頁 >
用C++解析HTTP下载下来的HTML文档
發布時間:2025/3/15
41
豆豆
生活随笔
收集整理的這篇文章主要介紹了
用C++解析HTTP下载下来的HTML文档
小編覺得挺不錯的,現在分享給大家,幫大家做個參考。
最近跟朋友一起寫了一個批量網站查詢工具 BlueCatTools,其中,需要用C++解析HTTP下載下來的HTML文檔。
?
懂的人不用我多說,不懂的我也沒能力說到你懂,看代碼吧。
BlueCatTools 百度收錄批量查詢工具
//--caller.cpp-- // to run the program, you should make sure that, there is a "NIKE新浪競技風暴_新浪網(wǎng).htm" in your working directory. // The program run time can be saved about a half if you give a better implementation of the "ofile <<" stament; #include "HtmlParser.h" #include <ctime> #include <iomanip> using namespace std; void main() {clock_t start = clock();map<string, link_info> LinkInfo;multimap<float, link_info, greater<float> > Sorted;string FileName = "NIKE新浪競技風暴_新浪網(wǎng).htm";HtmlParser(FileName, LinkInfo);string Result;for(map<string, link_info>::iterator miter = LinkInfo.begin(); miter != LinkInfo.end(); miter++){Sorted.insert(make_pair(miter->second.Value, miter->second));}ofstream ofile;ofile.open("a.txt");for(multimap<float, link_info, greater<float> >::iterator miter = Sorted.begin(); miter != Sorted.end(); miter++){ofile << miter->first << "\t"<<setw(50) << left << miter->second.Title << "\t"<< miter->second.Link << endl;}ofile.close();cout << clock() - start << endl; }//--HtmlParser.h--/ #pragma once #include <cstdio> #include <iostream> #include <fstream> #include <string> #include <map> using namespace std; struct link_info {float Value;string Link;string Title; }; const int BUFFERSIZE = 10000; const int LOOKUP = 100; const int ASIZE = 300; //max length assumed of <a tag, string RepairTitle(string& Title) {string Result = "";for(string::iterator siter = Title.begin(); siter != Title.end(); siter++){unsigned char ch = *siter;if(ch == 0x0d || ch == 0x0a || ch == ' ' || ch == '\t'){if(*Result.rbegin() != '_')Result.push_back('_');}else Result.push_back(ch);}return Result; }bool HtmlParser(const string& FileName, map<string, link_info>& LinkInfo) {int i = 2000;FILE *fp;size_t ReadIn;char Dst[ASIZE];char buffer[BUFFERSIZE + 1];string Modified_Line;fp = fopen(FileName.c_str(), "rb");while(fp){ReadIn = fread(buffer, 1, BUFFERSIZE, fp);fseek(fp, - LOOKUP, SEEK_CUR);if(ReadIn == LOOKUP) break;buffer[ReadIn] = 0;Modified_Line.clear();char *p = buffer 
;while(*p){unsigned ch = *p;if(ch >= 'A' && ch <= 'Z') Modified_Line.push_back(ch + 32);else Modified_Line.push_back(ch);p++;}string::size_type pos0;string::size_type pos1 = 0;while((pos0 = Modified_Line.find("<a", pos1)) != string::npos){string Atag, LAtag;pos1 = Modified_Line.find("</a", pos0);if(pos1 != string::npos){ if(pos1 - pos0 + 4 >= ASIZE) //make sure that Atag.size() < Asizecontinue;memset(Dst, 0, ASIZE);Atag = strncpy(Dst, buffer + pos0, pos1 - pos0 + 4); LAtag = Modified_Line.substr(pos0, pos1 - pos0 + 4);link_info tmpLink;{string::size_type pos0, pos1;pos1 = LAtag.find("</a");while(LAtag[pos1 - 1] == '>'){pos1 = LAtag.find_last_of("<", pos1 - 1);if(pos1 == 0) break;}pos0 = LAtag.find_last_of(">", pos1);string tmpstr = Atag.substr(pos0 + 1, pos1 - pos0 - 1);tmpLink.Title = RepairTitle(tmpstr);; }{string::size_type pos0, pos1;pos0 = LAtag.find("href",0);pos0 = LAtag.find_first_not_of("=\"\' ",pos0 + 4); // ",', ,=pos1 = LAtag.find_first_of("\"\' >", pos0 + 1); // ",', ,>tmpLink.Link = Atag.substr(pos0, pos1 - pos0); }tmpLink.Value = (i--) * 0.0005;if(tmpLink.Title.size() > 3 && tmpLink.Link.size() > 3) //filter: the filename.size() at least 3LinkInfo.insert(make_pair(tmpLink.Link, tmpLink)); //filter: the Link must be unique}}}return true; }
總結(jié)
以上是生活随笔為你收集整理的用C++解析HTTP下载下来的HTML文档的全部內容,希望文章能夠幫你解決所遇到的問題。
- 上一篇: 近找到了一个免费的python教程,两周
- 下一篇: 魔法师突然出现的C++