日韩性视频-久久久蜜桃-www中文字幕-在线中文字幕av-亚洲欧美一区二区三区四区-撸久久-香蕉视频一区-久久无码精品丰满人妻-国产高潮av-激情福利社-日韩av网址大全-国产精品久久999-日本五十路在线-性欧美在线-久久99精品波多结衣一区-男女午夜免费视频-黑人极品ⅴideos精品欧美棵-人人妻人人澡人人爽精品欧美一区-日韩一区在线看-欧美a级在线免费观看

歡迎訪問 生活随笔!

生活随笔

當前位置: 首頁 > 编程资源 > 编程问答 >内容正文

编程问答

一个文本分词程序

發布時間:2023/12/6 编程问答 30 豆豆
生活随笔 收集整理的這篇文章主要介紹了 一个文本分词程序,小編覺得挺不錯的,現在分享給大家,幫大家做個參考。

WordMap類從分詞庫中讀入分詞

將分詞存入unordered_map<std::string, int> 中

// ===== wordmap.h =====
#pragma once
#include <ctime>
#include <istream>
#include <string>
#include <unordered_map>

/// Dictionary of segmentation words loaded from a text file.
///
/// init() reads the file line by line. A usable line starts with an integer
/// followed by a word; an optional second integer is stored as the value for
/// that word in m_map (0 when it is absent).
class WordMap {
public:
    /// @param filename path of the dictionary file that init() will read.
    WordMap(const std::string& filename);
    ~WordMap();

    /// Loads the dictionary from m_filename into m_map.
    /// Progress and start/end times are written to std::cout.
    /// @return false if the file cannot be opened, true otherwise.
    bool init();

    std::unordered_map<std::string, int> m_map;  ///< word -> last integer column of its line
    std::string m_filename;                      ///< path passed to the constructor

private:
    /// Minimum number of seconds between two progress messages.
    /// (Formerly named `difftime`, which shadowed the C library function.)
    std::time_t m_progressInterval;

    /// Formats the local time of day of `when` as "hour:min:sec" (no padding).
    static std::string timestr(std::time_t when);
};

// ===== wordmap.cpp =====
// When this listing is split into two files, wordmap.cpp starts with:
//     #include "wordmap.h"
#include <fstream>
#include <iostream>
#include <sstream>

WordMap::WordMap(const std::string& filename)
    : m_filename(filename), m_progressInterval(5) {}

WordMap::~WordMap() {}

bool WordMap::init() {
    std::ifstream input(m_filename);
    if (!input.is_open()) {
        std::cerr << "can't not open file:" << m_filename;
        return false;
    }

    std::time_t last = std::time(nullptr);
    std::cout << "開始初始化分詞庫,當前時間" << timestr(last) << std::endl;

    std::string line;
    while (std::getline(input, line)) {
        std::istringstream fields(line);
        int index = 0;  // first column; read only to get past it
        std::string word;
        if (!(fields >> index >> word)) {
            // Blank or malformed line. The original code fell through here and
            // stored an uninitialized integer under the empty key.
            continue;
        }
        int weight = 0;    // stays 0 when the line has no third column
        fields >> weight;  // a failed extraction on an EOF stream leaves `weight` untouched
        m_map[word] = weight;

        const std::time_t cur = std::time(nullptr);
        if (cur - last > m_progressInterval) {
            std::cout << "已初始化分詞個數:" << m_map.size() << std::endl;
            last = cur;
        }
    }

    std::cout << "結束初始化分詞庫,當前時間" << timestr(std::time(nullptr)) << std::endl;
    return true;
}

std::string WordMap::timestr(std::time_t when) {
    // std::localtime is the portable replacement for the MSVC-only
    // localtime_s(tm*, const time_t*) overload. It returns a pointer to shared
    // static storage, so the fields are consumed immediately.
    const std::tm* parts = std::localtime(&when);
    if (parts == nullptr) {
        return std::string();
    }
    std::ostringstream out;
    out << parts->tm_hour << ":" << parts->tm_min << ":" << parts->tm_sec;
    return out.str();
}

從文本中讀入,對文本進行分詞,分詞方法詳見

http://yangshangchuan.iteye.com/blog/2031813

以下是實現

#pragma once #include<string> using std::string; #include<vector> using std::vector; #include"wordmap.h" class FindWord { public:FindWord() {};~FindWord() {};vector<string> GetKeyWords(const string& filename,const WordMap& wordmap); private:int wsize = 5;bool ischinese(const char* c); public:int getlocalfindstring(const string& ostring, int begpos); }; @ -0,0 +1,71 @@ #include "findword.h" #include<fstream> #include<sstream> #include<iostream> using std::ifstream; using std::istringstream; vector<string> FindWord::GetKeyWords(const string & filename, const WordMap& wordmap) {vector<string> l_keyword;ifstream inputfile(filename);if (!inputfile.is_open()){std::cerr << "cann't not open file:" << filename;return l_keyword;}string sinput;string last;while (std::getline(inputfile, sinput)){last = sinput;int begpos = 0;int length;while ((length = getlocalfindstring(last, begpos)) != 0){int movelen = ischinese(&last[begpos]) ? 2:1;int findlen = -1;while (movelen<=length){string ls = last.substr(begpos, movelen);auto res = wordmap.m_map.find(ls);if (res != wordmap.m_map.end()){findlen = movelen;}movelen += ischinese(&last[begpos + movelen]) ? 2 : 1;}if (findlen != -1){l_keyword.push_back(last.substr(begpos, findlen));begpos = begpos + findlen;}else {begpos += length;}}}return l_keyword; }bool FindWord::ischinese(const char* c) {unsigned char cur = *c;unsigned char next = *(c + 1);if (next == 0)return false;return (cur >= 0xB0 && cur <= 0xF7) && (next >= 0xA1 && next <= 0xFE); }int FindWord::getlocalfindstring(const string& ostring,int begpos) {int size = wsize;int endpos = begpos;while (size > 0 && ostring[endpos]){if (ischinese(&ostring[endpos])) {endpos++;}size--;endpos++;}return endpos-begpos; }

樣例程序

@ -0,0 +1,16 @@ #include"wordmap.h" #include<iostream> #include<string> #include"findword.h" using std::string; int main() {WordMap m_wordmap("../../../word/word1.txt");FindWord m_findword;if (!m_wordmap.init()) { return 0; };vector<string> res= m_findword.GetKeyWords("../../../inputfile/1999.txt", m_wordmap);for (auto elems : res)std::cout << elems << " ";return 0; }

github:https://github.com/wuzhuorui/kjct.git

轉載于:https://www.cnblogs.com/creativityroom/p/8472253.html

總結

以上是生活随笔為你收集整理的一个文本分词程序的全部內容,希望文章能夠幫你解決所遇到的問題。

如果覺得生活随笔網站內容還不錯,歡迎將生活随笔推薦給好友。