读书人

大数据处理例之STL实战习题

发布时间: 2013-03-21 10:08:17 作者: rapoo

大数据处理例之STL实战练习

找出3000000条字符串中使用频率最高的10条

要求:高效

目的:锻炼STL的用法,涉及到STL各大类组件使用,包括自定义hash索引类,自定义hash函数,STL堆算法,C++类的设计方法等

主要用到hash_map和heap

生成原始数据脚本(matlab文件):

base = 97;alpha = 26;total = 3000000;max_len = 4;min_len = 1;filename = 'data.txt';fid = fopen(filename,'w+');for n=1:total    seed=mod(floor(rand()*100),max_len)+min_len;    for m=min_len:seed        fprintf(fid,char(mod(floor(rand()*100),alpha)+base));    end    fprintf(fid,'\n');endfclose(fid);
生成的数据文件大致为data.txt:
zcvyhkfpjohrdrcukvba
...
STL C++求解实现:
#include <functional>#include <algorithm>#include <iostream>#include <fstream>#include <string>#include <hash_map>using namespace std;using namespace stdext;class Node{public:Node(){_str="";};Node(const Node &n){_str = n.getv();};Node(const std::string & str):_str(str){};Node(const char * pstr):_str(pstr){};~Node(){};std::string  getv() const{return _str;}void setv(const std::string & str){_str = str;}Node & operator = (const Node & m){_str = m.getv();return *this;}friend bool operator < (const Node &n,const Node &m){return n._str < m._str;}friend bool operator == (const Node &n,const Node &m){return n._str == m._str;}friend ostream & operator << (ostream& o,const Node&n){o<<n.getv();return o;}protected:private:std::string _str;};inline size_t node_hash_value(const Node & n){return stdext::hash_value(n.getv().c_str());}struct node_hash_compare:public hash_compare<Node>{size_t operator ()(const Node &n)const{return node_hash_value(n);}bool operator () (const Node & n, const Node &m)const{return n.getv() < m.getv();}};ostream & operator << (std::ostream& o,const pair< Node,int> &n ){o<<n.first<<":"<<n.second;return o;}struct node_hash_greater{bool operator()(const pair<Node,int> &m,const pair<Node,int> &n){return m.second > n.second;}};const char *filename = "data.txt";const int find_num = 10;int main(){ifstream out;hash_map<Node,int,node_hash_compare> nodes;out.open(filename, ios::in);std::string line;  while(!out.eof())  {  std::getline(out,line);if (line.size() == 0){continue;}Node n(line);if (nodes.count(n) > 0){nodes[n] = nodes[n] + 1;}else{nodes[n] = 1;} }//copy(nodes.begin(),nodes.end(),std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));out.close();  pair<Node,int>  A[find_num+1];if (nodes.size() < find_num){copy(nodes.begin(),nodes.end(),std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));return 0;}hash_map<Node,int,node_hash_compare>::iterator it = nodes.begin();int s=0;for (;s<find_num;s++,it++){A[s].first = it->first;A[s].second = it->second;}make_heap(A, A + find_num,node_hash_greater());cout<<"\nheap start:\n";copy(A, A + find_num,std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));for (;it != nodes.end();it++){if (it->second > A[0].second){A[find_num].first = it->first;A[find_num].second = it->second;push_heap(A,A+find_num+1,node_hash_greater());pop_heap(A,A+find_num+1,node_hash_greater());}}sort_heap(A, A + find_num,node_hash_greater());cout<<"\nresult:\n";copy(A, A + find_num,std::ostream_iterator< std::pair< Node, int> >(cout,"\n"));return 0;}
 

结果:

大数据处理例之STL实战习题


读书人网 >编程

热点推荐