00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022 #ifndef _GRAPH_TOKENIZER_H
00023 #define _GRAPH_TOKENIZER_H
00024
00025 #include <fstream>
00026 #include "graph_vat.h"
00027 #include "element_parser.h"
00028 #include "generic_classes.h"
00029 #include "tokenizer_utils.h"
00030
00031 using namespace std;
00032
00033
00041 template<typename PP, typename MP, typename TP, typename PAT_ST, template<class, typename, typename, template <typename> class> class CC,
00042 template <typename> class ALLOC >
00043 class tokenizer<GRAPH_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00044 {
00045 public:
00046 typedef vat<GRAPH_PROP, V_Fk1_MINE_PROP, ALLOC, std::vector > VAT;
00047
00048 tokenizer(const int max=LINE_SZ): MAXLINE(max) {}
00055 template<class SM_T>
00056 int parse_next_trans(ifstream& infile, pat_fam<GRAPH_PATTERN>& freq_pats, storage_manager<GRAPH_PATTERN,
00057 VAT, ALLOC, SM_T>& vat_hmap) {
00058 char* line=new char[MAXLINE];
00059 char word[MAXLINE];
00060 char* startline=line;
00061
00062 int len;
00063 int count;
00064 int tid=-1;
00065 int num_items=-1;
00066 int pos;
00067 VAT* gvat;
00068 GRAPH_PATTERN* g1;
00069
00070 map<int, typename GRAPH_PATTERN::VERTEX_T> vid_to_lbl;
00071
00072 typename map<int, typename GRAPH_PATTERN::VERTEX_T>::iterator tmp_it;
00073
00074 while(1) {
00075 pos=infile.tellg();
00076 line=startline;
00077 *line='\0';
00078 infile.getline(line, MAXLINE-1);
00079 len=strlen(line);
00080 if(!len || !line) {
00081 delete[] startline;
00082 return tid;
00083 }
00084
00085 line[len++]='\0';
00086 count=0;
00087
00088 if(line[0]=='#')
00089 continue;
00090
00091 if(!(line=parse_word()(line, word))) {
00092
00093 delete[] startline;
00094 return -1;
00095 }
00096
00097 if(word[0]=='t') {
00098 if(tid!=-1) {
00099 infile.seekg(pos);
00100 delete[] startline;
00101 return tid;
00102
00103 }
00104
00105 line=parse_word()(line, word);
00106 if(!line) {
00107
00108 delete[] startline;
00109 return -1;
00110 }
00111
00112 line=parse_word()(line, word);
00113 if(!line) {
00114
00115 delete[] startline;
00116 return -1;
00117 }
00118 tid=atoi(word);
00119
00120 }
00121 else if(word[0]=='v') {
00122 num_items=2;
00123 int vid=0;
00124 typename GRAPH_PATTERN::VERTEX_T v_lbl;
00125
00126 while(count<num_items) {
00127 if(!(line=parse_word()(line, word))) {
00128
00129 delete[] startline;
00130 return -1;
00131 }
00132 switch(count) {
00133 case 0: vid=atoi(word); break;
00134 case 1:
00135 v_lbl=el_prsr.parse_element(word);
00142 vid_to_lbl.insert(make_pair(vid, v_lbl));
00143 }
00144 count++;
00145
00146 }
00147
00148 }
00149 else if(word[0]=='e') {
00153 int vid1, vid2;
00154 typename GRAPH_PATTERN::EDGE_T e_lbl;
00155 typename GRAPH_PATTERN::VERTEX_T v_lbl1, v_lbl2;
00156 num_items=3;
00157 bool swap_vids;
00158
00159 while(count<num_items) {
00160 if(!(line=parse_word()(line, word))) {
00161
00162 delete[] startline;
00163 return -1;
00164 }
00165
00166 switch(count) {
00167 case 0:
00168 vid1=atoi(word);
00169 if((tmp_it=vid_to_lbl.find(vid1))==vid_to_lbl.end()) {
00170 cerr<<"graph_tokenizer.parse_next_trans: vid "<<vid1<<" not found in vid_to_lbl"<<endl;
00171 return -1;
00172 }
00173 v_lbl1=tmp_it->second;
00174 break;
00175
00176 case 1:
00177 vid2=atoi(word);
00178 if((tmp_it=vid_to_lbl.find(vid2))==vid_to_lbl.end()) {
00179 cerr<<"graph_tokenizer.parse_next_trans: vid "<<vid2<<" not found in vid_to_lbl"<<endl;
00180 return -1;
00181 }
00182 v_lbl2=tmp_it->second;
00183 break;
00184
00185 case 2:
00186 e_lbl=edge_prsr.parse_element(word);
00193
00195 g1=new GRAPH_PATTERN;
00196 if(v_lbl1<=v_lbl2) {
00197 make_edge(g1, v_lbl1, v_lbl2, e_lbl);
00198 swap_vids=0;
00199 }
00200 else {
00201 make_edge(g1, v_lbl2, v_lbl1, e_lbl);
00202 swap_vids=1;
00203 }
00204
00209
00210 if(!(gvat=vat_hmap.get_vat(g1))) {
00211 gvat=new VAT;
00212 if(!swap_vids)
00213 gvat->insert_occurrence_tid(tid, make_pair(vid1, vid2));
00214 else
00215 gvat->insert_occurrence_tid(tid, make_pair(vid2, vid1));
00216
00217 gvat->insert_vid_tid(vid1);
00218 gvat->insert_vid(vid2);
00219 vat_hmap.add_vat(g1, gvat);
00220 freq_pats.push_back(g1);
00221
00222 }
00223 else if(gvat->back().first!=tid) {
00224 if(!swap_vids)
00225 gvat->insert_occurrence_tid(tid, make_pair(vid1, vid2));
00226 else
00227 gvat->insert_occurrence_tid(tid, make_pair(vid2, vid1));
00228
00229 gvat->insert_vid_tid(vid1);
00230 gvat->insert_vid(vid2);
00231 delete g1;
00232 }
00233 else {
00234 if(!swap_vids)
00235 gvat->insert_occurrence(make_pair(vid1, vid2));
00236 else
00237 gvat->insert_occurrence(make_pair(vid2, vid1));
00238
00239 gvat->insert_vid_hs(vid1);
00240 gvat->insert_vid(vid2);
00241 delete g1;
00242 }
00243
00244 }
00245 count++;
00246 }
00247
00248 }
00249 else {
00250 cerr<<"graph.tokenizer.parse_next_trans: Unidentifiable line="<<line<<endl;
00251 return -1;
00252 }
00253 }
00254
00255 return tid;
00256
00257 }
00258
00259 private:
00260 int MAXLINE;
00261 element_parser<typename GRAPH_PATTERN::VERTEX_T> el_prsr;
00262 element_parser<typename GRAPH_PATTERN::EDGE_T> edge_prsr;
00264 };
00265
00266 #endif
00267