graph_tokenizer.h

00001 /*
00002  *  Copyright (C) 2005 M.J. Zaki <zaki@cs.rpi.edu> Rensselaer Polytechnic Institute
00003  *  Written by parimi@cs.rpi.edu
00004  *  Updated by chaojv@cs.rpi.edu, alhasan@cs.rpi.edu, salems@cs.rpi.edu
00005  *  Modifications:
00006  *      Added tokenizer properties -- Zaki, 5/8/06
00007  *
00008  *  This program is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU General Public License
00010  *  as published by the Free Software Foundation; either version 2
00011  *  of the License, or (at your option) any later version.
00012  *
00013  *  This program is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  *  GNU General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU General Public License along
00019  *  with this program; if not, write to the Free Software Foundation, Inc.,
00020  *  59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
00021  */
00022 #ifndef _GRAPH_TOKENIZER_H
00023 #define _GRAPH_TOKENIZER_H
00024 
00025 #include <fstream>
00026 #include "graph_vat.h"
00027 #include "element_parser.h"
00028 #include "generic_classes.h"
00029 #include "tokenizer_utils.h"
00030 
00031 using namespace std;
00032 
00033 
00041 template<typename PP, typename MP, typename TP, typename PAT_ST, template<class, typename, typename, template <typename> class> class CC, 
00042 template <typename> class ALLOC >
00043 class tokenizer<GRAPH_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00044 {
00045 public:
00046   typedef vat<GRAPH_PROP, V_Fk1_MINE_PROP, ALLOC, std::vector > VAT;
00047   
00048   tokenizer(const int max=LINE_SZ): MAXLINE(max) {} 
00055   template<class SM_T>
00056   int parse_next_trans(ifstream& infile, pat_fam<GRAPH_PATTERN>& freq_pats, storage_manager<GRAPH_PATTERN, 
00057                        VAT, ALLOC, SM_T>& vat_hmap) {
00058     char* line=new char[MAXLINE];
00059     char word[MAXLINE];
00060     char* startline=line;
00061     
00062     int len;
00063     int count; //# of words parsed from line
00064     int tid=-1;
00065     int num_items=-1; //# of words to be read from this line
00066     int pos; //stores starting position of input stream's get pointer
00067     VAT* gvat;
00068     GRAPH_PATTERN* g1;
00069     
00070     map<int, typename GRAPH_PATTERN::VERTEX_T> vid_to_lbl; // map from vertex-id 
00071                                  // to its label
00072     typename map<int, typename GRAPH_PATTERN::VERTEX_T>::iterator tmp_it;
00073     
00074     while(1) {
00075       pos=infile.tellg();
00076       line=startline;
00077       *line='\0';
00078       infile.getline(line, MAXLINE-1);
00079       len=strlen(line);
00080       if(!len || !line) {
00081         delete[] startline;
00082         return tid;
00083       }
00084       
00085       line[len++]='\0';
00086       count=0;
00087       
00088       if(line[0]=='#') // comment line
00089         continue;
00090       
00091       if(!(line=parse_word()(line, word))) {
00092         //parse_word() failed
00093         delete[] startline;
00094         return -1;
00095       }
00096       
00097       if(word[0]=='t') { // this is the tid line
00098         if(tid!=-1) { // this is a new tid, stop here
00099           infile.seekg(pos);
00100           delete[] startline;
00101           return tid; // this is the line from where function should 
00102                 // return on most calls 
00103         }
00104         
00105         line=parse_word()(line, word); // read in the '#'
00106         if(!line) {
00107           //parse_word() failed
00108           delete[] startline;
00109           return -1;
00110         }
00111         
00112         line=parse_word()(line, word); // read in the tid
00113         if(!line) {
00114           //parse_word() failed
00115           delete[] startline;
00116           return -1;
00117         }
00118         tid=atoi(word);
00119         
00120       }//if word[0]=='t'
00121       else if(word[0]=='v') { // this is a vid-line
00122         num_items=2; // 2 more words to be parsed from this line
00123         int vid=0;
00124         typename GRAPH_PATTERN::VERTEX_T v_lbl;
00125         
00126         while(count<num_items) {
00127           if(!(line=parse_word()(line, word))) {
00128             // parse_word() failed
00129             delete[] startline;
00130             return -1;
00131           }
00132           switch(count) {
00133             case 0: vid=atoi(word); break;
00134             case 1:
00135               v_lbl=el_prsr.parse_element(word); 
00142               vid_to_lbl.insert(make_pair(vid, v_lbl));
00143           }
00144           count++;
00145           
00146         }//while(count<..)
00147         
00148       }//if word[0]=='v'
00149       else if(word[0]=='e') { // undirected edge
00153         int vid1, vid2;
00154         typename GRAPH_PATTERN::EDGE_T e_lbl;
00155         typename GRAPH_PATTERN::VERTEX_T v_lbl1, v_lbl2;
00156         num_items=3; // 3 more words to be parsed
00157         bool swap_vids; // flag=false if v_lbl1<v_lbl2
00158         
00159         while(count<num_items) {
00160           if(!(line=parse_word()(line, word))) {
00161             // parse_word() failed
00162             delete[] startline;
00163             return -1;
00164           }
00165           
00166           switch(count) {
00167             case 0: 
00168               vid1=atoi(word); 
00169               if((tmp_it=vid_to_lbl.find(vid1))==vid_to_lbl.end()) {
00170                 cerr<<"graph_tokenizer.parse_next_trans: vid "<<vid1<<" not found in vid_to_lbl"<<endl;
00171                 return -1;
00172               }
00173                 v_lbl1=tmp_it->second;
00174               break;
00175               
00176             case 1: 
00177               vid2=atoi(word);
00178               if((tmp_it=vid_to_lbl.find(vid2))==vid_to_lbl.end()) {
00179                 cerr<<"graph_tokenizer.parse_next_trans: vid "<<vid2<<" not found in vid_to_lbl"<<endl;
00180                 return -1;
00181               }
00182                 v_lbl2=tmp_it->second;
00183               break;
00184               
00185             case 2: 
00186               e_lbl=edge_prsr.parse_element(word); 
00193               
00195               g1=new GRAPH_PATTERN;
00196               if(v_lbl1<=v_lbl2) {
00197                 make_edge(g1, v_lbl1, v_lbl2, e_lbl);
00198                 swap_vids=0;
00199               }
00200                 else {
00201                   make_edge(g1, v_lbl2, v_lbl1, e_lbl);
00202                   swap_vids=1;
00203                 }
00204                 
00209                 
00210                 if(!(gvat=vat_hmap.get_vat(g1))) { // vat not found
00211                   gvat=new VAT;
00212                   if(!swap_vids)
00213                     gvat->insert_occurrence_tid(tid, make_pair(vid1, vid2));
00214                   else
00215                     gvat->insert_occurrence_tid(tid, make_pair(vid2, vid1));
00216                   
00217                   gvat->insert_vid_tid(vid1);
00218                   gvat->insert_vid(vid2);
00219                   vat_hmap.add_vat(g1, gvat); // add pattern-vat mapping
00220                   freq_pats.push_back(g1); // this is the first time 
00221                                // this pattern has been encountered, so add it
00222                 }
00223                 else if(gvat->back().first!=tid) { // or, new tid
00224                   if(!swap_vids)
00225                     gvat->insert_occurrence_tid(tid, make_pair(vid1, vid2));
00226                   else
00227                     gvat->insert_occurrence_tid(tid, make_pair(vid2, vid1));
00228                   
00229                   gvat->insert_vid_tid(vid1);
00230                   gvat->insert_vid(vid2);
00231                   delete g1;
00232                 }
00233                 else { // assert: gvat->back().first=tid
00234                   if(!swap_vids)
00235                     gvat->insert_occurrence(make_pair(vid1, vid2));
00236                   else
00237                     gvat->insert_occurrence(make_pair(vid2, vid1));
00238                   
00239                   gvat->insert_vid_hs(vid1);
00240                   gvat->insert_vid(vid2);
00241                   delete g1;
00242                 }
00243                 
00244           }//switch
00245           count++;
00246         }//while(count<..)
00247         
00248       }//if(word[0]=='u')
00249       else {
00250         cerr<<"graph.tokenizer.parse_next_trans: Unidentifiable line="<<line<<endl;
00251         return -1;
00252       }
00253     }//while(1)
00254     
00255     return tid;
00256     
00257   }//parse_next_trans()
00258   
00259 private:
00260   int MAXLINE; 
00261   element_parser<typename GRAPH_PATTERN::VERTEX_T> el_prsr; 
00262   element_parser<typename GRAPH_PATTERN::EDGE_T> edge_prsr; 
00264 }; //end class tokenizer
00265 
00266 #endif
00267 

Generated on Wed Jul 26 14:01:08 2006 for DMTL by  doxygen 1.4.7