tree_tokenizer.h

00001 /*
00002  *  Copyright (C) 2005 M.J. Zaki <zaki@cs.rpi.edu> Rensselaer Polytechnic Institute
00003  *  Written by parimi@cs.rpi.edu
00004  *  Updated by chaojv@cs.rpi.edu, alhasan@cs.rpi.edu, salems@cs.rpi.edu
00005  *  Modifications:
00006  *    Added tokenizer properties -- Zaki, 5/8/06
00007  *
00008  *  This program is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU General Public License
00010  *  as published by the Free Software Foundation; either version 2
00011  *  of the License, or (at your option) any later version.
00012  *
00013  *  This program is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  *  GNU General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU General Public License along
00019  *  with this program; if not, write to the Free Software Foundation, Inc.,
00020  *  59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
00021  */
00022 // tokenizer for trees
00023 
00024 #ifndef _TREE_TOKENIZER_H
00025 #define _TREE_TOKENIZER_H
00026 
00027 #include "pattern.h"
00028 #include "pat_fam.h"
00029 #include "tree_vat.h"
00030 #include "element_parser.h"
00031 #include "generic_classes.h"
00032 #include "tokenizer_utils.h"
00033 
00041 template<typename PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class > class CC,
00042 template <typename> class ALLOC >
00043 class tokenizer<TREE_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00044 {
00045   
00046 public:
00047   typedef vat<TREE_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector> VAT;
00048   typedef tree_instance<std::vector, PP > TREE_INSTANCE;
00049   
00050   
00051   tokenizer(const int max=LINE_SZ): MAXLINE(max) {}
00052   
00053   
00060   template<class SM_T>
00061     int parse_next_trans(ifstream& infile, pat_fam<TREE_PATTERN>& freq_pats, storage_manager<TREE_PATTERN, VAT, ALLOC, SM_T>& vat_hmap) {
00062       char* line=new char[MAXLINE];
00063       char word[MAXLINE];
00064       char* startline=line;
00065       const int BK_TRK=-1; // backtrack symbol for trees
00066       
00067       int len;
00068       int count; //# of words parsed from line
00069       int tid=-1;
00070       int dfs_id=-1;
00071       int num_items=3; //# of objects on this transaction
00072       VAT* tvat;
00073       TREE_PATTERN* p;
00074       int depth; // maintains current depth in tree
00075       vector<pair<typename TREE_PATTERN::VERTEX_T, pair<int, bool> > > obj_data;
00076       // stores info for scope computation for generating level-1 vats
00077       // the first int in 2nd pair is current scope for pattern
00078       // bool is if the object is still in scope
00079       
00080       typedef element_parser<typename TREE_PATTERN::VERTEX_T> V_P;
00081       
00082       *line='\0';
00083       infile.getline(line, MAXLINE-1);
00084       len=strlen(line);
00085       if(!len || !line) {
00086         delete[] startline;
00087         return -1;
00088       }
00089       
00090       line[len++]='\0';
00091       count=0;
00092       depth=0;
00093       //obj_data.reserve(MAXLINE-1);
00094       
00095       while(count<num_items+3 && line<(startline+len)) {
00096         if(!(line=parse_word()(line, word))) {
00097           //parse_word() failed
00098           delete[] startline;
00099           return -1;
00100         }
00101         count++;
00102         
00103         switch(count) {
00104           case 1:
00105             //this is tid
00106             tid=atoi(word); break;
00107             
00108           case 2: 
00109             //this is cid
00110             break;        
00111           case 3:
00112             //this is # of elements on line
00113             num_items=atoi(word);
00114             break;
00115             
00116           default:
00117             typename TREE_PATTERN::VERTEX_T v=el_prsr.parse_element(word);
00118             
00119             if(V_P::notEq(v, V_P::convert(BK_TRK))) {
00120               dfs_id++;
00121               depth++;
00122               for(unsigned int i=0; i<obj_data.size(); i++)
00123                 if(obj_data[i].second.second)
00124                   obj_data[i].second.first++;
00125               obj_data.push_back(make_pair(v, make_pair(0, 1)));
00126             }
00127               else {
00128                 depth--;
00129                 for(unsigned int i=0; i<obj_data.size(); i++) {
00130                   obj_data[i].second.first--;
00131                   if(obj_data[i].second.second && (obj_data[i].second.first<0)) {
00132                     // add this pattern & scope to VAT
00133                     obj_data[i].second.second=0;
00134                     p=new TREE_PATTERN;
00135                     p->add_vertex(obj_data[i].first);
00136                     p->init_canonical_code(obj_data[i].first);
00137                     
00138                     if((tvat=vat_hmap.get_vat(p))) {
00139                       // vat found, check if this tid occurs
00140                       typename VAT::IT it=tvat->end()-1;
00141                       if(tvat->size() && it->first==tid)
00142                         it->second.push_back(TREE_INSTANCE(i, dfs_id, depth, 1));
00143                       else {
00144                         typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00145                         vec_inst.push_back(TREE_INSTANCE(i, dfs_id, depth, 1));
00146                         tvat->push_back(make_pair(tid, vec_inst));
00147                       }
00148                       delete p;
00149                     }
00150                     else {
00151                       // vat not found, create new one and insert it
00152                       tvat=new VAT;
00153                       typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00154                       vec_inst.push_back(TREE_INSTANCE(i, dfs_id, depth, 1));
00155                       tvat->push_back(make_pair(tid, vec_inst));
00156                       if(!vat_hmap.add_vat(p, tvat)) {
00157                         cerr<<"tokenizer.get_length_one(tree): add_vat failed"<<endl;
00158                         return -1;
00159                       }
00160                       freq_pats.push_back(p);
00161                     }
00162                     
00163                   }//end if obj_data..<0
00164                 }//end for i
00165               }//end else v!=BK_TRK
00166               
00167         }//end switch
00168         
00169       }//end while(count<..)
00170       
00171       // special case: root of tree has not been added yet
00172       if(!obj_data[0].second.second) {
00173         cerr<<"tokenizer.get_length_one(tree): root of tree has invalid scope"<<endl;
00174         return -1;
00175       }
00176       
00177       // add its pattern & scope to VAT
00178       obj_data[0].second.second=0;
00179       p=new TREE_PATTERN;
00180       p->add_vertex(obj_data[0].first);
00181       p->init_canonical_code(obj_data[0].first);
00182       
00183       if((tvat=vat_hmap.get_vat(p))) {
00184         // vat found, check if this tid occurs
00185         typename VAT::IT it=tvat->end()-1;
00186         if(tvat->size() && it->first==tid)
00187           it->second.push_back(TREE_INSTANCE(0, dfs_id, 0, 1));
00188         else {
00189           typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00190           vec_inst.push_back(TREE_INSTANCE(0, dfs_id, 0, 1));
00191           tvat->push_back(make_pair(tid, vec_inst));
00192         }
00193         delete p;
00194       }
00195       else {
00196         // vat not found, create new one and insert it
00197         tvat=new VAT;
00198         typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00199         vec_inst.push_back(TREE_INSTANCE(0, dfs_id, 0, 1));
00200         tvat->push_back(make_pair(tid, vec_inst));
00201         if(!vat_hmap.add_vat(p, tvat)) {
00202           cerr<<"tokenizer.get_length_one(tree): add_vat failed"<<endl;
00203           return -1;
00204         }
00205         freq_pats.push_back(p);
00206       }
00207       
00208       //cout<<"tokenizer(trees).parse_next_trans exit"<<endl;      
00209       delete[] startline;
00210       return tid;
00211       
00212     }//end parse_next_trans()
00213   
00214 private:
00215     const int MAXLINE;
00216   element_parser<typename TREE_PATTERN::VERTEX_T> el_prsr;
00217   
00218 };//end class tokenizer for trees
00219 
00220 #endif

Generated on Wed Jul 26 14:01:08 2006 for DMTL by  doxygen 1.4.7