iset_tokenizer.h

00001 /*
00002  *  Copyright (C) 2005 M.J. Zaki <zaki@cs.rpi.edu> Rensselaer Polytechnic Institute
00003  *  Written by parimi@cs.rpi.edu
00004  *  Updated by chaojv@cs.rpi.edu, alhasan@cs.rpi.edu, salems@cs.rpi.edu
00005  *  Modifications:
00006  *    Added tokenizer properties -- Zaki, 5/8/06
00007  *
00008  *  This program is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU General Public License
00010  *  as published by the Free Software Foundation; either version 2
00011  *  of the License, or (at your option) any later version.
00012  *
00013  *  This program is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  *  GNU General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU General Public License along
00019  *  with this program; if not, write to the Free Software Foundation, Inc.,
00020  *  59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
00021  */
00022 #ifndef _ISET_TOKENIZER_H_
00023 #define _ISET_TOKENIZER_H_
00024 
00025 #include <fstream>
00026 
00027 #include "iset_can_code.h"
00028 #include "adj_list.h"
00029 #include "generic_classes.h"
00030 #include "tokenizer_utils.h"
00031 #include "typedefs.h"
00032 
00033 #include "element_parser.h"
00034 #include "pattern.h"
00035 #include "pat_fam.h"
00036 
00037 /* NOTE: the parsing scheme reads at most the first MAXLINE chars of a line;
00038 this could perhaps be replaced by a better scheme. */
00039 
00047 template<class PP, typename MP,  typename TP, typename PAT_ST, 
00048 template<class, typename, typename, template <typename> class> class CC, 
00049 template <typename> class ALLOC >
00050 class tokenizer<ISET_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00051 {
00052 
00053 public: 
00054   typedef pattern_support<V_Fkk_MINE_PROP> PAT_SUP;
00055   typedef vat<ISET_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector > VAT;
00056   typedef typename ISET_PATTERN::VERTEX_T V_T;
00057   typedef typename ISET_PATTERN::EDGE_T E_T;
00058   
00059   typedef int VAT_T; 
00061   tokenizer(int max=LINE_SZ): MAXLINE(max) {} 
00069   template<class SM_T>
00070   int parse_next_trans(std::ifstream& infile, pat_fam<ISET_PATTERN>& freq_pats, 
00071                        storage_manager<ISET_PATTERN, VAT, ALLOC, SM_T>& vat_hmap) {
00072       
00073     char* line=new char[MAXLINE];
00074     char word[MAXLINE];
00075     char* startline=line;
00076     
00077     int len;
00078     int count; //# of words parsed from line
00079     int tid=-1, ts;
00080     int num_items=3; //# of itemsets on this transaction
00081     VAT* ivat;
00082     
00083     *line='\0';
00084     infile.getline(line, MAXLINE-1);
00085     len=strlen(line);
00086     if(!len || !line) {
00087       delete[] startline;
00088       return -1;
00089     }
00090     
00091     line[len++]='\0';
00092     count=0;
00093     while(count<num_items+3 && line<(startline+len)) {
00094       
00095       if(!(line=parse_word()(line, word))) {
00096         //parse_word() failed
00097         delete[] startline;
00098         return -1;
00099       }
00100       count++;
00101       
00102       switch(count) {
00103         case 1:
00104           //this is tid
00105           tid=atoi(word); break;
00106           
00107         case 2: 
00108           //this is timestamp
00109           ts=atoi(word); break;
00110           
00111         case 3:
00112           //this is # of elements on line
00113           num_items=atoi(word);
00114           break;
00115           
00116         default:
00117           //this is an element, insert/append to its VAT
00118           ISET_PATTERN* p = new ISET_PATTERN();
00119           // Add vertex to the empty graph.
00120           V_T v = el_prsr.parse_element(word);
00121           p->add_vertex(v);
00122           p->init_canonical_code(v);
00123           
00124           //if p contains a vat in vat_hmap, append tid to the entry
00125           //else create a new vat and insert it into vat_hmap,
00126           //and add p to freq_pats
00127           ivat=vat_hmap.get_vat(p);
00128           if(vat_hmap.find(p)) {
00129             //vat found
00130             ivat->push_back(tid);
00131             delete p;
00132           }
00133           else {
00134             //create a new vat & insert it
00135             ivat=new VAT();
00136             ivat->push_back(tid);
00137               
00138             if(!vat_hmap.add_vat(p, ivat)) {
00139               cerr<<"tokenizer.get_length_one: add_vat failed"<<endl;
00140               return -1;
00141             }
00142               
00143             freq_pats.push_back(p);
00144           } //end else
00145             
00146       } //end switch
00147       
00148     } //end while(count<..)
00149     
00150     delete[] startline;
00151     
00152     return tid;
00153   } //end parse_next_trans()
00154   
00155   
00156   
00157 private:
00158   int MAXLINE; 
00159   element_parser<V_T> el_prsr; 
00161 }; //end class tokenizer<itemset>
00162 
00163 #endif

Generated on Wed Jul 26 14:01:08 2006 for DMTL by  doxygen 1.4.7