mset_tokenizer.h

00001 /*
00002  *  Copyright (C) 2005 M.J. Zaki <zaki@cs.rpi.edu> Rensselaer Polytechnic Institute
00003  *  Written by parimi@cs.rpi.edu
00004  *  Updated by chaojv@cs.rpi.edu, alhasan@cs.rpi.edu, salems@cs.rpi.edu
00005  *  Modifications:
00006  *    Added tokenizer properties -- Zaki, 5/8/06
00007  *
00008  *  This program is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU General Public License
00010  *  as published by the Free Software Foundation; either version 2
00011  *  of the License, or (at your option) any later version.
00012  *
00013  *  This program is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  *  GNU General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU General Public License along
00019  *  with this program; if not, write to the Free Software Foundation, Inc.,
00020  *  59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
00021  */
00022 #ifndef _MSET_TOKENIZER_H_
00023 #define _MSET_TOKENIZER_H_
00024 
00025 #include <fstream>
00026 
00027 #include "mset_can_code.h"
00028 #include "adj_list.h"
00029 #include "generic_classes.h"
00030 #include "tokenizer_utils.h"
00031 #include "typedefs.h"
00032 
00033 #include "element_parser.h"
00034 #include "pattern.h"
00035 #include "pat_fam.h"
00036 
00037 /* NOTE: the parsing scheme reads at most the first MAXLINE chars of a line;
00038    this could be improved so that longer lines are handled as well */
00039 
00047 template<class PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class> class CC, 
00048          template <typename> class ALLOC >
00049 class tokenizer<MSET_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00050 {
00051 
00052  public: 
00053   typedef pattern_support<V_Fkk_MINE_PROP> PAT_SUP;
00054   typedef vat<MSET_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector > VAT;
00055   typedef typename MSET_PATTERN::VERTEX_T V_T;
00056   typedef typename MSET_PATTERN::EDGE_T E_T;
00057 
00058   typedef int VAT_T; 
00060   tokenizer(int max=LINE_SZ): MAXLINE(max) {} 
00069   template<class SM_T>
00070   int parse_next_trans(std::ifstream& infile, pat_fam<MSET_PATTERN>& freq_pats, storage_manager<MSET_PATTERN, VAT, ALLOC, SM_T>& vat_hmap) {
00071 
00072     char* line=new char[MAXLINE];
00073     char word[MAXLINE];
00074     char* startline=line;
00075     
00076     int len;
00077     int count; //# of words parsed from line
00078     int tid=-1, ts;
00079     int num_items=3; //# of itemsets on this transaction
00080     
00081     VAT* ivat;
00082 
00083     *line='\0';
00084     infile.getline(line, MAXLINE-1);
00085     len=strlen(line);
00086     if(!len || !line) {
00087       delete[] startline;
00088       return -1;
00089     }
00090 
00091     line[len++]='\0';
00092     count=0;
00093     while(count<num_items+3 && line<(startline+len)) {
00094 
00095       if(!(line=parse_word()(line, word))) {
00096         //parse_word() failed
00097         delete[] startline;
00098         return -1;
00099       }
00100       count++;
00101 
00102       switch(count) {
00103         case 1:
00104           //this is tid
00105           tid=atoi(word); break;
00106       
00107         case 2: 
00108           //this is timestamp
00109           ts=atoi(word); break;
00110 
00111         case 3:
00112           //this is # of elements on line
00113           num_items=atoi(word);
00114           break;
00115  
00116         default:
00117           //this is an element, insert/append to its VAT
00118           MSET_PATTERN* p = new MSET_PATTERN();
00119           // Add vertex to the empty graph.
00120           V_T v = el_prsr.parse_element(word);
00121           p->add_vertex(v);
00122           p->init_canonical_code(v);
00123 
00124             //if p contains a vat in vat_hmap, append tid to the entry
00125             //else create a new vat and insert it into vat_hmap,
00126             //and add p to freq_pats
00127             ivat=vat_hmap.get_vat(p);
00128             if(vat_hmap.find(p)) {
00129             //vat found
00130 
00131             if(tid == (*ivat)[ivat->size()-1].first) {
00132               (*ivat)[ivat->size()-1].second++;
00133             } else {
00134               ivat->push_back(make_pair(tid, 1));
00135             }
00136             delete p;
00137           }
00138           else {
00139             //create a new vat & insert it
00140             ivat=new VAT();
00141             ivat->push_back(make_pair(tid, 1));
00142 
00143             if(!vat_hmap.add_vat(p, ivat)) {
00144               cerr<<"tokenizer.get_length_one: add_vat failed"<<endl;
00145               return -1;
00146             }
00147 
00148             freq_pats.push_back(p);
00149           } //end else
00150 
00151       } //end switch
00152 
00153     } //end while(count<..)
00154      
00155     delete[] startline;
00156 
00157     return tid;
00158   } //end parse_next_trans()
00159 
00160 
00161 
00162  private:
00163     int MAXLINE; 
00164     element_parser<V_T> el_prsr; 
00166 }; //end class tokenizer<itemset>
00167 
00168 #endif

Generated on Wed Jul 26 14:01:08 2006 for DMTL by  doxygen 1.4.7