seq_tokenizer.h

00001 /*
00002  *  Copyright (C) 2005 M.J. Zaki <zaki@cs.rpi.edu> Rensselaer Polytechnic Institute
00003  *  Written by parimi@cs.rpi.edu
00004  *  Updated by chaojv@cs.rpi.edu, alhasan@cs.rpi.edu, salems@cs.rpi.edu
00005  *  Modifications:
00006  *    Added tokenizer properties & FASTA tokenizer -- Zaki, 5/8/06
00007  *      Added sequence position for induced occurrences -- zaki, 5/11/06
00008  *
00009  *  This program is free software; you can redistribute it and/or
00010  *  modify it under the terms of the GNU General Public License
00011  *  as published by the Free Software Foundation; either version 2
00012  *  of the License, or (at your option) any later version.
00013  *
00014  *  This program is distributed in the hope that it will be useful,
00015  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00016  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00017  *  GNU General Public License for more details.
00018  *
00019  *  You should have received a copy of the GNU General Public License along
00020  *  with this program; if not, write to the Free Software Foundation, Inc.,
00021  *  59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
00022  */
00023 #ifndef _TOKENIZER
00024 #define _TOKENIZER
00025 
00026 #include "seq_can_code.h"
00027 #include "adj_list.h"
00028 
00029 #include "generic_classes.h"
00030 #include "tokenizer_utils.h"
00031 #include "element_parser.h"
00032 #include "seq_instance.h"
00033 #include "typedefs.h"
00034 
00035 /* NOTE: the parsing scheme reads at most the first MAXLINE chars of a line;
00036 this could perhaps be improved. */
00037 
00045 template<class PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class > class CC, 
00046 template <typename> class ALLOC >
00047 class tokenizer<SEQ_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00048 {
00049   
00050 public:
00051   typedef pattern_support<V_Fkk_MINE_PROP> PAT_SUP;
00052   typedef vat<SEQ_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector > VAT;
00053   typedef seq_instance <V_Fkk_MINE_PROP> INSTANCE;
00054   typedef typename SEQ_PATTERN::VERTEX_T V_T;
00055   typedef typename SEQ_PATTERN::EDGE_T E_T;
00056   
00057   
00058   tokenizer(int max=LINE_SZ): MAXLINE(max) {} 
00066   template<class SM_T>
00067     int parse_next_trans(ifstream& infile, pat_fam<SEQ_PATTERN>& freq_pats, storage_manager<SEQ_PATTERN, VAT, ALLOC, SM_T>& vat_hmap ) {
00068       
00069       char* line=new char[MAXLINE];
00070       char word[MAXLINE];
00071       char* startline=line;
00072       
00073       int len;
00074       int count; //# of words parsed from line
00075       int tid=-1, ts=0;
00076       int num_items=3; //# of itemsets on this transaction
00077       int pos; //stores starting position of input stream's get pointer
00078       int sequence_pos = 0; // position in the sequence
00079       
00080       VAT* svat;
00081       
00082       do {
00083         pos=infile.tellg();
00084         line=startline;
00085         *line='\0';
00086         infile.getline(line, MAXLINE-1);
00087         len=strlen(line);
00088         if(!len || !line) {
00089           delete[] startline;
00090           return tid;
00091         }
00092         
00093         line[len++]='\0';
00094         count=0;
00095         
00096         while(count<num_items+3 && line<(startline+len)) {
00097           if(!(line=parse_word()(line, word))) {
00098             //parse_word() failed
00099             delete[] startline;
00100             return -1;
00101           }
00102           count++;
00103           
00104           switch(count) {
00105             case 1:
00106               //this is tid/oid
00107               if(tid!=-1 && tid!=atoi(word)) {
00108                 // this line is next transaction
00109                 infile.seekg(pos);
00110                 delete[] startline;
00111                 return tid;
00112               }
00113               tid=atoi(word); 
00114               break;              
00115             case 2:               
00116               ts=atoi(word); //this is timestamp
00117               sequence_pos++; //this is the position in the seq
00118               break;        
00119             case 3:
00120               //this is # of elements on line
00121               num_items=atoi(word);
00122               break;
00123               
00124             default:
00125               //this is an element, insert/append to its VAT
00126               SEQ_PATTERN* p = new SEQ_PATTERN();
00127               //cout << "WORD " << tid << " " << ts << " " 
00128               //   << num_items << " " << word << endl;
00129               V_T v =el_prsr.parse_element(word);
00130               
00131               // Add vertex and update the canonical code.
00132               p->add_vertex(v);
00133               p->init_canonical_code(v);
00134               
00135               //if p contains a vat in vat_hmap, append tid/ts to the entry
00136               //else create a new vat and insert it into vat_hmap,
00137               //and add p to freq_pats
00138               svat=vat_hmap.get_vat(p);
00139               //if(vat_hmap.find(p))
00140               if(svat != NULL) {
00141                 //vat found, check if this tid exists in it
00142                 
00143                 typename VAT::IT vit=svat->end()-1;
00144                 if(vit->first!=tid)
00145                   vit=svat->end();
00146                 
00147                 
00148                 if(vit!=svat->end())
00149                   //tid found
00150                   vit->second.push_back(INSTANCE(ts,sequence_pos));
00151                 else {
00152                   //tid not found
00153                   typename VAT::INSTANCES new_tidlist;
00154                   new_tidlist.push_back(INSTANCE(ts,sequence_pos));
00155                   svat->push_back(make_pair(tid, new_tidlist));
00156                 }
00157                 
00158                 delete p;
00159                 
00160               }//end if(vat_hmap.find())
00161                 else {
00162                   //create a new vat & insert it
00163                   svat=new VAT();
00164                   typename VAT::INSTANCES new_tidlist;
00165                   new_tidlist.push_back(INSTANCE(ts,sequence_pos));
00166                   svat->push_back(make_pair(tid, new_tidlist));
00167                   if(!vat_hmap.add_vat(p, svat)) {
00168                     cerr<<"tokenizer.get_length_one: add_vat failed"<<endl;
00169                     return -1;
00170                   }
00171                   freq_pats.push_back(p);
00172                 }//end else
00173                 
00174           }//end switch
00175           
00176         }//end while
00177         
00178       }while(true);
00179       
00180       return -1;
00181     }//end parse_next_trans()
00182   
00183 private:
00184     int MAXLINE; 
00185   element_parser<V_T> el_prsr; 
00187 }; //end class seq_tokenizer
00188 
00189 template<class PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class > class CC, 
00190 template <typename> class ALLOC >
00191 class tokenizer<SEQ_PATTERN, FASTA_TKNZ_PROP, ALLOC >
00192 {
00193   
00194 public:
00195   typedef pattern_support<V_Fkk_MINE_PROP> PAT_SUP;
00196   typedef vat<SEQ_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector > VAT;
00197   typedef seq_instance <V_Fkk_MINE_PROP> INSTANCE;
00198   typedef typename SEQ_PATTERN::VERTEX_T V_T;
00199   typedef typename SEQ_PATTERN::EDGE_T E_T;
00200   
00201   
00202   tokenizer(int max=LINE_SZ): MAXLINE(max) {} 
00210   template<class SM_T>
00211   int parse_next_trans(ifstream& infile, pat_fam<SEQ_PATTERN>& freq_pats, 
00212                        storage_manager<SEQ_PATTERN, VAT, ALLOC, SM_T>& vat_hmap ) {
00213     
00214     char* line=new char[MAXLINE];
00215     char word[MAXLINE];
00216     char* startline=line;
00217     
00218     int i=0, len, seqlen=0;
00219     static int tid=-1;
00220     int pos; //stores starting position of input stream's get pointer
00221     VAT* svat;
00222     bool first = true; //first line of new fasta seq in the file
00223     
00224     do {
00225       pos=infile.tellg();
00226       line=startline;
00227       *line = '\0';
00228       infile.getline(line, MAXLINE-1);
00229       //len=infile.gcount();
00230       len = strlen(line);
00231       
00232       if(len == 0){
00233         if (infile.eof()) {
00234           tid= -1;
00235           delete[] startline;
00236           return tid;
00237         }
00238         else continue; //just a blank line, skip
00239       }
00240       
00241       
00242       if (line[0] == '>'){
00243         if (first){
00244           tid++; // increment the seq id
00245           first = false;
00246           continue; //go onto next line
00247         }
00248         else{
00249           infile.seekg(pos); //reset the file pos to beginning of 
00250                      //line for next seq
00251           delete[] startline;
00252           return tid;
00253         }
00254       }
00255       
00256       //read the fasta seq
00257       for (i=0; i < len; ++i, ++seqlen){
00258         //read each char and insert into VAT
00259         //this is an element, insert/append to its VAT
00260         SEQ_PATTERN* p = new SEQ_PATTERN();
00261         V_T v = string(1,line[i]);
00262         
00263         
00264         // Add vertex and update the canonical code.
00265         p->add_vertex(v);
00266         p->init_canonical_code(v);
00267         
00268         //if p contains a vat in vat_hmap, append tid/ts to the entry
00269         //else create a new vat and insert it into vat_hmap,            //and add p to freq_pats
00270         svat=vat_hmap.get_vat(p);
00271         //if(vat_hmap.find(p))
00272         if(svat != NULL) {
00273           //vat found, check if this tid exists in it
00274           
00275           typename VAT::IT vit=svat->end()-1;
00276           if(vit->first!=tid)
00277             vit=svat->end();      
00278           
00279           if(vit!=svat->end())
00280             //tid found
00281             vit->second.push_back(INSTANCE(seqlen, seqlen));
00282           else {
00283             //tid not found
00284             typename VAT::INSTANCES new_tidlist;
00285             new_tidlist.push_back(INSTANCE(seqlen, seqlen));
00286             svat->push_back(make_pair(tid, new_tidlist));
00287           }
00288           
00289           delete p;
00290           
00291         }//end if(vat_hmap.find())
00292         else {
00293           //create a new vat & insert it
00294           svat=new VAT();
00295           typename VAT::INSTANCES new_tidlist;
00296           new_tidlist.push_back(INSTANCE(seqlen, seqlen));
00297           svat->push_back(make_pair(tid, new_tidlist));
00298           if(!vat_hmap.add_vat(p, svat)) {
00299             cerr<<"tokenizer.get_length_one: add_vat failed"<<endl;
00300             return -1;
00301           }
00302           freq_pats.push_back(p);
00303         }//end else
00304       }
00305       
00306     }while(true);
00307     
00308     return -1;
00309   }//end parse_next_trans()
00310   
00311 private:
00312     int MAXLINE; 
00314 }; //end class seq_tokenizer
00315 
00316 #endif
00317 

Generated on Wed Jul 26 14:01:08 2006 for DMTL by  doxygen 1.4.7