00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023 #ifndef _TOKENIZER
00024 #define _TOKENIZER
00025
00026 #include "seq_can_code.h"
00027 #include "adj_list.h"
00028
00029 #include "generic_classes.h"
00030 #include "tokenizer_utils.h"
00031 #include "element_parser.h"
00032 #include "seq_instance.h"
00033 #include "typedefs.h"
00034
00035
00036
00037
00045 template<class PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class > class CC,
00046 template <typename> class ALLOC >
00047 class tokenizer<SEQ_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00048 {
00049
00050 public:
00051 typedef pattern_support<V_Fkk_MINE_PROP> PAT_SUP;
00052 typedef vat<SEQ_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector > VAT;
00053 typedef seq_instance <V_Fkk_MINE_PROP> INSTANCE;
00054 typedef typename SEQ_PATTERN::VERTEX_T V_T;
00055 typedef typename SEQ_PATTERN::EDGE_T E_T;
00056
00057
00058 tokenizer(int max=LINE_SZ): MAXLINE(max) {}
00066 template<class SM_T>
00067 int parse_next_trans(ifstream& infile, pat_fam<SEQ_PATTERN>& freq_pats, storage_manager<SEQ_PATTERN, VAT, ALLOC, SM_T>& vat_hmap ) {
00068
00069 char* line=new char[MAXLINE];
00070 char word[MAXLINE];
00071 char* startline=line;
00072
00073 int len;
00074 int count;
00075 int tid=-1, ts=0;
00076 int num_items=3;
00077 int pos;
00078 int sequence_pos = 0;
00079
00080 VAT* svat;
00081
00082 do {
00083 pos=infile.tellg();
00084 line=startline;
00085 *line='\0';
00086 infile.getline(line, MAXLINE-1);
00087 len=strlen(line);
00088 if(!len || !line) {
00089 delete[] startline;
00090 return tid;
00091 }
00092
00093 line[len++]='\0';
00094 count=0;
00095
00096 while(count<num_items+3 && line<(startline+len)) {
00097 if(!(line=parse_word()(line, word))) {
00098
00099 delete[] startline;
00100 return -1;
00101 }
00102 count++;
00103
00104 switch(count) {
00105 case 1:
00106
00107 if(tid!=-1 && tid!=atoi(word)) {
00108
00109 infile.seekg(pos);
00110 delete[] startline;
00111 return tid;
00112 }
00113 tid=atoi(word);
00114 break;
00115 case 2:
00116 ts=atoi(word);
00117 sequence_pos++;
00118 break;
00119 case 3:
00120
00121 num_items=atoi(word);
00122 break;
00123
00124 default:
00125
00126 SEQ_PATTERN* p = new SEQ_PATTERN();
00127
00128
00129 V_T v =el_prsr.parse_element(word);
00130
00131
00132 p->add_vertex(v);
00133 p->init_canonical_code(v);
00134
00135
00136
00137
00138 svat=vat_hmap.get_vat(p);
00139
00140 if(svat != NULL) {
00141
00142
00143 typename VAT::IT vit=svat->end()-1;
00144 if(vit->first!=tid)
00145 vit=svat->end();
00146
00147
00148 if(vit!=svat->end())
00149
00150 vit->second.push_back(INSTANCE(ts,sequence_pos));
00151 else {
00152
00153 typename VAT::INSTANCES new_tidlist;
00154 new_tidlist.push_back(INSTANCE(ts,sequence_pos));
00155 svat->push_back(make_pair(tid, new_tidlist));
00156 }
00157
00158 delete p;
00159
00160 }
00161 else {
00162
00163 svat=new VAT();
00164 typename VAT::INSTANCES new_tidlist;
00165 new_tidlist.push_back(INSTANCE(ts,sequence_pos));
00166 svat->push_back(make_pair(tid, new_tidlist));
00167 if(!vat_hmap.add_vat(p, svat)) {
00168 cerr<<"tokenizer.get_length_one: add_vat failed"<<endl;
00169 return -1;
00170 }
00171 freq_pats.push_back(p);
00172 }
00173
00174 }
00175
00176 }
00177
00178 }while(true);
00179
00180 return -1;
00181 }
00182
00183 private:
00184 int MAXLINE;
00185 element_parser<V_T> el_prsr;
00187 };
00188
00189 template<class PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class > class CC,
00190 template <typename> class ALLOC >
00191 class tokenizer<SEQ_PATTERN, FASTA_TKNZ_PROP, ALLOC >
00192 {
00193
00194 public:
00195 typedef pattern_support<V_Fkk_MINE_PROP> PAT_SUP;
00196 typedef vat<SEQ_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector > VAT;
00197 typedef seq_instance <V_Fkk_MINE_PROP> INSTANCE;
00198 typedef typename SEQ_PATTERN::VERTEX_T V_T;
00199 typedef typename SEQ_PATTERN::EDGE_T E_T;
00200
00201
00202 tokenizer(int max=LINE_SZ): MAXLINE(max) {}
00210 template<class SM_T>
00211 int parse_next_trans(ifstream& infile, pat_fam<SEQ_PATTERN>& freq_pats,
00212 storage_manager<SEQ_PATTERN, VAT, ALLOC, SM_T>& vat_hmap ) {
00213
00214 char* line=new char[MAXLINE];
00215 char word[MAXLINE];
00216 char* startline=line;
00217
00218 int i=0, len, seqlen=0;
00219 static int tid=-1;
00220 int pos;
00221 VAT* svat;
00222 bool first = true;
00223
00224 do {
00225 pos=infile.tellg();
00226 line=startline;
00227 *line = '\0';
00228 infile.getline(line, MAXLINE-1);
00229
00230 len = strlen(line);
00231
00232 if(len == 0){
00233 if (infile.eof()) {
00234 tid= -1;
00235 delete[] startline;
00236 return tid;
00237 }
00238 else continue;
00239 }
00240
00241
00242 if (line[0] == '>'){
00243 if (first){
00244 tid++;
00245 first = false;
00246 continue;
00247 }
00248 else{
00249 infile.seekg(pos);
00250
00251 delete[] startline;
00252 return tid;
00253 }
00254 }
00255
00256
00257 for (i=0; i < len; ++i, ++seqlen){
00258
00259
00260 SEQ_PATTERN* p = new SEQ_PATTERN();
00261 V_T v = string(1,line[i]);
00262
00263
00264
00265 p->add_vertex(v);
00266 p->init_canonical_code(v);
00267
00268
00269
00270 svat=vat_hmap.get_vat(p);
00271
00272 if(svat != NULL) {
00273
00274
00275 typename VAT::IT vit=svat->end()-1;
00276 if(vit->first!=tid)
00277 vit=svat->end();
00278
00279 if(vit!=svat->end())
00280
00281 vit->second.push_back(INSTANCE(seqlen, seqlen));
00282 else {
00283
00284 typename VAT::INSTANCES new_tidlist;
00285 new_tidlist.push_back(INSTANCE(seqlen, seqlen));
00286 svat->push_back(make_pair(tid, new_tidlist));
00287 }
00288
00289 delete p;
00290
00291 }
00292 else {
00293
00294 svat=new VAT();
00295 typename VAT::INSTANCES new_tidlist;
00296 new_tidlist.push_back(INSTANCE(seqlen, seqlen));
00297 svat->push_back(make_pair(tid, new_tidlist));
00298 if(!vat_hmap.add_vat(p, svat)) {
00299 cerr<<"tokenizer.get_length_one: add_vat failed"<<endl;
00300 return -1;
00301 }
00302 freq_pats.push_back(p);
00303 }
00304 }
00305
00306 }while(true);
00307
00308 return -1;
00309 }
00310
00311 private:
00312 int MAXLINE;
00314 };
00315
00316 #endif
00317