00001
00002
00003
00004
00005
00006
00007
00008
00009
00010
00011
00012
00013
00014
00015
00016
00017
00018
00019
00020
00021
00022
00023
00024 #ifndef _TREE_TOKENIZER_H
00025 #define _TREE_TOKENIZER_H
00026
00027 #include "pattern.h"
00028 #include "pat_fam.h"
00029 #include "tree_vat.h"
00030 #include "element_parser.h"
00031 #include "generic_classes.h"
00032 #include "tokenizer_utils.h"
00033
00041 template<typename PP, typename MP, typename TP, typename PAT_ST, template<typename, typename, typename, template <typename> class > class CC,
00042 template <typename> class ALLOC >
00043 class tokenizer<TREE_PATTERN, DMTL_TKNZ_PROP, ALLOC >
00044 {
00045
00046 public:
00047 typedef vat<TREE_PROP, V_Fkk_MINE_PROP, ALLOC, std::vector> VAT;
00048 typedef tree_instance<std::vector, PP > TREE_INSTANCE;
00049
00050
00051 tokenizer(const int max=LINE_SZ): MAXLINE(max) {}
00052
00053
00060 template<class SM_T>
00061 int parse_next_trans(ifstream& infile, pat_fam<TREE_PATTERN>& freq_pats, storage_manager<TREE_PATTERN, VAT, ALLOC, SM_T>& vat_hmap) {
00062 char* line=new char[MAXLINE];
00063 char word[MAXLINE];
00064 char* startline=line;
00065 const int BK_TRK=-1;
00066
00067 int len;
00068 int count;
00069 int tid=-1;
00070 int dfs_id=-1;
00071 int num_items=3;
00072 VAT* tvat;
00073 TREE_PATTERN* p;
00074 int depth;
00075 vector<pair<typename TREE_PATTERN::VERTEX_T, pair<int, bool> > > obj_data;
00076
00077
00078
00079
00080 typedef element_parser<typename TREE_PATTERN::VERTEX_T> V_P;
00081
00082 *line='\0';
00083 infile.getline(line, MAXLINE-1);
00084 len=strlen(line);
00085 if(!len || !line) {
00086 delete[] startline;
00087 return -1;
00088 }
00089
00090 line[len++]='\0';
00091 count=0;
00092 depth=0;
00093
00094
00095 while(count<num_items+3 && line<(startline+len)) {
00096 if(!(line=parse_word()(line, word))) {
00097
00098 delete[] startline;
00099 return -1;
00100 }
00101 count++;
00102
00103 switch(count) {
00104 case 1:
00105
00106 tid=atoi(word); break;
00107
00108 case 2:
00109
00110 break;
00111 case 3:
00112
00113 num_items=atoi(word);
00114 break;
00115
00116 default:
00117 typename TREE_PATTERN::VERTEX_T v=el_prsr.parse_element(word);
00118
00119 if(V_P::notEq(v, V_P::convert(BK_TRK))) {
00120 dfs_id++;
00121 depth++;
00122 for(unsigned int i=0; i<obj_data.size(); i++)
00123 if(obj_data[i].second.second)
00124 obj_data[i].second.first++;
00125 obj_data.push_back(make_pair(v, make_pair(0, 1)));
00126 }
00127 else {
00128 depth--;
00129 for(unsigned int i=0; i<obj_data.size(); i++) {
00130 obj_data[i].second.first--;
00131 if(obj_data[i].second.second && (obj_data[i].second.first<0)) {
00132
00133 obj_data[i].second.second=0;
00134 p=new TREE_PATTERN;
00135 p->add_vertex(obj_data[i].first);
00136 p->init_canonical_code(obj_data[i].first);
00137
00138 if((tvat=vat_hmap.get_vat(p))) {
00139
00140 typename VAT::IT it=tvat->end()-1;
00141 if(tvat->size() && it->first==tid)
00142 it->second.push_back(TREE_INSTANCE(i, dfs_id, depth, 1));
00143 else {
00144 typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00145 vec_inst.push_back(TREE_INSTANCE(i, dfs_id, depth, 1));
00146 tvat->push_back(make_pair(tid, vec_inst));
00147 }
00148 delete p;
00149 }
00150 else {
00151
00152 tvat=new VAT;
00153 typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00154 vec_inst.push_back(TREE_INSTANCE(i, dfs_id, depth, 1));
00155 tvat->push_back(make_pair(tid, vec_inst));
00156 if(!vat_hmap.add_vat(p, tvat)) {
00157 cerr<<"tokenizer.get_length_one(tree): add_vat failed"<<endl;
00158 return -1;
00159 }
00160 freq_pats.push_back(p);
00161 }
00162
00163 }
00164 }
00165 }
00166
00167 }
00168
00169 }
00170
00171
00172 if(!obj_data[0].second.second) {
00173 cerr<<"tokenizer.get_length_one(tree): root of tree has invalid scope"<<endl;
00174 return -1;
00175 }
00176
00177
00178 obj_data[0].second.second=0;
00179 p=new TREE_PATTERN;
00180 p->add_vertex(obj_data[0].first);
00181 p->init_canonical_code(obj_data[0].first);
00182
00183 if((tvat=vat_hmap.get_vat(p))) {
00184
00185 typename VAT::IT it=tvat->end()-1;
00186 if(tvat->size() && it->first==tid)
00187 it->second.push_back(TREE_INSTANCE(0, dfs_id, 0, 1));
00188 else {
00189 typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00190 vec_inst.push_back(TREE_INSTANCE(0, dfs_id, 0, 1));
00191 tvat->push_back(make_pair(tid, vec_inst));
00192 }
00193 delete p;
00194 }
00195 else {
00196
00197 tvat=new VAT;
00198 typename VAT::template ST<TREE_INSTANCE, ALLOC<TREE_INSTANCE> > vec_inst;
00199 vec_inst.push_back(TREE_INSTANCE(0, dfs_id, 0, 1));
00200 tvat->push_back(make_pair(tid, vec_inst));
00201 if(!vat_hmap.add_vat(p, tvat)) {
00202 cerr<<"tokenizer.get_length_one(tree): add_vat failed"<<endl;
00203 return -1;
00204 }
00205 freq_pats.push_back(p);
00206 }
00207
00208
00209 delete[] startline;
00210 return tid;
00211
00212 }
00213
00214 private:
00215 const int MAXLINE;
00216 element_parser<typename TREE_PATTERN::VERTEX_T> el_prsr;
00217
00218 };
00219
00220 #endif