seq_cand_gen.h

00001 /*
00002  *  Copyright (C) 2005 M.J. Zaki <zaki@cs.rpi.edu> Rensselaer Polytechnic Institute
00003  *  Written by parimi@cs.rpi.edu
00004  *  Updated by chaojv@cs.rpi.edu, alhasan@cs.rpi.edu, salems@cs.rpi.edu
00005  *  Modifications:  
00006  *           Changed cand gen for induced mining -- Zaki 7/25/06
00007  * 
00008  *  This program is free software; you can redistribute it and/or
00009  *  modify it under the terms of the GNU General Public License
00010  *  as published by the Free Software Foundation; either version 2
00011  *  of the License, or (at your option) any later version.
00012  *
00013  *  This program is distributed in the hope that it will be useful,
00014  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
00015  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
00016  *  GNU General Public License for more details.
00017  *
00018  *  You should have received a copy of the GNU General Public License along
00019  *  with this program; if not, write to the Free Software Foundation, Inc.,
00020  *  59 Temple Place, Suite 330, Boston, MA 02111-1307, USA.
00021  */
00022 #ifndef _SEQ_CAND_GEN_H_
00023 #define _SEQ_CAND_GEN_H_
00024 
00025 #include "count_support.h"
00026 #include "seq_iso_check.h"
00027 #include "seq_can_code.h"
00028 #include "seq_operators.h"
00029 #include "helper_funs.h"
00030 
00031 extern unsigned long freq_pats_count;
00032 extern bool print;
00033 
00038 template<class PP, class MP, class PAT_ST, template<typename, typename, typename, template <typename> class> class CC, template <typename> class ALLOC, class SM_TYPE>
00039 void cand_gen(const pat_fam<SEQ_PATTERN>& Fk, const pat_fam<SEQ_PATTERN>&, int min_sup, 
00040               pat_fam<SEQ_PATTERN>& freq_pats, count_support<SEQ_PROP, V_Fkk_MINE_PROP, PAT_ST, CC, ALLOC, SM_TYPE >& cs, int depth) {
00041   
00042   typedef pat_fam<SEQ_PATTERN> PAT_FAM;
00043   typedef typename PAT_FAM::CONST_IT PF_CIT;
00044   typedef HASHNS::hash_map<int, int, HASHNS::hash<int> > INT_TO_INT;
00045   
00046   PF_CIT it_i, it_j;
00047   
00048   int sz=Fk.end()-Fk.begin();
00049   PAT_FAM pat_fams[sz]; 
00050   int patfam_count=0;
00051   INT_TO_INT patfam_index;
00052 
00053   for(it_i=Fk.begin(); it_i!=Fk.end(); it_i++) {
00054     bool skip0 = false;
00055     SEQ_PATTERN* pat_i = *it_i;
00056     
00057     //used during induced mining to check if the cand is embedded/induced
00058     //frequent
00059     if (!pat_i->is_freq(min_sup)) skip0 = true; 
00060     
00061     //record this pattern's index in the array of eq classes, if reqd
00062     if(patfam_index.find(pat_i->pat_id())==patfam_index.end())    
00063       if(patfam_index.insert(make_pair(pat_i->pat_id(), patfam_count)).second)
00064         patfam_count++;
00065       else
00066         cerr<<"sequence-cand_gen(): patfam_index.insert failed"<<endl;
00067     
00068     for(it_j=it_i; it_j!=Fk.end(); it_j++) {
00069       bool skip1 = false;
00070       
00071       SEQ_PATTERN* pat_j = *it_j;
00072 
00073       //used during induced mining to check if the cand is embedded/induced
00074       //frequent
00075       if (!pat_j->is_freq(min_sup)) skip1 = true; 
00076 
00077       // if both are embedded then skip these candidates
00078       if (skip0 && skip1) continue;
00079   
00080       if(patfam_index.find(pat_j->pat_id())==patfam_index.end())
00081         if(patfam_index.insert(make_pair(pat_j->pat_id(), patfam_count)).second)
00082           patfam_count++;
00083         else
00084           cout<<"spade.dfs_mine: patfam_index.insert failed"<<endl;
00085       
00086       
00087       int num_cands = 2; // Number of candidates.
00088       
00089       // Generate candidates.
00090       SEQ_PATTERN** cand_pats=join(pat_i, pat_j, skip0, skip1);
00091       
00092       // Perform isomorphism check.
00093       for(int i=0; i<num_cands; i++){
00094         if(cand_pats[i]){
00095           check_isomorphism(cand_pats[i]);
00096         }
00097       }
00098       
00099       
00100       // Count support of candidates
00101       if (cand_pats[0] || cand_pats[1])
00102         cs.count(pat_i, pat_j, cand_pats, min_sup, num_cands);
00103       
00104       //for (int i=0; i < num_cands; i++){
00105       //        if (cand_pats[i]){
00106       //  cout << "COUNT " << pat_i << " ** " << pat_j << " ** " << cand_pats[i] << endl;
00107       //}       
00108       //}
00109       
00110       // Discard infrequent patterns.
00111       for(int i=0; i<num_cands; i++) {
00112         if(!cand_pats[i])
00113           continue;
00114         
00115         if(!cand_pats[i]->is_valid(min_sup)) {
00116           delete cand_pats[i];
00117           cand_pats[i]=0;
00118         } 
00119         else {
00120           int idx=0;
00121           if(i == 0) // P->A is the pattern family.
00122             idx = patfam_index[pat_i->pat_id()];
00123           else if(i == 1) // P->B is the pattern family.
00124             idx = patfam_index[pat_j->pat_id()];
00125           
00126           pat_fams[idx].push_back(cand_pats[i]);
00127           
00128           //cout << "ADD " << cand_pats[i];
00129           if(cand_pats[i]->is_freq(min_sup)) {
00130             // freq_pats.push_back(cand_pats[i]);
00131             if(print){
00132               cout << cand_pats[i];
00133             }
00134             freq_pats_count++;
00135           }
00136         }
00137       }
00138       
00139       delete[] cand_pats;
00140       
00141     }//end for it_j
00142     
00143     // Delete the VAT for it_i. Its never going to be used.
00144     cs.delete_vat(*it_i);
00145     
00146   } //end for it_i
00147   
00148   for(int i=0; i<sz; i++) {
00149     //recursive call
00150     if(!pat_fams[i].empty()) {
00151       // sort patterns
00152       //typename pat_fam<SEQ_PATTERN>::iterator b=pat_fams[i].begin(), e=pat_fams[i].end();
00153       //sort(b, e, less_than<SEQ_PATTERN>());
00154       //cout << "calling " << depth+1 << " -- " << pat_fams[i].size() << endl;
00155       cand_gen(pat_fams[i], pat_fams[i], min_sup, freq_pats, cs, depth+1);
00156     }
00157   }
00158   
00159 } //end cand_gen()
00160 
00161 template<class PP, class MP, class PAT_ST, template<typename, typename, typename, template <typename> class > class CC, template <typename> class ALLOC >
00162   SEQ_PATTERN** join(const SEQ_PATTERN* pat_i, SEQ_PATTERN* pat_j, 
00163                      bool skip0=false, bool skip1 =false) {
00164   
00165   // NOTE: 
00166   // Right now we are just doing the sequence atom JOIN sequence atom.
00167   // Even in this case we are omitting the join that leads to an event atom.
00168   
00169   const typename SEQ_PATTERN::EDGE_T DEF_E_LBL=0;  // Default for edge label.
00170   SEQ_PATTERN** cand_pats=new SEQ_PATTERN*[2];
00171   
00172   // If both patterns are the same then the join results in a single new candidate pattern.
00173   // P->A join P->A = P->A->A (same patterns)
00174   // P->A join P->B = P->A->B (different patterns)
00175   if (skip0){
00176     cand_pats[0] = 0;
00177   }
00178   else{
00179     cand_pats[0] = pat_i->clone();
00180     const typename SEQ_PATTERN::VERTEX_T& new_v=pat_j->rmost_vertex();
00181     int old_rmv = pat_i->rmost_vid();
00182     int new_rmv = cand_pats[0]->add_vertex(new_v);
00183     cand_pats[0]->add_out_edge(old_rmv, new_rmv, DEF_E_LBL);
00184     cand_pats[0]->add_in_edge(new_rmv, old_rmv, DEF_E_LBL);
00185   }
00186   
00187   // If both patterns are the same then the second candidate is not generated.
00188   if(skip1 || pat_i->rmost_vertex() == pat_j->rmost_vertex()) {
00189     cand_pats[1] = 0;
00190   }
00191   else {
00192     // If the two patterns are different, join results in an additional candidate.
00193     // P->B join P->A = P->B->A (Additional pattern)
00194     cand_pats[1] = pat_j->clone();
00195     const typename SEQ_PATTERN::VERTEX_T& new_v=pat_i->rmost_vertex();
00196     int old_rmv = pat_j->rmost_vid();
00197     int new_rmv = cand_pats[1]->add_vertex(new_v);
00198     cand_pats[1]->add_out_edge(old_rmv, new_rmv, DEF_E_LBL);
00199     cand_pats[1]->add_in_edge(new_rmv, old_rmv, DEF_E_LBL);
00200   }
00201   
00202   return cand_pats;
00203 }
00204 
00205 #endif

Generated on Wed Jul 26 14:01:08 2006 for DMTL by  doxygen 1.4.7