Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members | File Members

eclat.hpp

Go to the documentation of this file.
00001 
00006 #ifndef __ECLAT_HPP_
00007 #define __ECLAT_HPP_ t
00008 
00009 // feature flags:
00010 
00011 
00012 #ifdef ECLAT_DEFECT
00013 static const bool EXTENSION_DESCRIPTION_DEFECTS = true; // false means: COVERS
00014 static const bool OMIT_FINAL_INCIDENCE_MATRIX = true;
00015 static const bool PRUNE_EQUISUPPORT_EXTENSIONS = true;
00016 static const bool EARLY_FILTERING = true;
00017 //static const bool EARLY_FILTERING = false;
00018 static const bool INTERSECTION_ROWSWAPPING = false; // false seems to give better results.
00019 #endif
00020 
00021 #ifdef ECLAT_COVER
00022 static const bool EXTENSION_DESCRIPTION_DEFECTS = false; // false means: COVERS
00023 static const bool OMIT_FINAL_INCIDENCE_MATRIX = true;
00024 static const bool PRUNE_EQUISUPPORT_EXTENSIONS = true;
00025 static const bool EARLY_FILTERING = true;
00026 //static const bool EARLY_FILTERING = false;
00027 static const bool INTERSECTION_ROWSWAPPING = false; // false seems to give better results.
00028 #endif
00029 
00030 
00031 // #include "SparseBitmatrix.hpp"
00032 #include "SparseBitmatrix_create.hpp"
00033 #include "SparseBitmatrix_setdifference.hpp"
00034 #include "SparseBitmatrix_intersection.hpp"
00035 //#include "SparseBitmatrix_intersection_membervariables-v2.hpp"
00036 
00037 #include "common/debug.hpp"
00038 #include "io/input/transaction_reader/TransactionReader.hpp"
00039 #include "common.hpp"
00040 
00041 #include <vector>
00042 
00043 // debugging options:
00044 static const bool OUTPUT_HINTS = false;
00045 
00046 
00047 
00048 template <class TRANSACTION_READER, class PATTERN_WRITER>
00049 class Eclat 
00050 {
00051 public:
00052   Eclat(TRANSACTION_READER& tdb, CounterItemPairs& freqItems, int minSup, int numTransactions, PATTERN_WRITER& out) :
00053     tdb(tdb), freqItems(freqItems), minSup(minSup), numTransactions(numTransactions), out(out)
00054   {
00055     MAX_PATTERN_LENGTH = 1000; // FIXME: should be numItems for sets.
00056     ptis = new (SparseBitmatrix*)[MAX_PATTERN_LENGTH];
00057     eeIterator = new typename PATTERN_WRITER::EquisupportItemOutputIterator(out);
00058   }
00059   ~Eclat() {};
00060 
00061   void findFrequentPatterns();
00062   
00063 private:
00064   TRANSACTION_READER& tdb;
00065   CounterItemPairs& freqItems;
00066   int minSup;
00067   int numTransactions;
00068   PATTERN_WRITER& out;
00069 
00073   typename PATTERN_WRITER::EquisupportItemOutputIterator* eeIterator;
00074 
00075   // int numItems;
00076   int MAX_PATTERN_LENGTH;
00077 
00081   int depth;
00082 
00086   int maxdefect;
00087 
00091   SparseBitmatrix** ptis;
00092 
00096   void checkUpToTwoRows(SparseBitmatrix* iti, counter_t minSup, counter_t maxdefect);
00097 
00101   void depthFirstWalk();
00102 
00103 
00104 };
00105 
00106 
00107 template <class TRANSACTION_READER, class PATTERN_WRITER>
00108 void Eclat<TRANSACTION_READER, PATTERN_WRITER>::findFrequentPatterns() {
00109   // 0. write empty pattern
00110   if (numTransactions >= minSup)
00111     out.write(numTransactions);
00112   
00113   // 1. build item/transaction incidence matrix:
00114   std::cout << "build item/transaction incidence matrix..." << std::flush;
00115   SparseBitmatrix* iti = createSparseItemTransactionIncidenceMatrix(tdb, freqItems);
00116   int numItems = freqItems.size();
00117   std::cout << " - done" << std::endl;
00118 
00119 #if DEBUG_LEVEL >= 20  
00120   std::cout << "iti:" << std::endl;
00121   iti->dump();
00122 #endif
00123 
00124 #if DEBUG_LEVEL >= 20  
00125   std::cout << "item freqs: ";
00126   for(CounterItemPairs::iterator it_freqItems = freqItems.begin(); it_freqItems != freqItems.end(); ++it_freqItems )
00127     std::cout << it_freqItems->first << " "; 
00128   std::cout << endl;
00129 #endif
00130 
00131   // if we do COVERS and not DEFECTS
00132   if (! EXTENSION_DESCRIPTION_DEFECTS) { // COVERS:
00133     std::cout << "compute covers..." << std::flush;
00134     for (int i = 0; i < MAX_PATTERN_LENGTH; i++)
00135       ptis[i] = NULL;
00136     ptis[0] = iti;
00137     depth = 0;
00138     depthFirstWalk();
00139     std::cout << " - done" << std::endl;
00140     return;
00141   }
00142 
00143   // 2. allocate space:
00144   int rowCapacityIti2 = iti->computeMaxRowLength();
00145   if (EARLY_FILTERING) 
00146     rowCapacityIti2 -= minSup;
00147   SparseBitmatrix* iti2 = new SparseBitmatrix(numItems-1, rowCapacityIti2); // This will eat all your memory, although it is optimal.
00148   depth = 1;
00149   // for (int i = 0; i < MAX_PATTERN_LENGTH; i++)
00150   //   ptis[i] = NULL;
00151   ptis[0] = iti;
00152   ptis[1] = iti2;
00153 
00154   // 3. compute set differences of all row pairs:
00155   std::cout << "compute defects..." << std::flush;
00156   // int* itiRowLengths = iti->rowLengths;
00157   // int* itiRowLabels = iti->rowLabels;
00158   for (int i = 0; i < numItems-1; i++) {
00159     // b. output 1-itemsets:
00160     assert(i >= 0 && i < iti->nRows());
00161     out.pushItem(iti->rowLabel(i)); // itiRowLabels[i]);
00162     int itiRowLengthi = iti->rowLength(i); // itiRowLengths[i];
00163     // c. compute 2-itemsets via initial set differences and walk:
00164     maxdefect = itiRowLengthi - minSup;
00165     int numPairs = setdifferenceOfSparseBitmatrix<EARLY_FILTERING,PRUNE_EQUISUPPORT_EXTENSIONS>(*iti, i, *iti2, maxdefect, *eeIterator);
00166 
00167     if (numPairs > 0)
00168       depthFirstWalk();
00169     
00170     out.write(itiRowLengthi);
00171     out.popItem();
00172   }
00173   // d. output last 1-itemsets:
00174   out.pushItem(iti->rowLabel(numItems-1)); // itiRowLabels[numItems-1]);
00175   out.write(iti->rowLength(numItems-1)); // itiRowLengths[numItems-1]); 
00176   std::cout << " - done" << std::endl;
00177 }
00178 
00179 
00180 
00181 template <class TRANSACTION_READER, class PATTERN_WRITER>
00182 void Eclat<TRANSACTION_READER, PATTERN_WRITER>::checkUpToTwoRows(SparseBitmatrix* iti, 
00183                                                                  counter_t minSup, counter_t maxdefect) {
00184   assert(iti->nRows() <= 2);
00185 
00186   // 1. record pattern for row[0]:
00187   out.pushItem(iti->rowLabel(0));
00188   if (OUTPUT_HINTS) out.hint(string("a"));
00189   out.write(EXTENSION_DESCRIPTION_DEFECTS  
00190             ? minSup+maxdefect-iti->rowLength(0)
00191             : iti->rowLength(0)); 
00192   out.popItem();
00193   
00194   if (iti->nRows() == 2) {
00195     // 2. record pattern for row[1]:
00196     out.pushItem(iti->rowLabel(1)); 
00197     if (OUTPUT_HINTS) out.hint(string("b"));
00198     out.write(EXTENSION_DESCRIPTION_DEFECTS  
00199               ? minSup+maxdefect-iti->rowLength(1)
00200               : iti->rowLength(1)); 
00201     out.popItem();
00202 
00203     // 3. compute |row1 \ row0|
00204     int maxdef0 = maxdefect - iti->rowLength(0);
00205     int rowlen01 = EXTENSION_DESCRIPTION_DEFECTS
00206       ? cardinalitySetdifferenceBySparseBitmatrix_TwoRows<EARLY_FILTERING>(*iti, maxdef0)
00207       : cardinalityIntersectionSparseBitmatrix_TwoRows<EARLY_FILTERING>(*iti, minSup);
00208     if ((EXTENSION_DESCRIPTION_DEFECTS 
00209          ? rowlen01 <= maxdef0
00210          : rowlen01 >= minSup)) {
00211       // 4. record pattern for row[0]+row[1]:
00212       out.pushItem(iti->rowLabel(0));
00213       out.pushItem(iti->rowLabel(1)); 
00214       if (OUTPUT_HINTS) out.hint(string("c"));
00215       out.write(EXTENSION_DESCRIPTION_DEFECTS  
00216                 ? minSup+maxdef0-rowlen01
00217                 : rowlen01);
00218       out.popItem();
00219       out.popItem();
00220     }
00221   }
00222 }
00223 
00224 template <class TRANSACTION_READER, class PATTERN_WRITER>
00225 void Eclat<TRANSACTION_READER, PATTERN_WRITER>::depthFirstWalk() {
00226 #if DEBUG_LEVEL >= 10
00227   cout << endl
00228        << out.getCurrentPatternAsString() << " "
00229        << " (maxdefect=" << maxdefect << ", depth = " << depth <<", nrows = " << ptis[depth]->nRows() << "):" << endl;
00230 #endif
00231 #if DEBUG_LEVEL >= 30
00232   ptis[depth]->dump();
00233 #endif
00234 
00235   // 1. allocate new bit matrix if necessary:
00236   assert(ptis[depth] != NULL);
00237   SparseBitmatrix* pti = ptis[depth];
00238   
00239   SparseBitmatrix* bmOut;
00240   if (OMIT_FINAL_INCIDENCE_MATRIX || pti->nRows() >= 2) { // (ALLOC_ITI_PRE_LOCAL)
00241     if (EXTENSION_DESCRIPTION_DEFECTS) {
00242       int rowCapacity = pti->maxRowLength(); // second-largest would do, but as rowlengths are not guaranteed to decrease this in general is *NOT* pti->rowLengths[1]
00243       if (EARLY_FILTERING && maxdefect < rowCapacity)
00244         rowCapacity = maxdefect; // [lst 2005/05/04] fixed now. was +1 as otherwise we get SEGFAULTS: setdifference seemed to go one beyond what is necessary.
00245       bmOut = new SparseBitmatrix(pti->nRows()-1, rowCapacity);
00246     } else { // if (EXTENSION_DESCRIPTION_COVERS)
00247       bmOut = ptis[depth+1];
00248       if (bmOut == NULL)
00249         bmOut = new SparseBitmatrix(ptis[0]->nRows()-depth-1, &(ptis[0]->capacities()[depth+1]));
00250       // bmOut = new SparseBitmatrix(ptis[0]->nRows()-depth-1, &(ptis[0]->capacity(depth+1)));
00251     // bmOut = new SparseBitmatrix(pti->numRows-1, pti->maxRowLength);
00252     }
00253     ptis[depth+1] = bmOut;
00254   } else
00255     bmOut = NULL;
00256 
00257   int numSiblings = pti->nRows();
00258   
00259   // 2. walk children:
00260   depth++;
00261   for (int i = 0; i < numSiblings-1; i++) {
00262     assert(i < pti->capacity());
00263     // a. add item to pattern:
00264     out.pushItem(pti->rowLabel(i));
00265     
00266     // b. compute intersection:
00267     int numSiblingsNext; 
00268     if (EXTENSION_DESCRIPTION_DEFECTS) {
00269       numSiblingsNext = setdifferenceBySparseBitmatrix<EARLY_FILTERING,PRUNE_EQUISUPPORT_EXTENSIONS>(*pti, i, *bmOut, maxdefect-pti->rowLength(i), *eeIterator);
00270       if (numSiblingsNext > 0) {
00271         maxdefect -= pti->rowLength(i);
00272         if (OMIT_FINAL_INCIDENCE_MATRIX && numSiblingsNext <= 2)
00273           checkUpToTwoRows(bmOut, minSup, maxdefect);
00274         else
00275           depthFirstWalk(); 
00276         maxdefect += pti->rowLength(i);
00277       }
00278     } else { // if (EXTENSION_DESCRIPTION_COVERS)
00279       numSiblingsNext = intersectSparseBitmatrix<EARLY_FILTERING, INTERSECTION_ROWSWAPPING, PRUNE_EQUISUPPORT_EXTENSIONS>(*pti, i, *bmOut, minSup, *eeIterator);
00280       if (numSiblingsNext > 0) 
00281         depthFirstWalk(); 
00282     }
00283     
00284     // c. record base pattern:
00285     assert(i >= 0 && i < pti->numRows);
00286     if (OUTPUT_HINTS) out.hint(string("d"));
00287     out.write(EXTENSION_DESCRIPTION_DEFECTS  
00288               ? minSup+maxdefect-pti->rowLength(i)
00289               : pti->rowLength(i)); 
00290     out.popItem();
00291   }
00292   
00293   // 3. write last pattern:
00294   int i = numSiblings-1;
00295   out.pushItem(pti->rowLabel(i)); 
00296   if (OUTPUT_HINTS) out.hint(string("e"));
00297     out.write(EXTENSION_DESCRIPTION_DEFECTS 
00298               ? minSup+maxdefect-pti->rowLength(i)
00299               : pti->rowLength(i)); 
00300   out.popItem();
00301 
00302   // 4. delete old bit matrix (ALLOC_ITI_PRE_LOCAL)
00303   if (EXTENSION_DESCRIPTION_DEFECTS && bmOut != NULL) { // for COVERS we have ALLOC_ITI_PRE_GLOBAL
00304     delete bmOut;  // = ptis[depth];
00305     ptis[depth] = NULL;
00306   }
00307   
00308   depth--;
00309 }
00310 
00311 
00312 #endif

Generated on Sun Sep 17 17:50:38 2006 for FIM environment by  doxygen 1.4.4