Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members | File Members

test/apriori/bodon-vector/main.cpp

Go to the documentation of this file.
00001 
00002 
00003 #include "common.hpp"
00004 #include "common/log.h"
00005 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00006 #include "io/input/transaction_reader/SortedTransactionReader.hpp" 
00007 
00008 
00009 #include "io/codec/decoder/normal/CacheNormalDecoder.hpp"
00010 //#include "io/output/StatOutput.hpp"
00011 #include "io/output/normal/SortOutput.hpp"
00012 
00013 #include "util/StreamParser.hpp"
00014 #include "util/FrequentFilter.cpp"
00015 
00016 #include "datastructures/maxvector.hpp"
00017 
00018 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00019 #include "io/input/transaction_reader/OrderReverser.hpp"
00020 #include "io/codec/coder/Coder.hpp"
00021 #include "io/db_cache/BuildTreeDBCache.hpp"
00022 #include "util/Frequent2Filter.cpp"
00023 #include "util/Frequent2FilterOnline.cpp"
00024 #include "apriori/bodon-vector/vector_typedef.hpp"
00025 #include "apriori/OneByOneSupportCounter.hpp"
00026 #include "apriori/bodon-vector/SupportCounter.hpp"
00027 #include "apriori/bodon-vector/CandidateGeneratorNoprune.hpp"
00028 #include "apriori/bodon-vector/CandidateGeneratorSimplePrune.hpp"
00029 #include "apriori/bodon-vector/InfrequentRemover.hpp"
00030 #include "apriori/bodon-vector/FrequentItemInserter.hpp"
00031 #include "apriori/bodon-vector/FrequentPairInserterNoprune.hpp"
00032 
00033 #include "datastructures/PrefixArrayList.hpp"
00034 #include "apriori/bodon-prefixarray/SupportCounter.hpp"
00035 #include "apriori/bodon-prefixarray/CandidateGeneratorNoprune.hpp"
00036 #include "apriori/bodon-prefixarray/InfrequentRemover.hpp"
00037 #include "apriori/bodon-prefixarray/FrequentPairInserterNoprune.hpp"
00038 
00039 
00040 
00041 #include "apriori/Apriori.hpp"
00042 
00043 #include <vector>
00044 #include <iostream>
00045 #include <string>
00046 
00047 
00048 std::string file_format;
00049 
00050 void init()
00051 {
00052    file_format = "File format:";
00053    file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00054    file_format += "represents a transaction. \n";
00055    file_format += "A transaction is a set of items seperated by a nonnumeric ";
00056    file_format += "character.\nIt can be for example a white space, comma, ";
00057    file_format += "colon, etc.\n";
00058    file_format += "Items are nonnegative integers.\n";
00059 }
00061 void usage()
00062 {
00063    log_err(0,"Usage: apriori-nontrie transactionfile min_supp outcomefile data_structure [maxsize]");
00064    log_err(0," transactionfile\t    file, that contains the tranasctions of items");
00065    log_err(0," outcomefile\t    file to write the outcome");
00066    log_err(0," min_supp\t    absolute support threshold");
00067    log_err(0," data_structure\t    data structure to store the candidates: i.e vector, prefixarray");
00068 
00069    std::cerr << file_format;
00070    log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00071 }
00072 
00084 int process_arguments( int argc, char *argv[], counter_t& min_supp, 
00085                        bool &isrel, double &relminsupp, unsigned int& maxsize )
00086 {
00087    if( argc < 5 )
00088    {
00089      log_err(0,"There are 4 mandatory arguments.");
00090      return 2;
00091    }
00092    
00093    std::string mins=argv[2];
00094    if (mins[mins.size()-1]=='%') {
00095      mins.erase(mins.size()-1);
00096      isrel=true;
00097      relminsupp=atof(mins.c_str());
00098      relminsupp/=100;
00099      log_status(0,"Using relative minimum support of %lg",relminsupp);
00100      return 0;
00101    }
00102    isrel=false;
00103    int min_supp_i;
00104    try
00105    {
00106       convert(argv[2], min_supp_i);
00107       if ( min_supp_i <= 0  )
00108       {
00109          log_err(0,"%s cannot be converted to a positive integer.",argv[2]);
00110          return 3;
00111       }
00112    }
00113    catch(BadConversion e)
00114    {
00115       log_err(0,"min_supp conversion problem.");
00116       return 3;
00117    }
00118    min_supp = static_cast<counter_t>(min_supp_i);
00119    log_status(0,"min_supp is set to %d", min_supp);
00120 
00121    if(argc == 6)
00122    {
00123       int maxsize_i;
00124       try
00125       {
00126          convert(argv[5], maxsize_i);
00127          if ( maxsize_i <= 0  )
00128          {
00129             log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00130             return 4;
00131          }
00132       }
00133       catch(BadConversion e)
00134       {
00135          log_err(0,"min_supp conversion problem.");
00136          return 4;
00137       }
00138       maxsize = static_cast<unsigned int>(maxsize_i);
00139       log_status(0,"maxsize is set to %d", maxsize);
00140    }
00141    else
00142       maxsize = largest_itemsetsize;
00143    return 0;
00144 }
00145 
00146 int main( int argc, char *argv[] )
00147 {
00148    init();
00149    counter_t min_supp;
00150    unsigned int maxsize;
00151    bool relative;
00152    double relminsupp;
00153       
00154    {
00155       int return_val = 
00156          process_arguments( argc, argv, min_supp, 
00157                             relative, relminsupp, maxsize );
00158       if(return_val)
00159          return return_val;
00160    }
00161 
00162    char* input_file = argv[1];
00163    char* output_file = argv[3];
00164 
00165    try
00166    {
00167       // We assume that the transactions does not contain duplicates!!!
00168       typedef brBufferedTransactionReader< > T_R;
00169       // Otherwise uncomment this:
00170       // typedef SortedTransactionReader<brBufferedTransactionReader< >, true> T_R;
00171 
00172       T_R::params_t par_i;
00173       par_i.file_name = input_file;
00174       par_i.mode=FileReprBase::READ;
00175       par_i.file_buffer_size = 16 * 1024;
00176       T_R tr_reader(&par_i);
00177       std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00178       counter_t nr_of_transactions;
00179       // The first step of each algorithms is determining the frequent items.
00180       FrequentFilter<T_R>
00181          fr_filter(tr_reader);
00182       log_status(0,"Finding frequent items.");
00183       fr_filter.findFrequentItems( freq_items_with_counters,  
00184                                    nr_of_transactions, min_supp);
00185 
00186       if(!freq_items_with_counters.empty())
00187       {
00188          log_status(0,"Doing decoder.");
00189          typedef CacheNormalDecoder< > D;
00190 
00191          D::params_t par_d;
00192          par_d.file_name = output_file;
00193          par_d.mode=FileReprBase::WRITE;
00194 //          par_d.numfreq = freq_items_with_counters.size(); // If StatOutput is used!!!
00195          D decoder(&par_d);
00196 
00197 // ascending order (also change par_c.codemode)
00198          typedef SortedTransactionReader< Coder<T_R, D>, false, false > S_C_T_R;
00199          typedef OrderReverser< bracz::BuildTreeDBCache< 
00200             S_C_T_R, std::vector<item_t>, bracz::EndPatriciaBuildTree<true> > >S_C;
00201 //   no database caching:
00202 //       typedef SortedTransactionReader<Coder<T_R, D> >  S_C;
00203          typedef Frequent2Filter<S_C> F2F;
00204 
00205          S_C::params_t par_c;
00206          par_c.file_name = input_file;
00207          par_c.mode=FileReprBase::READ;
00208          par_c.largest_item = tr_reader.getLargestItem();
00209          par_c.decoder = &decoder;
00210          par_c.freq_items_with_counters = &freq_items_with_counters;
00211          par_c.codemode = ASC;
00212 //   par_c.codemode = DESC;
00213          log_status(0,"Doing sorted codec.");
00214          S_C sorted_coder(&par_c);
00215          std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00216             freq_pairs_with_counters;
00217          F2F fr_2_filter(&sorted_coder );
00218          log_status(0,"Finding frequent pairs.")
00219             fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00220          typedef bool DUMMY;
00221          DUMMY dummy;
00222          if( strstr(argv[4],"vector") )
00223          {
00224             log_status(0,"Candidates are stored in a vector");
00225             typedef cand_vector_t DATA_STRUCTURE;
00226             DATA_STRUCTURE candidates;
00227             typedef Bodon::vector_based::FrequentItemInserter<DATA_STRUCTURE, D> FII;
00228             FII fii(candidates, decoder);
00229             typedef Bodon::vector_based::SupportCounter<DATA_STRUCTURE> SUPP_C_BASE;
00230             typedef OneByOneSupportCounter<DATA_STRUCTURE, S_C, SUPP_C_BASE> SUPP_C;
00231 
00232             typedef Bodon::vector_based::FrequentPairInserterNoprune<D, DUMMY> FPI;
00233 //          typedef Bodon::vector_based::CandidateGeneratorNoprune<D, DUMMY> CG;
00234             typedef Bodon::vector_based::CandidateGeneratorSimplePrune<D, DUMMY> CG;
00235             typedef Bodon::vector_based::InfrequentRemover<D, DUMMY> IR;
00236             IR infrequent_remover(candidates, decoder, dummy);
00237             typedef Apriori<S_C, D, DATA_STRUCTURE, DUMMY, FII, 
00238                FPI, CG, IR, SUPP_C> A;
00239             A apriori(candidates, dummy, infrequent_remover, sorted_coder, decoder, fii);
00240             log_status(0,"Finding frequent itemsets.")
00241                apriori.findFrequentItemsets( 
00242                   nr_of_transactions, *par_c.freq_counters,
00243                   freq_pairs_with_counters, min_supp, maxsize );
00244          }
00245          else
00246          {
00247             log_status(0,"Candidates are stored in a prefix-array.");
00248             typedef PrefixArrayList<> DATA_STRUCTURE;
00249             DATA_STRUCTURE candidates(freq_items_with_counters.size()-1);
00250             typedef Bodon::vector_based::FrequentItemInserter<DATA_STRUCTURE, D> FII;
00251             FII fii(candidates, decoder);
00252             typedef Bodon::prefix_array::SupportCounter<DATA_STRUCTURE> SUPP_C_BASE;
00253             typedef OneByOneSupportCounter<DATA_STRUCTURE, S_C, SUPP_C_BASE> SUPP_C;
00254             typedef Bodon::prefix_array::FrequentPairInserterNoprune<DATA_STRUCTURE, D, DUMMY> FPI;
00255             typedef Bodon::prefix_array::CandidateGeneratorNoprune<DATA_STRUCTURE, D, DUMMY> CG;
00256             typedef Bodon::prefix_array::InfrequentRemover<DATA_STRUCTURE, D, DUMMY> IR;
00257             IR infrequent_remover(candidates, decoder, dummy);
00258             typedef Apriori<S_C, D, DATA_STRUCTURE, DUMMY, FII, 
00259                FPI, CG, IR, SUPP_C> A;
00260             A apriori(candidates, dummy, infrequent_remover, sorted_coder, decoder, fii);
00261             log_status(0,"Finding frequent itemsets.")
00262                apriori.findFrequentItemsets( 
00263                   nr_of_transactions, *par_c.freq_counters,
00264                   freq_pairs_with_counters, min_supp, maxsize );
00265          }
00266       }
00267    }
00268    catch (std::ios_base::failure e)
00269    {
00270       log_err(0,"Exiting the program due to IO exception");
00271       return 1;
00272    }
00273 }
00274 
00275 

Generated on Sun Sep 17 17:50:39 2006 for FIM environment by  doxygen 1.4.4