Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members | File Members

main-apriori-simple.cpp

Go to the documentation of this file.
00001 
00008 #include "common.hpp"
00009 #include "common/log.h"
00010 #include "common/allocators.hpp"
00011 #include "io/input/transaction_reader/LBufferedTransactionReader.hpp"
00012 #include "io/input/transaction_reader/SortedTransactionReader.hpp" 
00013 
00014 #include "io/codec/coder/Coder.hpp"
00015 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00016 //#include "io/output/StatOutput.hpp"
00017 //#include "io/output/normal/SortOutput.hpp"
00018 
00019 #include "io/db_cache/RBTreeDBCache.hpp"
00020 
00021 #include "util/StreamParser.hpp"
00022 #include "util/FrequentFilter.cpp"
00023 #include "util/Frequent2Filter.cpp"
00024 
00025 #include "apriori/bodon/Trie.hpp"
00026 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00027 
00028 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00029 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00030 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00031 #include "apriori/OneByOneSupportCounter.hpp"
00032 #include "apriori/bodon/trie/trie_manipulators/SimplePruner.hpp"
00033 #include "apriori/bodon/trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00034 #include "apriori/bodon/trie/trie_manipulators/InfreqRemover.hpp"
00035 
00036 #include "apriori/Apriori.hpp"
00037 
00038 #include <vector>
00039 #include <iostream>
00040 #include <string>
00041 
00042 
00043 std::string file_format;
00044 
00045 void init()
00046 {
00047    file_format = "File format:";
00048    file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00049    file_format += "represents a transaction. \n";
00050    file_format += "A transaction is a set of items seperated by a nonnumeric ";
00051    file_format += "character.\nIt can be for example a white space, comma, ";
00052    file_format += "colon, etc.\n";
00053    file_format += "Items are nonnegative integers.\n";
00054 }
00056 void usage()
00057 {
00058    std::cerr<<"\nUsage: apriori-simple transactionfile min_supp outcomefile [maxsize]\n";
00059    std::cerr<<" transactionfile\t    file, that contains the tranasctions of items\n";
00060    std::cerr<<" outcomefile\t    file to write the outcome\n";
00061    std::cerr<<" min_supp\t    absolute support threshold\n";
00062    std::cerr<<" maxsize\t    the upper limit of the size of the frequent sets\n";
00063 
00064    std::cerr << file_format;
00065    std::cerr<<"\n\t\t\tHave a succesful mining ;-)\n\n";
00066 }
00067 
00078 int process_arguments( int argc, char *argv[], counter_t& min_supp, 
00079                        bool &isrel, double &relminsupp, unsigned int& maxsize )
00080 {
00081    if ( argc < 4 )
00082    {
00083      log_err(0,"There are 3 mandatory arguments.");
00084      usage();
00085      return 2;
00086    }
00087    std::string mins=argv[2];
00088    if (mins[mins.size()-1]=='%') {
00089      mins.erase(mins.size()-1);
00090      isrel=true;
00091      relminsupp=atof(mins.c_str());
00092      relminsupp/=100;
00093      log_info(0,"Using relative minimum support of %lg",relminsupp);
00094      return 0;
00095    }
00096    isrel=false;
00097    int min_supp_i;
00098    try
00099    {
00100       convert(argv[2], min_supp_i);
00101       if ( min_supp_i <= 0  )
00102       {
00103          log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00104          return 3;
00105       }
00106    }
00107    catch(BadConversion e)
00108    {
00109       log_err(0,"min_supp conversion problem.");
00110       return 3;
00111    }
00112    min_supp = static_cast<counter_t>(min_supp_i);
00113    log_info(0,"min_supp is set to %d", min_supp);
00114    if(argc == 5)
00115    {
00116       int maxsize_i;
00117       try
00118       {
00119          convert(argv[4], maxsize_i);
00120          if ( maxsize_i <= 0  )
00121          {
00122             log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00123             return 4;
00124          }
00125       }
00126       catch(BadConversion e)
00127       {
00128          log_err(0,"max_size conversion problem.");
00129          return 4;
00130       }
00131       maxsize = static_cast<unsigned int>(maxsize_i);
00132       log_status(0,"maxsize is set to %d", maxsize);
00133    }
00134    else
00135       maxsize = largest_itemsetsize;
00136    return 0;
00137 }
00138 
00151 int main( int argc, char *argv[] )
00152 {
00153    init();
00154    counter_t min_supp;
00155    unsigned int maxsize;
00156    bool relative;
00157    double relminsupp;
00158    
00159    {
00160       int return_val = 
00161          process_arguments( argc, argv, min_supp, relative, relminsupp, maxsize );
00162       if(return_val)
00163          return return_val;
00164    }
00165 
00166    char* input_file = argv[1];
00167    char* output_file = argv[3];
00168    
00169    try
00170    {
00171       // We assume that the transactions does not contain duplicates!!!
00172       typedef LBufferedTransactionReader< > T_R;
00173       // Otherwise uncomment this:
00174       // typedef SortedTransactionReader<LBufferedTransactionReader< >, true> T_R;
00175       
00176       T_R::params_t par_i;
00177       par_i.file_name = input_file;
00178       par_i.mode=FileReprBase::READ;
00179       par_i.file_buffer_size = 16 * 1024;
00180       T_R tr_reader(&par_i);
00181       std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00182       counter_t nr_of_transactions;
00183       // The first step of each algorithms is determining the frequent items.
00184       FrequentFilter<T_R>
00185          fr_filter(tr_reader);
00186       log_status(0,"Finding frequent items.");
00187       fr_filter.findFrequentItems( freq_items_with_counters,  
00188                                    nr_of_transactions, min_supp);
00189       
00190       if(!freq_items_with_counters.empty())
00191       {
00192          log_status(0,"Doing decoder.");
00193          typedef SimpleDFDecoder< > DF_D;
00194 //          typedef SimpleDFDecoder<SortOutput<> > DF_D;
00195 //          typedef StatOutput<SimpleDFDecoder< >, true> > DF_D;
00196 
00197          DF_D::params_t par_d;
00198          par_d.file_name = output_file;
00199          par_d.mode=FileReprBase::WRITE;
00200 //          par_d.numfreq = freq_items_with_counters.size(); // If StatOutput is used!!!
00201          DF_D df_decoder(&par_d);
00202          
00203          typedef Bodon::Leaf LEAF;       
00204          typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00205          typedef SortedTransactionReader< Coder<T_R, DF_D> > S_C_T_R;
00206          typedef Bodon::RBTreeDBCache<S_C_T_R, std::vector<item_t> > S_C;
00207          S_C::params_t par_c;
00208          par_c.file_name = input_file;
00209          par_c.mode=FileReprBase::READ;
00210          par_c.largest_item = tr_reader.getLargestItem();
00211          par_c.decoder = &df_decoder;
00212          par_c.freq_items_with_counters = &freq_items_with_counters;
00213          par_c.codemode = ASC;
00214          log_status(0,"Doing sorted codec.");
00215          S_C sorted_coder(&par_c);
00216 
00217          std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00218             freq_pairs_with_counters;
00219          Frequent2Filter<S_C> fr_2_filter(
00220             &sorted_coder );
00221 //   Frequent2FilterOnline<S_C> fr_2_filter(
00222 //      &sorted_coder );
00223          log_status(0,"Finding frequent pairs.")
00224             fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00225 
00226          const NEELevel NEE =  NEE_Off;
00227          typedef NewWrapperAlloc<TRIE> TRIE_ALLOCATOR;
00228          typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE> FII;
00229          typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00230          typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00231 
00232          TRIE main_trie;
00233          TRIE_ALLOCATOR s_alloc;
00234          typedef Bodon::FrequentPairInserter<DF_D, TRIE, TRIE, TRIE_ALLOCATOR, NEE> FPI;
00235          typedef Bodon::trie::SimplePruner<DF_D, TRIE, NewWrapperAlloc<TRIE>, NEE> PRUNER;
00236          typedef Bodon::CandidateGeneratorPrune<PRUNER, DF_D, TRIE, TRIE_ALLOCATOR, NEE> CG;
00237          typedef Bodon::trie::InfreqRemover<DF_D, TRIE, TRIE_ALLOCATOR, NEE> IR;
00238          IR infrequent_remover(main_trie, df_decoder, s_alloc);
00239          typedef Apriori<S_C, DF_D, TRIE, TRIE_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00240 
00241          FII fii(main_trie, df_decoder);
00242          A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00243          log_status(0,"Finding frequent itemsets.")
00244             apriori.findFrequentItemsets( 
00245                nr_of_transactions, *par_c.freq_counters,
00246                freq_pairs_with_counters, min_supp, maxsize );
00247       }
00248    }
00249    catch (std::ios_base::failure e)
00250    {
00251       log_err(0,"Exiting the program due to IO exception");
00252       return 1;
00253    }
00254 }
00255 
00256 

Generated on Sun Sep 17 17:50:39 2006 for FIM environment by  doxygen 1.4.4