Main Page | Namespace List | Class Hierarchy | Class List | Directories | File List | Namespace Members | Class Members | File Members

deadend.cpp

Go to the documentation of this file.
00001 #include "common.hpp"
00002 #include "common/log.h"
00003 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00004 #include "io/input/transaction_reader/SortedTransactionReader.hpp" 
00005 
00006 
00007 #include "io/codec/decoder/df/SimpleDFDecoder.hpp"
00008 //#include "io/codec/decoder/df/CacheDFDecoder.hpp"
00009 #include "io/codec/decoder/df/DFDecoderWithEEManagement.hpp"
00010 //#include "io/output/StatOutput.hpp"
00011 #include "io/output/normal/SortOutput.hpp"
00012 //#include "io/codec/decoder/df/DFDecoderWrapper.hpp"
00013 
00014 #include "util/StreamParser.hpp"
00015 #include "util/FrequentFilter.cpp"
00016 
00017 #include "datastructures/maxvector.hpp"
00018 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00019 //#include "datastructures/trie/edgelist/OrderedEdgelistDynLookup.hpp"
00020 #include "datastructures/trie/edgelist/OffsetIndexVector.hpp"
00021 #include "common/allocators.hpp"
00022 
00023 #include "apriori/bodon/Leaf.hpp"
00024 #include "apriori/bodon/Trie.hpp"
00025 #include "apriori/bodon/TrieNEE.hpp"
00026 #include "io/input/transaction_reader/SortedTransactionReader.hpp"
00027 #include "io/input/transaction_reader/OrderReverser.hpp"
00028 #include "io/codec/coder/Coder.hpp"
00029 #include "util/Frequent2Filter.cpp"
00030 #include "util/Frequent2FilterOnline.cpp"
00031 #include "apriori/bodon/trie/trie_manipulators/FrequentItemInserter.hpp"
00032 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserter.hpp"
00033 #include "apriori/bodon/trie/trie_manipulators/FrequentPairInserterNoprune.hpp"
00034 
00035 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/SimplePruner.hpp"
00036 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/CandidateGeneratorPrune.hpp"
00037 #include "apriori/bodon/inhomogeneous_trie/trie_manipulators/InfreqRemover.hpp"
00038 
00039 #include "apriori/OneByOneSupportCounter.hpp"
00040 #include "apriori/bodon/trie/trie_manipulators/support_counter/SupportCounterMerge.hpp"
00041 
00042 #include "apriori/Apriori.hpp"
00043 
00044 #include <vector>
00045 #include <iostream>
00046 #include <string>
00047 
00048 
00049 std::string file_format;
00050 
00051 void init()
00052 {
00053    file_format = "File format:";
00054    file_format += "\n\nThe transactionfile is a plan text file. Each row ";
00055    file_format += "represents a transaction. \n";
00056    file_format += "A transaction is a set of items seperated by a nonnumeric ";
00057    file_format += "character.\nIt can be for example a white space, comma, ";
00058    file_format += "colon, etc.\n";
00059    file_format += "Items are nonnegative integers.\n";
00060 }
00062 void usage()
00063 {
00064    log_err(0,"Usage: deadend deadend-option transactionfile min_supp outcomefile [options]");
00065    log_err(0," deadend-option\t    option of dead-end pruning, i.e: on, off");
00066    log_err(0," transactionfile\t    file, that contains the tranasctions of items");
00067    log_err(0," outcomefile\t    file to write the outcome");
00068    log_err(0," min_supp\t    absolute support threshold");
00069 
00070    std::cerr << file_format;
00071    log_err(0,"\t\t\tHave a succesful mining ;-)\n\n");
00072 }
00073 
00085 int process_arguments( int argc, char *argv[], counter_t& min_supp, 
00086                        bool &isrel, double &relminsupp, unsigned int& maxsize )
00087 {
00088    if ( argc < 5 )
00089    {
00090      log_err(0,"There are 4 mandatory arguments.");
00091      return 2;
00092    }
00093    std::string mins=argv[3];
00094    if (mins[mins.size()-1]=='%') {
00095      mins.erase(mins.size()-1);
00096      isrel=true;
00097      relminsupp=atof(mins.c_str());
00098      relminsupp/=100;
00099      log_info(0,"Using relative minimum support of %lg",relminsupp);
00100      return 0;
00101    }
00102    isrel=false; relminsupp=0;
00103    int min_supp_i;
00104    try
00105    {
00106       convert(argv[3], min_supp_i);
00107       if ( min_supp_i <= 0  )
00108       {
00109          log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00110          return 3;
00111       }
00112    }
00113    catch(BadConversion e)
00114    {
00115       log_err(0,"min_supp conversion problem.");
00116       return 3;
00117    }
00118    min_supp = static_cast<counter_t>(min_supp_i);
00119    log_info(0,"min_supp is set to %d", min_supp);
00120    if(argc == 6)
00121    {
00122       int maxsize_i;
00123       try
00124       {
00125          convert(argv[5], maxsize_i);
00126          if ( maxsize_i <= 0  )
00127          {
00128             log_err(0,"%s cannot be converted to a positive integer.",argv[4]);
00129             return 4;
00130          }
00131       }
00132       catch(BadConversion e)
00133       {
00134          log_err(0,"max_size conversion problem.");
00135          return 4;
00136       }
00137       maxsize = static_cast<unsigned int>(maxsize_i);
00138       log_status(0,"maxsize is set to %d", maxsize);
00139    }
00140    else
00141       maxsize = largest_itemsetsize;
00142    return 0;
00143 }
00144 
00145 int main( int argc, char *argv[] )
00146 {
00147    init();
00148    counter_t min_supp;
00149    unsigned int maxsize;
00150    bool relative;
00151    double relminsupp;
00152       
00153    {
00154       int return_val = 
00155          process_arguments( argc, argv, min_supp, 
00156                             relative, relminsupp, maxsize );
00157       if(return_val)
00158          return return_val;
00159    }
00160 
00161    char *deadend_option=argv[1];
00162    char* input_file = argv[2];
00163    char* output_file = argv[4];
00164 
00165    try
00166    {
00167       // We assume that the transactions does not contain duplicates!!!
00168       typedef brBufferedTransactionReader< > T_R;
00169       // Otherwise uncomment this:
00170       // typedef SortedTransactionReader<brBufferedTransactionReader< >, true> T_R;
00171 
00172       T_R::params_t par_i;
00173       par_i.file_name = input_file;
00174       par_i.mode=FileReprBase::READ;
00175       par_i.file_buffer_size = 16 * 1024;
00176       T_R tr_reader(&par_i);
00177       std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00178       counter_t nr_of_transactions;
00179       // The first step of each algorithms is determining the frequent items.
00180       FrequentFilter<T_R>
00181          fr_filter(tr_reader);
00182       log_status(0,"Finding frequent items.");
00183       fr_filter.findFrequentItems( freq_items_with_counters,  
00184                                    nr_of_transactions, min_supp);
00185 
00186       if(!freq_items_with_counters.empty())
00187       {
00188          log_status(0,"Doing decoder.");
00189          typedef DFDecoderWithEEManagement< > DF_D;
00190 //          typedef DFDecoderWithEEManagement< SimpleDFDecoder<SortOutput<> > > DF_D;
00191 //          typedef DFDecoderWithEEManagement< StatOutput<CacheDFDecoder< >, true> > DF_D;
00192 
00193          DF_D::params_t par_d;
00194          par_d.file_name = output_file;
00195          par_d.mode=FileReprBase::WRITE;
00196 //          par_d.numfreq = freq_items_with_counters.size(); // If StatOutput is used!!!
00197          DF_D df_decoder(&par_d);
00198 
00199          log_status(0,"APRIORI is selected");
00200          typedef Bodon::LeafWithoutConstructor LEAF_WC;  
00201          typedef Bodon::Leaf LEAF;       
00202          typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE;
00203 //             typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelistDynLookup<std::vector<Edge>, 30 > > TRIE_OEL;
00204 
00205                  
00206          log_status(0,"NEE_FULL is disabled");
00207          log_status(0,"Low memory need option is on.");
00208          typedef SortedTransactionReader<Coder<T_R, DF_D> >  S_C;
00209          typedef Bodon::SupportCounterMerge<TRIE> SUPP_C_BASE;
00210          typedef OneByOneSupportCounter<TRIE, S_C, SUPP_C_BASE> SUPP_C;
00211          typedef Frequent2FilterOnline<S_C> F2F;
00212          
00213          S_C::params_t par_c;
00214          par_c.file_name = input_file;
00215          par_c.mode=FileReprBase::READ;
00216          par_c.largest_item = tr_reader.getLargestItem();
00217          par_c.decoder = &df_decoder;
00218          par_c.freq_items_with_counters = &freq_items_with_counters;
00219          par_c.codemode = ASC;
00220 //   par_c.codemode = DESC;
00221          log_status(0,"Doing sorted codec.");
00222          S_C sorted_coder(&par_c);
00223          
00224          std::vector< std::pair<counter_t, std::pair<item_t, item_t> > >
00225             freq_pairs_with_counters;
00226          F2F fr_2_filter(&sorted_coder );
00227          log_status(0,"Finding frequent pairs.")
00228             fr_2_filter.findFrequentPairs(freq_pairs_with_counters, min_supp);
00229 
00230          TRIE main_trie;
00231          typedef Bodon::FrequentItemInserter<DF_D, TRIE, NEE_Off> FII;
00232          FII fii(main_trie, df_decoder);
00233          typedef bracz::singleualloc<LEAF_WC, 64 * 1024> LEAF_ALLOCATOR;
00234          LEAF_ALLOCATOR s_alloc;
00235    
00236 
00237          log_status(0,"Simple pruning is selected.");
00238          typedef Bodon::FrequentPairInserter<DF_D, TRIE, LEAF_WC, 
00239             LEAF_ALLOCATOR, NEE_Off> FPI;
00240          if(strstr(deadend_option, "on"))
00241          {
00242             log_status(0,"deadend option is on.");
00243             typedef Bodon::inhomogeneous_trie::SimplePruner<
00244                DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off> PRUNER;
00245             typedef Bodon::inhomogeneous_trie::CandidateGeneratorPrune<PRUNER, DF_D, 
00246                TRIE, LEAF_ALLOCATOR, NEE_Off> CG;
00247             typedef Bodon::inhomogeneous_trie::InfreqRemover<
00248                DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off> IR;
00249             IR infrequent_remover(main_trie, df_decoder, s_alloc);
00250             typedef Apriori<S_C, DF_D, TRIE, LEAF_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00251             A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00252             log_status(0,"Finding frequent itemsets.");
00253             apriori.findFrequentItemsets( 
00254                nr_of_transactions, *par_c.freq_counters,
00255                freq_pairs_with_counters, min_supp, maxsize );
00256          }
00257          else
00258          {
00259             log_status(0,"deadend option is off.");
00260             const bool deadend = false;
00261             typedef Bodon::inhomogeneous_trie::SimplePruner<
00262                DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off, deadend> PRUNER;
00263             typedef Bodon::inhomogeneous_trie::CandidateGeneratorPrune<PRUNER, DF_D, 
00264                TRIE, LEAF_ALLOCATOR, NEE_Off, deadend> CG;
00265             typedef Bodon::inhomogeneous_trie::InfreqRemover<
00266                DF_D, TRIE, LEAF_WC, LEAF_ALLOCATOR, NEE_Off, deadend> IR;
00267             IR infrequent_remover(main_trie, df_decoder, s_alloc);
00268             typedef Apriori<S_C, DF_D, TRIE, LEAF_ALLOCATOR, FII, FPI, CG, IR, SUPP_C> A;
00269             A apriori(main_trie, s_alloc, infrequent_remover, sorted_coder, df_decoder, fii);
00270             log_status(0,"Finding frequent itemsets.");
00271             apriori.findFrequentItemsets( 
00272                nr_of_transactions, *par_c.freq_counters,
00273                freq_pairs_with_counters, min_supp, maxsize );
00274          }
00275 
00276 
00277       }
00278         
00279    }
00280    catch (std::ios_base::failure e)
00281    {
00282       log_err(0,"Exiting the program due to IO exception");
00283       return 1;
00284    }
00285 }
00286 
00287 

Generated on Sun Sep 17 17:50:38 2006 for FIM environment by  doxygen 1.4.4