00001 
00029 #include "common.hpp"
00030 #include "io/input/transaction_reader/brBufferedTransactionReader.hpp"
00031 
00032 #include "io/codec/decoder/df/CacheDFDecoder.hpp"
00033 
00034 
00035 #include "util/StreamParser.hpp"
00036 #include "util/SeqFrequentFilter.cpp"
00037 
00038 #include "datastructures/maxvector.hpp"
00039 #include "datastructures/trie/edgelist/OrderedEdgelist.hpp"
00040 
00041 
00042 #include "apriori/bodon/Trie.hpp"
00043 
00044 #include "apriori/SeqAprioriSelector.hpp"
00045 #include <vector>
00046 #include <iostream>
00047 #include <string>
00048 
00049 
/// Help text describing the transaction file layout; filled in by init()
/// and printed by usage().
std::string file_format;

/**
 * Fills the global file_format with the description of the input file
 * layout. Any previous content of file_format is replaced, so calling
 * this function repeatedly always yields the same text.
 */
void init()
{
   // Adjacent string literals are joined by the compiler, so the whole
   // help text is assigned in one step.
   file_format =
      "File format:"
      "\n\nThe transactionfile is a plan text file. Each row "
      "represents a transaction. \n"
      "A transaction is a set of items seperated by a nonnumeric "
      "character.\nIt can be for example a white space, comma, "
      "colon, etc.\n"
      "Items are nonnegative integers.\n";
}
00063 void usage()
00064 {
00065    std::cerr << "\nUsage: fsm algorithm transactionfile min_supp ";
00066    std::cerr << "outcomefile [maxsize]\n";
00067    std::cerr << "\n algorithm\t    the name of the algorithm, i.e:\n";
00068    std::cerr << "\t\t    apriori, apriori-noprune, apriori-intersectprune";
00069    std::cerr << "\n transactionfile    file, that contains the tranasctions of items";
00070    std::cerr << "\n outcomefile\t    file to write the outcome";
00071    std::cerr << "\n min_supp\t    absolute support threshold";
00072    std::cerr << "\n maxsize\t    the upper limit of the size of the frequent sets\n";
00073 
00074    std::cerr << file_format;
00075    std::cerr << "\n\nHave a succesful mining ;-)"<<std::endl<<std::endl;
00076 }
00077 
00088 int process_arguments( int argc, char *argv[], counter_t& min_supp, 
00089                        bool &isrel, double &relminsupp, unsigned int& maxsize )
00090 {
00091    if ( argc < 5 )
00092    {
00093      log_err(0,"There are 4 mandatory arguments.");
00094      usage();
00095      return 2;
00096    }
00097    std::string mins=argv[3];
00098    if (mins[mins.size()-1]=='%') {
00099      mins.erase(mins.size()-1);
00100      isrel=true;
00101      relminsupp=atof(mins.c_str());
00102      relminsupp/=100;
00103      log_status(0,"Using relative minimum support of %lg",relminsupp);
00104      return 0;
00105    }
00106    isrel=false;
00107    int min_supp_i;
00108    try
00109    {
00110       convert(argv[3], min_supp_i);
00111       if ( min_supp_i <= 0  )
00112       {
00113          log_err(0,"%s cannot be converted to a positive integer.",argv[3]);
00114          return 3;
00115       }
00116    }
00117    catch(BadConversion e)
00118    {
00119       log_err(0,"min_supp conversion problem.");
00120       return 3;
00121    }
00122    min_supp = static_cast<counter_t>(min_supp_i);
00123    log_status(0,"min_supp is set to %d", min_supp);
00124    if(argc == 6)
00125    {
00126       int maxsize_i;
00127       try
00128       {
00129          convert(argv[5], maxsize_i);
00130          if ( maxsize_i <= 0  )
00131          {
00132             log_err(0,"%s cannot be converted to a positive integer.",argv[5]);
00133             return 4;
00134          }
00135       }
00136       catch(BadConversion e)
00137       {
00138          log_err(0,"max_size conversion problem.");
00139          return 4;
00140       }
00141       maxsize = static_cast<unsigned int>(maxsize_i);
00142       log_status(0,"maxsize is set to %d", maxsize);
00143    }
00144    else
00145       maxsize = largest_itemsetsize;
00146    return 0;
00147 }
00148 
00149 int main( int argc, char *argv[] )
00150 {
00151    init();
00152    counter_t min_supp;
00153    unsigned int maxsize;
00154    bool relative;
00155    double relminsupp;
00156    
00157    {
00158       int return_val = 
00159          process_arguments( argc, argv, min_supp, relative, relminsupp, maxsize );
00160       if(return_val)
00161          return return_val;
00162    }
00163 
00164    if( strncmp(argv[1],"apriori",7) == 0 )
00165    {
00166 
00167       char* algorithm = argv[1];
00168       char* input_file = argv[2];
00169       char* output_file = argv[4];
00170 
00171       try
00172       {
00173          typedef brBufferedTransactionReader< > T_R;
00174          T_R::params_t par_i;
00175          par_i.file_name = input_file;
00176          par_i.mode=FileReprBase::READ;
00177          par_i.file_buffer_size = 16 * 1024;
00178          T_R tr_reader(&par_i);
00179          std::vector< std::pair<counter_t, item_t> > freq_items_with_counters;
00180          counter_t nr_of_transactions;
00181          
00182          SeqFrequentFilter<T_R>
00183             fr_filter(tr_reader);
00184          log_status(0,"Finding frequent items.");
00185          fr_filter.findFrequentItems( freq_items_with_counters,  
00186                                       nr_of_transactions, min_supp);
00187 
00188          log_status(0,"Doing decoder.");
00189          typedef CacheDFDecoder<  > DF_D;
00190          
00191 
00192          DF_D::params_t par_d;
00193          par_d.file_name = output_file;
00194          par_d.mode=FileReprBase::WRITE;
00195 
00196          DF_D df_decoder(&par_d);
00197          typedef Bodon::LeafWithoutConstructor LEAF_WC;  
00198          typedef Bodon::Leaf LEAF;       
00199          if(strncmp(argv[1],"apriori",7) == 0)
00200          {
00201             log_status(0,"APRIORI is selected");
00202 
00203             typedef Bodon::Trie< LEAF, Bodon::OrderedEdgelist<std::vector<Edge> > > TRIE_BASE;
00204 
00205          
00206             SeqAprioriSelector<TRIE_BASE, LEAF_WC, T_R, DF_D>( 
00207                min_supp, algorithm, input_file, nr_of_transactions, 
00208                freq_items_with_counters, tr_reader, df_decoder, maxsize );
00209          }
00210       }
00211          catch (std::ios_base::failure e)
00212          {
00213             log_err(0,"Exiting the program due to IO exception");
00214             return 1;
00215          }
00216    }
00217    else
00218    {
00219       usage();
00220       log_err(0,"algorithm should be apriori");
00221       return 1;
00222    }
00223 }
00224 
00225