Skylark (Sketching Library)  0.1
/var/lib/jenkins/jobs/Skylark/workspace/ml/model.hpp
Go to the documentation of this file.
00001 #ifndef SKYLARK_ML_MODEL_HPP
00002 #define SKYLARK_ML_MODEL_HPP
00003 
00004 #include <elemental.hpp>
00005 #include <skylark.hpp>
00006 #include <cmath>
00007 #include <boost/mpi.hpp>
00008 #include <sstream>
00009 #include <fstream>
00010 #include <iostream>
00011 #include <cstdlib>
00012 #include <string>
00013 #include <vector>
00014 #include "kernels.hpp"
00015 #include "options.hpp"
00016 
00017 #ifdef SKYLARK_HAVE_OPENMP
00018 #include <omp.h>
00019 #endif
00020 
00021 namespace skylark { namespace ml {
00022 
00023 int classification_accuracy(elem::Matrix<double>& Yt, elem::Matrix<double>& Yp) {
00024     int correct = 0;
00025     double o, o1;
00026     int pred;
00027 
00028 
00029     for(int i=0; i < Yp.Height(); i++) {
00030         o = Yp.Get(i,0);
00031         pred = 0;
00032         if (Yp.Width()==1)
00033             pred = (o >= 0)? +1:-1;
00034 
00035         for(int j=1; j < Yp.Width(); j++) {
00036             o1 = Yp.Get(i,j);
00037             if ( o1 > o) {
00038                 o = o1;
00039                 pred = j;
00040             }
00041         }
00042 
00043         if(pred == (int) Yt.Get(i,0))
00044             correct++;
00045     }
00046     return correct;
00047 }
00048 
00049 template <typename InputType, typename OutputType>
00050 struct model_t
00051 {
00052 public:
00053     typedef InputType input_type;
00054     typedef OutputType output_type;
00055 
00056     // TODO the following two should depend on the input type
00057     // TODO explicit doubles is not desired.
00058     typedef elem::Matrix<double> intermediate_type;
00059     typedef elem::Matrix<double> coef_type;
00060 
00061     typedef skylark::sketch::sketch_transform_t<input_type, intermediate_type>
00062     feature_transform_type;
00063 
00064     model_t(std::vector<const feature_transform_type *>& maps, bool scale_maps,
00065         int num_features, int num_outputs) :
00066         _coef(num_features, num_outputs), _maps(maps), _scale_maps(scale_maps),
00067         _starts(maps.size()), _finishes(maps.size()) {
00068 
00069         // TODO verify all N dimension of the maps match
00070 
00071         elem::MakeZeros(_coef);
00072 
00073         int nf = 0;
00074         for(int i = 0; i < _maps.size(); i++) {
00075             _starts[i] = nf;
00076             _finishes[i] = nf + _maps[i]->get_S() - 1;
00077             nf += _maps[i]->get_S();
00078         }
00079 
00080         _num_input_features = (_maps.size() == 0) ?
00081             num_features : _maps[0]->get_N();
00082     }
00083 
00084     model_t(const boost::property_tree::ptree &pt) {
00085         build_from_ptree(pt);
00086     }
00087 
00088     model_t(const std::string& fname) {
00089         std::ifstream is(fname);
00090 
00091         // Skip all lines begining with "#"
00092         while(is.peek() == '#')
00093             is.ignore(std::numeric_limits<std::streamsize>::max(), '\n');
00094 
00095         boost::property_tree::ptree pt;
00096         boost::property_tree::read_json(is, pt);
00097         is.close();
00098         build_from_ptree(pt);
00099     }
00100 
00101     boost::property_tree::ptree to_ptree() const {
00102         boost::property_tree::ptree pt;
00103         pt.put("skylark_object_type", "model:linear-on-features");
00104         pt.put("skylark_version", VERSION);
00105 
00106         pt.put("num_features", _coef.Height());
00107         pt.put("num_outputs", _coef.Width());
00108         pt.put("num_input_features", _num_input_features);
00109 
00110         boost::property_tree::ptree ptfmap;
00111         ptfmap.put("number_maps", _maps.size());
00112         ptfmap.put("scale_maps", _scale_maps);
00113 
00114         boost::property_tree::ptree ptmaps;
00115         for(int i = 0; i < _maps.size(); i++)
00116             ptmaps.push_back(std::make_pair(std::to_string(i),
00117                     _maps[i]->to_ptree()));
00118         ptfmap.add_child("maps", ptmaps);
00119 
00120         pt.add_child("feature_mapping", ptfmap);
00121 
00122         std::stringstream scoef;
00123         elem::Print(_coef, "", scoef);
00124         pt.put("coef_matrix", scoef.str());
00125 
00126         return pt;
00127     }
00128 
00133     void save(const std::string& fname, const std::string& header) const {
00134         boost::property_tree::ptree pt = to_ptree();
00135         std::ofstream of(fname);
00136         of << header;
00137         boost::property_tree::write_json(of, pt);
00138         of.close();
00139     }
00140 
00141     void predict(input_type& X, output_type& PV, output_type& DV,
00142         int num_threads = 1) const {
00143 
00144         int d = base::Height(X);
00145         int k = base::Width(_coef);
00146         int n = base::Width(X);
00147 
00148         if (_maps.size() == 0)  {
00149             // No maps (linear case)
00150 
00151             DV.Resize(n, k);
00152             base::Gemm(elem::TRANSPOSE,elem::NORMAL,1.0, X, _coef, 0.0, DV);
00153         } else {
00154             // Non-linear case
00155 
00156             coef_type Wslice;
00157             int j, start, finish, sj;
00158 
00159             elem::Zeros(DV, n, k);
00160 #           ifdef SKYLARK_HAVE_OPENMP
00161 #           pragma omp parallel for if(num_threads > 1) private(j, start, finish, sj) num_threads(num_threads)
00162 #           endif
00163             for(j = 0; j < _maps.size(); j++) {
00164                 start = _starts[j];
00165                 finish = _finishes[j];
00166                 sj = finish - start  + 1;
00167 
00168                 intermediate_type z(sj, n);
00169                 _maps[j]->apply(X, z, sketch::columnwise_tag());
00170 
00171                 if (_scale_maps)
00172                     elem::Scal(sqrt(double(sj) / d), z);
00173 
00174                 output_type o(n, k);
00175 
00176                 elem::LockedView(Wslice, _coef, start, 0, sj, k);
00177                 base::Gemm(elem::TRANSPOSE, elem::NORMAL, 1.0, z, Wslice, o);
00178 
00179 #               ifdef SKYLARK_HAVE_OPENMP
00180 #               pragma omp critical
00181 #               endif
00182                 base::Axpy(+1.0, o, DV);
00183             }
00184         }
00185 
00186         double o, o1, pred;
00187         for(int i=0; i < DV.Height(); i++) {
00188             o = DV.Get(i,0);
00189             pred = 0;
00190             if (DV.Width()==1)
00191                 pred = (o >= 0)? +1:-1;
00192 
00193             for(int j=1; j < DV.Width(); j++) {
00194                 o1 = DV.Get(i,j);
00195                 if ( o1 > o) {
00196                     o = o1;
00197                     pred = j;
00198                 }
00199             }
00200 
00201             PV.Set(i,0, pred);
00202         }
00203     }
00204 
00205     void get_probabilities(input_type& X, output_type& P, int num_threads = 1)
00206         const;
00207     coef_type& get_coef() { return _coef; }
00208     static double evaluate(output_type& Yt, output_type& Yp,
00209         const boost::mpi::communicator& comm);
00210 
00211     int get_num_outputs() const { return _coef.Width(); }
00212     int get_input_size() const { return _num_input_features; }
00213 
00214 protected:
00215 
00216     void build_from_ptree(const boost::property_tree::ptree &pt) {
00217         int num_features = pt.get<int>("num_features");
00218         int num_outputs = pt.get<int>("num_outputs");
00219         _coef.Resize(num_features, num_outputs);
00220 
00221         _num_input_features = pt.get<int>("num_input_features");
00222 
00223         int num_maps = pt.get<int>("feature_mapping.number_maps");
00224         _maps.resize(num_maps);
00225         const boost::property_tree::ptree &ptmaps =
00226             pt.get_child("feature_mapping.maps");
00227         for(int i = 0; i < num_maps; i++)
00228             _maps[i] =
00229                 feature_transform_type::from_ptree(
00230                    ptmaps.get_child(std::to_string(i)));
00231 
00232         int nf = 0;
00233         _starts.resize(num_maps);
00234         _finishes.resize(num_maps);
00235         for(int i = 0; i < _maps.size(); i++) {
00236             _starts[i] = nf;
00237             _finishes[i] = nf + _maps[i]->get_S() - 1;
00238             nf += _maps[i]->get_S();
00239         }
00240 
00241         _scale_maps = pt.get<bool>("feature_mapping.scale_maps");
00242 
00243         std::istringstream coef_str(pt.get<std::string>("coef_matrix"));
00244         double *buffer = _coef.Buffer();
00245         int ldim = _coef.LDim();
00246         for(int i = 0; i < num_features; i++) {
00247             std::string line;
00248             std::getline(coef_str, line);
00249             std::istringstream coefstream(line);
00250             for(int j = 0; j < num_outputs; j++) {
00251                 std::string token;
00252                 coefstream >> token;
00253                 buffer[i + j * ldim] = atof(token.c_str());
00254             }
00255         }
00256     }
00257 
00258 private:
00259     coef_type _coef;
00260     int _num_input_features;
00261     std::vector<const feature_transform_type *> _maps; // TODO use shared_ptr
00262     bool _scale_maps;
00263 
00264     std::vector<int> _starts, _finishes;
00265 };
00266 
00267 template <typename InputType, typename OutputType>
00268 double model_t<InputType, OutputType>::evaluate(OutputType& Yt,
00269     OutputType& Yp, const boost::mpi::communicator& comm) {
00270 
00271     int rank = comm.rank();
00272 
00273     int correct = classification_accuracy(Yt, Yp);
00274     double accuracy = 0.0;
00275     int totalcorrect, total;
00276     boost::mpi::reduce(comm, correct, totalcorrect, std::plus<double>(), 0);
00277     boost::mpi::reduce(comm, Yt.Height(), total, std::plus<int>(), 0);
00278 
00279     if(rank ==0)
00280         accuracy =  totalcorrect*100.0/total;
00281     return accuracy;
00282 }
00283 
00284 
00285 } }
00286 
00287 
00288 #endif /* SKYLARK_ML_MODEL_HPP */