Skylark (Sketching Library)
0.1
|
00001 #ifndef SKYLARK_ML_MODEL_HPP 00002 #define SKYLARK_ML_MODEL_HPP 00003 00004 #include <elemental.hpp> 00005 #include <skylark.hpp> 00006 #include <cmath> 00007 #include <boost/mpi.hpp> 00008 #include <sstream> 00009 #include <fstream> 00010 #include <iostream> 00011 #include <cstdlib> 00012 #include <string> 00013 #include <vector> 00014 #include "kernels.hpp" 00015 #include "options.hpp" 00016 00017 #ifdef SKYLARK_HAVE_OPENMP 00018 #include <omp.h> 00019 #endif 00020 00021 namespace skylark { namespace ml { 00022 00023 int classification_accuracy(elem::Matrix<double>& Yt, elem::Matrix<double>& Yp) { 00024 int correct = 0; 00025 double o, o1; 00026 int pred; 00027 00028 00029 for(int i=0; i < Yp.Height(); i++) { 00030 o = Yp.Get(i,0); 00031 pred = 0; 00032 if (Yp.Width()==1) 00033 pred = (o >= 0)? +1:-1; 00034 00035 for(int j=1; j < Yp.Width(); j++) { 00036 o1 = Yp.Get(i,j); 00037 if ( o1 > o) { 00038 o = o1; 00039 pred = j; 00040 } 00041 } 00042 00043 if(pred == (int) Yt.Get(i,0)) 00044 correct++; 00045 } 00046 return correct; 00047 } 00048 00049 template <typename InputType, typename OutputType> 00050 struct model_t 00051 { 00052 public: 00053 typedef InputType input_type; 00054 typedef OutputType output_type; 00055 00056 // TODO the following two should depend on the input type 00057 // TODO explicit doubles is not desired. 00058 typedef elem::Matrix<double> intermediate_type; 00059 typedef elem::Matrix<double> coef_type; 00060 00061 typedef skylark::sketch::sketch_transform_t<input_type, intermediate_type> 00062 feature_transform_type; 00063 00064 model_t(std::vector<const feature_transform_type *>& maps, bool scale_maps, 00065 int num_features, int num_outputs) : 00066 _coef(num_features, num_outputs), _maps(maps), _scale_maps(scale_maps), 00067 _starts(maps.size()), _finishes(maps.size()) { 00068 00069 // TODO verify all N dimension of the maps match 00070 00071 elem::MakeZeros(_coef); 00072 00073 int nf = 0; 00074 for(int i = 0; i < _maps.size(); i++) { 00075 _starts[i] = nf; 00076 _finishes[i] = nf + _maps[i]->get_S() - 1; 00077 nf += _maps[i]->get_S(); 00078 } 00079 00080 _num_input_features = (_maps.size() == 0) ? 00081 num_features : _maps[0]->get_N(); 00082 } 00083 00084 model_t(const boost::property_tree::ptree &pt) { 00085 build_from_ptree(pt); 00086 } 00087 00088 model_t(const std::string& fname) { 00089 std::ifstream is(fname); 00090 00091 // Skip all lines begining with "#" 00092 while(is.peek() == '#') 00093 is.ignore(std::numeric_limits<std::streamsize>::max(), '\n'); 00094 00095 boost::property_tree::ptree pt; 00096 boost::property_tree::read_json(is, pt); 00097 is.close(); 00098 build_from_ptree(pt); 00099 } 00100 00101 boost::property_tree::ptree to_ptree() const { 00102 boost::property_tree::ptree pt; 00103 pt.put("skylark_object_type", "model:linear-on-features"); 00104 pt.put("skylark_version", VERSION); 00105 00106 pt.put("num_features", _coef.Height()); 00107 pt.put("num_outputs", _coef.Width()); 00108 pt.put("num_input_features", _num_input_features); 00109 00110 boost::property_tree::ptree ptfmap; 00111 ptfmap.put("number_maps", _maps.size()); 00112 ptfmap.put("scale_maps", _scale_maps); 00113 00114 boost::property_tree::ptree ptmaps; 00115 for(int i = 0; i < _maps.size(); i++) 00116 ptmaps.push_back(std::make_pair(std::to_string(i), 00117 _maps[i]->to_ptree())); 00118 ptfmap.add_child("maps", ptmaps); 00119 00120 pt.add_child("feature_mapping", ptfmap); 00121 00122 std::stringstream scoef; 00123 elem::Print(_coef, "", scoef); 00124 pt.put("coef_matrix", scoef.str()); 00125 00126 return pt; 00127 } 00128 00133 void save(const std::string& fname, const std::string& header) const { 00134 boost::property_tree::ptree pt = to_ptree(); 00135 std::ofstream of(fname); 00136 of << header; 00137 boost::property_tree::write_json(of, pt); 00138 of.close(); 00139 } 00140 00141 void predict(input_type& X, output_type& PV, output_type& DV, 00142 int num_threads = 1) const { 00143 00144 int d = base::Height(X); 00145 int k = base::Width(_coef); 00146 int n = base::Width(X); 00147 00148 if (_maps.size() == 0) { 00149 // No maps (linear case) 00150 00151 DV.Resize(n, k); 00152 base::Gemm(elem::TRANSPOSE,elem::NORMAL,1.0, X, _coef, 0.0, DV); 00153 } else { 00154 // Non-linear case 00155 00156 coef_type Wslice; 00157 int j, start, finish, sj; 00158 00159 elem::Zeros(DV, n, k); 00160 # ifdef SKYLARK_HAVE_OPENMP 00161 # pragma omp parallel for if(num_threads > 1) private(j, start, finish, sj) num_threads(num_threads) 00162 # endif 00163 for(j = 0; j < _maps.size(); j++) { 00164 start = _starts[j]; 00165 finish = _finishes[j]; 00166 sj = finish - start + 1; 00167 00168 intermediate_type z(sj, n); 00169 _maps[j]->apply(X, z, sketch::columnwise_tag()); 00170 00171 if (_scale_maps) 00172 elem::Scal(sqrt(double(sj) / d), z); 00173 00174 output_type o(n, k); 00175 00176 elem::LockedView(Wslice, _coef, start, 0, sj, k); 00177 base::Gemm(elem::TRANSPOSE, elem::NORMAL, 1.0, z, Wslice, o); 00178 00179 # ifdef SKYLARK_HAVE_OPENMP 00180 # pragma omp critical 00181 # endif 00182 base::Axpy(+1.0, o, DV); 00183 } 00184 } 00185 00186 double o, o1, pred; 00187 for(int i=0; i < DV.Height(); i++) { 00188 o = DV.Get(i,0); 00189 pred = 0; 00190 if (DV.Width()==1) 00191 pred = (o >= 0)? +1:-1; 00192 00193 for(int j=1; j < DV.Width(); j++) { 00194 o1 = DV.Get(i,j); 00195 if ( o1 > o) { 00196 o = o1; 00197 pred = j; 00198 } 00199 } 00200 00201 PV.Set(i,0, pred); 00202 } 00203 } 00204 00205 void get_probabilities(input_type& X, output_type& P, int num_threads = 1) 00206 const; 00207 coef_type& get_coef() { return _coef; } 00208 static double evaluate(output_type& Yt, output_type& Yp, 00209 const boost::mpi::communicator& comm); 00210 00211 int get_num_outputs() const { return _coef.Width(); } 00212 int get_input_size() const { return _num_input_features; } 00213 00214 protected: 00215 00216 void build_from_ptree(const boost::property_tree::ptree &pt) { 00217 int num_features = pt.get<int>("num_features"); 00218 int num_outputs = pt.get<int>("num_outputs"); 00219 _coef.Resize(num_features, num_outputs); 00220 00221 _num_input_features = pt.get<int>("num_input_features"); 00222 00223 int num_maps = pt.get<int>("feature_mapping.number_maps"); 00224 _maps.resize(num_maps); 00225 const boost::property_tree::ptree &ptmaps = 00226 pt.get_child("feature_mapping.maps"); 00227 for(int i = 0; i < num_maps; i++) 00228 _maps[i] = 00229 feature_transform_type::from_ptree( 00230 ptmaps.get_child(std::to_string(i))); 00231 00232 int nf = 0; 00233 _starts.resize(num_maps); 00234 _finishes.resize(num_maps); 00235 for(int i = 0; i < _maps.size(); i++) { 00236 _starts[i] = nf; 00237 _finishes[i] = nf + _maps[i]->get_S() - 1; 00238 nf += _maps[i]->get_S(); 00239 } 00240 00241 _scale_maps = pt.get<bool>("feature_mapping.scale_maps"); 00242 00243 std::istringstream coef_str(pt.get<std::string>("coef_matrix")); 00244 double *buffer = _coef.Buffer(); 00245 int ldim = _coef.LDim(); 00246 for(int i = 0; i < num_features; i++) { 00247 std::string line; 00248 std::getline(coef_str, line); 00249 std::istringstream coefstream(line); 00250 for(int j = 0; j < num_outputs; j++) { 00251 std::string token; 00252 coefstream >> token; 00253 buffer[i + j * ldim] = atof(token.c_str()); 00254 } 00255 } 00256 } 00257 00258 private: 00259 coef_type _coef; 00260 int _num_input_features; 00261 std::vector<const feature_transform_type *> _maps; // TODO use shared_ptr 00262 bool _scale_maps; 00263 00264 std::vector<int> _starts, _finishes; 00265 }; 00266 00267 template <typename InputType, typename OutputType> 00268 double model_t<InputType, OutputType>::evaluate(OutputType& Yt, 00269 OutputType& Yp, const boost::mpi::communicator& comm) { 00270 00271 int rank = comm.rank(); 00272 00273 int correct = classification_accuracy(Yt, Yp); 00274 double accuracy = 0.0; 00275 int totalcorrect, total; 00276 boost::mpi::reduce(comm, correct, totalcorrect, std::plus<double>(), 0); 00277 boost::mpi::reduce(comm, Yt.Height(), total, std::plus<int>(), 0); 00278 00279 if(rank ==0) 00280 accuracy = totalcorrect*100.0/total; 00281 return accuracy; 00282 } 00283 00284 00285 } } 00286 00287 00288 #endif /* SKYLARK_ML_MODEL_HPP */