Skylark (Sketching Library)
0.1
hash_transform_Mixed.hpp
#ifndef SKYLARK_HASH_TRANSFORM_MIXED_HPP
#define SKYLARK_HASH_TRANSFORM_MIXED_HPP

#include <map>
#include <set>
#include <vector>
#include <boost/serialization/map.hpp>

#include "../utility/external/combblas_comm_grid.hpp"

namespace skylark { namespace sketch {

//FIXME:
// - Benchmark one-sided vs. col/row comm (or a midpoint scheme):
//   most likely the best scheme depends on the output Elemental
//   distribution; here we use the same comm scheme for all output types.
// - Process the sparse matrix in blocks?
// - MPI-3 one-sided, see: "Enabling Highly-Scalable Remote Memory Access
//   Programming with MPI-3 One Sided", R. Gerstenberger, M. Besta, and
//   T. Hoefler.


/* Specialization: SpParMat for input, distributed Elemental for output */
template <typename IndexType,
          typename ValueType,
          elem::Distribution ColDist,
          elem::Distribution RowDist,
          template <typename> class IdxDistributionType,
          template <typename> class ValueDistribution>
struct hash_transform_t <
    SpParMat<IndexType, ValueType, SpDCCols<IndexType, ValueType> >,
    elem::DistMatrix<ValueType, ColDist, RowDist>,
    IdxDistributionType,
    ValueDistribution > :
        public hash_transform_data_t<IdxDistributionType,
                                     ValueDistribution> {
    typedef IndexType index_type;
    typedef ValueType value_type;
    typedef SpDCCols< index_type, value_type > col_t;
    typedef FullyDistVec< index_type, value_type > mpi_vector_t;
    typedef SpParMat< index_type, value_type, col_t > matrix_type;
    typedef elem::DistMatrix< value_type, ColDist, RowDist > output_matrix_type;
    typedef hash_transform_data_t<IdxDistributionType,
                                  ValueDistribution> data_type;

    /* Regular constructor: sketch from dimension N down to S. */
    hash_transform_t (int N, int S, base::context_t& context) :
        data_type(N, S, context) {
    }

    /* Copy constructor from a transform over different matrix types. */
    template <typename InputMatrixType,
              typename OutputMatrixType>
    hash_transform_t (hash_transform_t<InputMatrixType,
                                       OutputMatrixType,
                                       IdxDistributionType,
                                       ValueDistribution>& other) :
        data_type(other) {}

    /* Constructor from the underlying transform data. */
    hash_transform_t (hash_transform_data_t<IdxDistributionType,
                                            ValueDistribution>& other_data) :
        data_type(other_data) {}

    template <typename Dimension>
    void apply (const matrix_type &A, output_matrix_type &sketch_of_A,
                Dimension dimension) const {
        try {
            apply_impl (A, sketch_of_A, dimension);
        } catch (const boost::mpi::exception& e) {
            SKYLARK_THROW_EXCEPTION (
                base::mpi_exception()
                    << base::error_msg(e.what()) );
        } catch (const std::string& e) {
            SKYLARK_THROW_EXCEPTION (
                base::combblas_exception()
                    << base::error_msg(e) );
        } catch (const std::logic_error& e) {
            SKYLARK_THROW_EXCEPTION (
                base::combblas_exception()
                    << base::error_msg(e.what()) );
        } catch (const std::bad_alloc& e) {
            SKYLARK_THROW_EXCEPTION (
                base::skylark_exception()
                    << base::error_msg("bad_alloc: out of memory") );
        }
    }


private:
    /* Sketch A into sketch_of_A using one-sided MPI communication. */
    template <typename Dimension>
    void apply_impl (const matrix_type &A_,
                     output_matrix_type &sketch_of_A,
                     Dimension dist) const {

        // We are essentially doing a 'const' access to A, but the
        // necessary 'const' variant is missing from the interface.
        matrix_type &A = const_cast<matrix_type&>(A_);

        const size_t rank = A.getcommgrid()->GetRank();

        // extract the local columns of the matrix
        col_t &data = A.seq();

        const size_t ncols = sketch_of_A.Width();

        const size_t my_row_offset = utility::cb_my_row_offset(A);
        const size_t my_col_offset = utility::cb_my_col_offset(A);

        size_t comm_size = A.getcommgrid()->GetSize();
        std::vector< std::set<size_t> > proc_set(comm_size);

        // pre-compute the processor targets of the local sketch application
        for(typename col_t::SpColIter col = data.begcol();
            col != data.endcol(); col++) {
            for(typename col_t::SpColIter::NzIter nz = data.begnz(col);
                nz != data.endnz(col); nz++) {

                // compute the global row and column id, and compress them
                // into a single target position index
                const index_type rowid = nz.rowid()  + my_row_offset;
                const index_type colid = col.colid() + my_col_offset;
                const size_t pos = getPos(rowid, colid, ncols, dist);

                // compute the target processor for this position
                const size_t target =
                    sketch_of_A.Owner(pos / ncols, pos % ncols);

                if(proc_set[target].count(pos) == 0) {
                    assert(target < comm_size);
                    proc_set[target].insert(pos);
                }
            }
        }
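        // Note: proc_set deduplicates positions per target processor, so
        // two local nonzeros that hash to the same output entry are
        // counted only once when sizing the send arrays below; their
        // contributions are later summed into a single slot.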
        // construct the array holding start/end indices for one-sided
        // access: exclusive prefix sums of the per-processor set sizes
        std::vector<size_t> proc_start_idx(comm_size + 1, 0);
        for(size_t i = 1; i < comm_size + 1; ++i)
            proc_start_idx[i] = proc_start_idx[i-1] + proc_set[i-1].size();

        // proc_start_idx[comm_size] is the total number of nnz that result
        // from applying the sketch locally
        std::vector<index_type> indices(proc_start_idx[comm_size], 0);
        std::vector<value_type> values(proc_start_idx[comm_size], 0);
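        // Worked example (hypothetical values): with comm_size = 2,
        // proc_set[0] = {3, 7} and proc_set[1] = {2}, the prefix sums
        // give proc_start_idx = [0, 2, 3]; the entries destined for
        // processor p then occupy indices[proc_start_idx[p] ..
        // proc_start_idx[p+1]) and the matching slice of values.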
        // Apply the sketch to all local values. Note that some of the
        // resulting values may end up on a different processor. The loop
        // fills the values (sorted by processor id) into one contiguous
        // array; subsequently, one-sided operations can be used to access
        // the values destined for each processor.
        for(typename col_t::SpColIter col = data.begcol();
            col != data.endcol(); col++) {
            for(typename col_t::SpColIter::NzIter nz = data.begnz(col);
                nz != data.endnz(col); nz++) {

                // compute the global row and column id, and compress them
                // into a single target position index
                const index_type rowid = nz.rowid()  + my_row_offset;
                const index_type colid = col.colid() + my_col_offset;
                const size_t pos = getPos(rowid, colid, ncols, dist);

                // compute the target processor for this position
                const size_t proc = sketch_of_A.Owner(pos / ncols, pos % ncols);

                // get the offset in the send arrays for the current element
                const size_t ar_idx = proc_start_idx[proc] +
                    std::distance(proc_set[proc].begin(),
                                  proc_set[proc].find(pos));

                indices[ar_idx] = pos;
                values[ar_idx] += nz.value() *
                    data_type::getValue(rowid, colid, dist);
            }
        }

        // create windows for all relevant arrays
        boost::mpi::communicator comm = utility::get_communicator(A);

        // tell MPI that we will not use locks
        MPI_Info info;
        MPI_Info_create(&info);
        MPI_Info_set(info, "no_locks", "true");

        MPI_Win start_offset_win, idx_win, val_win;

        MPI_Win_create(&proc_start_idx[0], sizeof(size_t) * (comm_size + 1),
                       sizeof(size_t), info, comm, &start_offset_win);

        MPI_Win_create(&indices[0], sizeof(index_type) * indices.size(),
                       sizeof(index_type), info, comm, &idx_win);

        MPI_Win_create(&values[0], sizeof(value_type) * values.size(),
                       sizeof(value_type), info, comm, &val_win);

        MPI_Info_free(&info);

        // Open the exposure epoch: no put operations will follow
        // (read only) and no fence calls precede.
        MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, start_offset_win);
        MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, idx_win);
        MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOPRECEDE, val_win);
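        // In the loop below, every rank reads from each peer p the slice
        // of p's arrays destined for it: first the two offsets
        // proc_start_idx[rank] and proc_start_idx[rank + 1] (a 2-element
        // MPI_Get at displacement rank), then num_values indices and
        // values starting at the first offset.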
        // accumulate the values contributed by all other processors
        for(size_t p = 0; p < comm_size; ++p) {

            // get the start/end offset of our slice on processor p
            std::vector<size_t> offset(2);
            MPI_Get(&(offset[0]), 2, boost::mpi::get_mpi_datatype<size_t>(),
                    p, rank, 2, boost::mpi::get_mpi_datatype<size_t>(),
                    start_offset_win);

            MPI_Win_fence(MPI_MODE_NOPUT, start_offset_win);
            size_t num_values = offset[1] - offset[0];

            // and fill the indices/values destined for this rank
            std::vector<index_type> add_idx(num_values);
            std::vector<value_type> add_val(num_values);
            if(num_values > 0) {  // guard: avoid indexing empty vectors
                MPI_Get(&(add_idx[0]), num_values,
                        boost::mpi::get_mpi_datatype<index_type>(), p,
                        offset[0], num_values,
                        boost::mpi::get_mpi_datatype<index_type>(),
                        idx_win);

                MPI_Get(&(add_val[0]), num_values,
                        boost::mpi::get_mpi_datatype<value_type>(), p,
                        offset[0], num_values,
                        boost::mpi::get_mpi_datatype<value_type>(),
                        val_win);
            }

            MPI_Win_fence(MPI_MODE_NOPUT, idx_win);
            MPI_Win_fence(MPI_MODE_NOPUT, val_win);

            // finally, accumulate the received data into the local buffer
            for(size_t i = 0; i < num_values; ++i) {
                index_type lrow = sketch_of_A.LocalRow(add_idx[i] / ncols);
                index_type lcol = sketch_of_A.LocalCol(add_idx[i] % ncols);
                sketch_of_A.UpdateLocal(lrow, lcol, add_val[i]);
            }
        }

        // Close the epoch: read only and no fence calls follow.
        MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, start_offset_win);
        MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, idx_win);
        MPI_Win_fence(MPI_MODE_NOPUT | MPI_MODE_NOSUCCEED, val_win);

        MPI_Win_free(&start_offset_win);
        MPI_Win_free(&idx_win);
        MPI_Win_free(&val_win);
    }

    /* Map a global (rowid, colid) to a flattened output position:
       columnwise sketching hashes the row index. */
    inline index_type getPos(index_type rowid, index_type colid, size_t ncols,
                             columnwise_tag) const {
        return colid + ncols * data_type::row_idx[rowid];
    }

    /* Rowwise sketching hashes the column index instead. */
    inline index_type getPos(index_type rowid, index_type colid, size_t ncols,
                             rowwise_tag) const {
        return rowid * ncols + data_type::row_idx[colid];
    }
};
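/* Worked example (hypothetical values) for getPos: with ncols = 4 and a
 * hash map row_idx = {2, 0, 1}, the entry (rowid = 2, colid = 1) maps
 * columnwise to pos = colid + ncols * row_idx[rowid] = 1 + 4 * 1 = 5,
 * i.e. output entry (pos / ncols, pos % ncols) = (1, 1); rowwise it maps
 * to pos = rowid * ncols + row_idx[colid] = 2 * 4 + 0 = 8, i.e. (2, 0).
 */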
/* Specialization: SpParMat for input, local Elemental output */
template <typename IndexType,
          typename ValueType,
          template <typename> class IdxDistributionType,
          template <typename> class ValueDistribution>
struct hash_transform_t <
    SpParMat<IndexType, ValueType, SpDCCols<IndexType, ValueType> >,
    elem::Matrix<ValueType>,
    IdxDistributionType,
    ValueDistribution > :
        public hash_transform_data_t<IdxDistributionType,
                                     ValueDistribution> {
    typedef IndexType index_type;
    typedef ValueType value_type;
    typedef SpDCCols< index_type, value_type > col_t;
    typedef FullyDistVec< index_type, value_type > mpi_vector_t;
    typedef SpParMat< index_type, value_type, col_t > matrix_type;
    typedef elem::Matrix< value_type > output_matrix_type;
    typedef hash_transform_data_t<IdxDistributionType,
                                  ValueDistribution> data_type;

    /* Regular constructor: sketch from dimension N down to S. */
    hash_transform_t (int N, int S, base::context_t& context) :
        data_type(N, S, context) {
    }

    /* Copy constructor from a transform over different matrix types. */
    template <typename InputMatrixType,
              typename OutputMatrixType>
    hash_transform_t (hash_transform_t<InputMatrixType,
                                       OutputMatrixType,
                                       IdxDistributionType,
                                       ValueDistribution>& other) :
        data_type(other) {}

    /* Constructor from the underlying transform data. */
    hash_transform_t (hash_transform_data_t<IdxDistributionType,
                                            ValueDistribution>& other_data) :
        data_type(other_data) {}

    template <typename Dimension>
    void apply (const matrix_type &A, output_matrix_type &sketch_of_A,
                Dimension dimension) const {
        try {
            apply_impl (A, sketch_of_A, dimension);
        } catch (const boost::mpi::exception& e) {
            SKYLARK_THROW_EXCEPTION (
                base::mpi_exception()
                    << base::error_msg(e.what()) );
        } catch (const std::string& e) {
            SKYLARK_THROW_EXCEPTION (
                base::combblas_exception()
                    << base::error_msg(e) );
        } catch (const std::logic_error& e) {
            SKYLARK_THROW_EXCEPTION (
                base::combblas_exception()
                    << base::error_msg(e.what()) );
        } catch (const std::bad_alloc& e) {
            SKYLARK_THROW_EXCEPTION (
                base::skylark_exception()
                    << base::error_msg("bad_alloc: out of memory") );
        }
    }


private:
    /* Sketch A, gather the result on rank 0, and populate the local
       output matrix there. */
    template <typename Dimension>
    void apply_impl (const matrix_type &A_,
                     output_matrix_type &sketch_of_A,
                     Dimension dist) const {

        // We are essentially doing a 'const' access to A, but the
        // necessary 'const' variant is missing from the interface.
        matrix_type &A = const_cast<matrix_type&>(A_);

        const size_t rank = A.getcommgrid()->GetRank();

        // extract the local columns of the matrix
        col_t &data = A.seq();

        const size_t my_row_offset = utility::cb_my_row_offset(A);
        const size_t my_col_offset = utility::cb_my_col_offset(A);

        int n_res_cols = A.getncol();
        int n_res_rows = A.getnrow();
        data_type::get_res_size(n_res_rows, n_res_cols, dist);

        // Apply the sketch to all local values. Subsequently, all values
        // are gathered on processor 0 and the local matrix is populated.
        typedef std::map<index_type, value_type> col_values_t;
        col_values_t col_values;
        for(typename col_t::SpColIter col = data.begcol();
            col != data.endcol(); col++) {
            for(typename col_t::SpColIter::NzIter nz = data.begnz(col);
                nz != data.endnz(col); nz++) {

                index_type rowid = nz.rowid()  + my_row_offset;
                index_type colid = col.colid() + my_col_offset;

                const value_type value =
                    nz.value() * data_type::getValue(rowid, colid, dist);
                data_type::finalPos(rowid, colid, dist);
                col_values[colid * n_res_rows + rowid] += value;
            }
        }

        std::vector< std::map<index_type, value_type> > result;
        boost::mpi::gather(utility::get_communicator(A), col_values, result, 0);

        if(rank == 0) {
            typedef typename std::map<index_type, value_type>::iterator itr_t;
            for(size_t i = 0; i < result.size(); ++i) {
                itr_t proc_itr = result[i].begin();
                for(; proc_itr != result[i].end(); proc_itr++) {
                    int row = proc_itr->first % n_res_rows;
                    int col = proc_itr->first / n_res_rows;
                    sketch_of_A.Update(row, col, proc_itr->second);
                }
            }
        }
    }
};
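/* Worked example (hypothetical values) for the linear key used above:
 * with n_res_rows = 3, the entry (rowid = 2, colid = 1) is stored under
 * key colid * n_res_rows + rowid = 5, and rank 0 decodes it back as
 * row = 5 % 3 = 2, col = 5 / 3 = 1 (column-major encoding).
 */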
#if 0
/* Specialization: SpParMat for input, Elemental[* / *] output */
template <typename IndexType,
          typename ValueType,
          template <typename> class IdxDistributionType,
          template <typename> class ValueDistribution>
struct hash_transform_t <
    SpParMat<IndexType, ValueType, SpDCCols<IndexType, ValueType> >,
    elem::DistMatrix<ValueType, elem::STAR, elem::STAR>,
    IdxDistributionType,
    ValueDistribution > :
        public hash_transform_data_t<IdxDistributionType,
                                     ValueDistribution> {
    typedef IndexType index_type;
    typedef ValueType value_type;
    typedef SpDCCols< index_type, value_type > col_t;
    typedef FullyDistVec< index_type, value_type > mpi_vector_t;
    typedef SpParMat< index_type, value_type, col_t > matrix_type;
    typedef elem::DistMatrix< value_type, elem::STAR, elem::STAR > output_matrix_type;
    typedef hash_transform_data_t<IdxDistributionType,
                                  ValueDistribution> data_type;

    /* Regular constructor: sketch from dimension N down to S. */
    hash_transform_t (int N, int S, base::context_t& context) :
        data_type(N, S, context) {
    }

    /* Copy constructor from a transform over different matrix types. */
    template <typename InputMatrixType,
              typename OutputMatrixType>
    hash_transform_t (hash_transform_t<InputMatrixType,
                                       OutputMatrixType,
                                       IdxDistributionType,
                                       ValueDistribution>& other) :
        data_type(other) {}

    /* Constructor from the underlying transform data. */
    hash_transform_t (hash_transform_data_t<IdxDistributionType,
                                            ValueDistribution>& other_data) :
        data_type(other_data) {}

    template <typename Dimension>
    void apply (const matrix_type &A, output_matrix_type &sketch_of_A,
                Dimension dimension) const {
        try {
            apply_impl (A, sketch_of_A, dimension);
        } catch (const boost::mpi::exception& e) {
            SKYLARK_THROW_EXCEPTION (
                base::mpi_exception()
                    << base::error_msg(e.what()) );
        } catch (const std::string& e) {
            SKYLARK_THROW_EXCEPTION (
                base::combblas_exception()
                    << base::error_msg(e) );
        } catch (const std::logic_error& e) {
            SKYLARK_THROW_EXCEPTION (
                base::combblas_exception()
                    << base::error_msg(e.what()) );
        } catch (const std::bad_alloc& e) {
            SKYLARK_THROW_EXCEPTION (
                base::skylark_exception()
                    << base::error_msg("bad_alloc: out of memory") );
        }
    }


private:
    /* Sketch A and replicate the result on all processors. */
    template <typename Dimension>
    void apply_impl (const matrix_type &A_,
                     output_matrix_type &sketch_of_A,
                     Dimension dist) const {

        // We are essentially doing a 'const' access to A, but the
        // necessary 'const' variant is missing from the interface.
        matrix_type &A = const_cast<matrix_type&>(A_);

        const size_t rank = A.getcommgrid()->GetRank();

        // extract the local columns of the matrix
        col_t &data = A.seq();

        const size_t my_row_offset = utility::cb_my_row_offset(A);
        const size_t my_col_offset = utility::cb_my_col_offset(A);

        int n_res_cols = A.getncol();
        int n_res_rows = A.getnrow();
        data_type::get_res_size(n_res_rows, n_res_cols, dist);

        // Apply the sketch to all local values. Subsequently, all values
        // are gathered on all processors and the "local" matrix is
        // populated everywhere.
        typedef std::map<index_type, value_type> col_values_t;
        col_values_t col_values;
        for(typename col_t::SpColIter col = data.begcol();
            col != data.endcol(); col++) {
            for(typename col_t::SpColIter::NzIter nz = data.begnz(col);
                nz != data.endnz(col); nz++) {

                index_type rowid = nz.rowid()  + my_row_offset;
                index_type colid = col.colid() + my_col_offset;

                const value_type value =
                    nz.value() * data_type::getValue(rowid, colid, dist);
                data_type::finalPos(rowid, colid, dist);
                col_values[colid * n_res_rows + rowid] += value;
            }
        }

        std::vector< std::map<index_type, value_type> > result;
        boost::mpi::all_gather(
            utility::get_communicator(A), col_values, result);

        typedef typename std::map<index_type, value_type>::iterator itr_t;
        for(size_t i = 0; i < result.size(); ++i) {
            itr_t proc_itr = result[i].begin();
            for(; proc_itr != result[i].end(); proc_itr++) {
                int row = proc_itr->first % n_res_rows;
                int col = proc_itr->first / n_res_rows;
                sketch_of_A.Update(row, col, proc_itr->second);
            }
        }
    }
};
#endif

} } // namespace sketch, namespace skylark

#endif // SKYLARK_HASH_TRANSFORM_MIXED_HPP
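For orientation, a minimal calling sketch follows. It is illustrative only: the distribution templates IdxDist and ValDist are left as parameters (a concrete Skylark sketch supplies them), sketch_columns is a hypothetical helper, and the output matrix is assumed to be pre-sized to the sketch dimensions.

// Hypothetical calling sketch (not part of hash_transform_Mixed.hpp).
// IdxDist/ValDist stand in for the index/value distributions that a
// concrete Skylark sketch type would supply.
template <template <typename> class IdxDist,
          template <typename> class ValDist>
void sketch_columns(const SpParMat<int, double, SpDCCols<int, double> >& A,
                    elem::DistMatrix<double, elem::VC, elem::STAR>& SA,
                    int n, int s, skylark::base::context_t& context) {
    // Sketch the n rows of A down to s rows (s << n); SA is assumed to
    // be s x A.getncol().
    skylark::sketch::hash_transform_t<
        SpParMat<int, double, SpDCCols<int, double> >,
        elem::DistMatrix<double, elem::VC, elem::STAR>,
        IdxDist, ValDist> sketch(n, s, context);
    sketch.apply(A, SA, skylark::sketch::columnwise_tag());
}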