ViennaCL - The Vienna Computing Library
1.5.2
|
00001 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_ 00002 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00025 #include "viennacl/forwards.h" 00026 #include "viennacl/scalar.hpp" 00027 #include "viennacl/vector.hpp" 00028 #include "viennacl/tools/tools.hpp" 00029 #include "viennacl/linalg/cuda/common.hpp" 00030 00031 #include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp" 00032 00033 namespace viennacl 00034 { 00035 namespace linalg 00036 { 00037 namespace cuda 00038 { 00039 // 00040 // Compressed matrix 00041 // 00042 00043 namespace detail 00044 { 00045 00046 template <typename T> 00047 __global__ void csr_row_info_extractor_kernel( 00048 const unsigned int * row_indices, 00049 const unsigned int * column_indices, 00050 const T * elements, 00051 T * result, 00052 unsigned int size, 00053 unsigned int option) 00054 { 00055 for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; 00056 row < size; 00057 row += gridDim.x * blockDim.x) 00058 { 00059 T value = 0; 00060 unsigned int row_end = row_indices[row+1]; 00061 00062 switch (option) 00063 { 00064 case 0: //inf-norm 00065 for (unsigned int i = row_indices[row]; i < row_end; ++i) 00066 value = max(value, fabs(elements[i])); 00067 break; 00068 00069 case 1: //1-norm 00070 for (unsigned int i = row_indices[row]; i < row_end; ++i) 00071 value += fabs(elements[i]); 00072 break; 00073 00074 case 2: //2-norm 00075 for (unsigned int i = row_indices[row]; i < row_end; ++i) 00076 value += elements[i] * elements[i]; 00077 value = sqrt(value); 00078 break; 00079 00080 case 3: //diagonal entry 00081 for (unsigned int i = row_indices[row]; i < row_end; ++i) 00082 { 00083 if (column_indices[i] == row) 00084 { 00085 value = elements[i]; 00086 break; 00087 } 00088 } 00089 break; 00090 00091 default: 00092 break; 00093 } 00094 result[row] = value; 00095 } 00096 } 00097 00098 00099 template<typename ScalarType, unsigned int MAT_ALIGNMENT> 00100 void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat, 00101 vector_base<ScalarType> & vec, 00102 viennacl::linalg::detail::row_info_types info_selector) 00103 { 00104 csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00105 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00106 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00107 detail::cuda_arg<ScalarType>(vec), 00108 static_cast<unsigned int>(mat.size1()), 00109 static_cast<unsigned int>(info_selector) 00110 ); 00111 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel"); 00112 } 00113 00114 } //namespace detail 00115 00116 00117 template <typename T> 00118 __global__ void compressed_matrix_vec_mul_kernel( 00119 const unsigned int * row_indices, 00120 const unsigned int * column_indices, 00121 const T * elements, 00122 const T * x, 00123 unsigned int start_x, 00124 unsigned int inc_x, 00125 T * result, 00126 unsigned int start_result, 00127 unsigned int inc_result, 00128 unsigned int size_result) 00129 { 00130 for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; 00131 row < size_result; 00132 row += gridDim.x * blockDim.x) 00133 { 00134 T dot_prod = (T)0; 00135 unsigned int row_end = row_indices[row+1]; 00136 for (unsigned int i = row_indices[row]; i < row_end; ++i) 00137 dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x]; 00138 result[row * inc_result + start_result] = dot_prod; 00139 } 00140 } 00141 00142 00143 00144 00145 00154 template<class ScalarType, unsigned int ALIGNMENT> 00155 void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & mat, 00156 const viennacl::vector_base<ScalarType> & vec, 00157 viennacl::vector_base<ScalarType> & result) 00158 { 00159 compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00160 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00161 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00162 detail::cuda_arg<ScalarType>(vec), 00163 static_cast<unsigned int>(vec.start()), 00164 static_cast<unsigned int>(vec.stride()), 00165 detail::cuda_arg<ScalarType>(result), 00166 static_cast<unsigned int>(result.start()), 00167 static_cast<unsigned int>(result.stride()), 00168 static_cast<unsigned int>(result.size()) 00169 ); 00170 VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel"); 00171 } 00172 00177 template <typename LayoutT> 00178 struct mat_mult_matrix_index 00179 { 00180 static __device__ unsigned int apply(unsigned int i, unsigned int j, 00181 unsigned int row_start, unsigned int row_inc, 00182 unsigned int col_start, unsigned int col_inc, 00183 unsigned int internal_rows, unsigned int internal_cols) 00184 { 00185 return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc; 00186 } 00187 }; 00188 00190 template <> 00191 struct mat_mult_matrix_index<viennacl::column_major> 00192 { 00193 static __device__ unsigned int apply(unsigned int i, unsigned int j, 00194 unsigned int row_start, unsigned int row_inc, 00195 unsigned int col_start, unsigned int col_inc, 00196 unsigned int internal_rows, unsigned int internal_cols) 00197 { 00198 return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows; 00199 } 00200 }; 00204 template <typename DMatIndexT, typename ResultIndexT, typename T> 00205 __global__ void compressed_matrix_d_mat_mul_kernel( 00206 const unsigned int * sp_mat_row_indices, 00207 const unsigned int * sp_mat_col_indices, 00208 const T * sp_mat_elements, 00209 const T * d_mat, 00210 unsigned int d_mat_row_start, 00211 unsigned int d_mat_col_start, 00212 unsigned int d_mat_row_inc, 00213 unsigned int d_mat_col_inc, 00214 unsigned int d_mat_row_size, 00215 unsigned int d_mat_col_size, 00216 unsigned int d_mat_internal_rows, 00217 unsigned int d_mat_internal_cols, 00218 T * result, 00219 unsigned int result_row_start, 00220 unsigned int result_col_start, 00221 unsigned int result_row_inc, 00222 unsigned int result_col_inc, 00223 unsigned int result_row_size, 00224 unsigned int result_col_size, 00225 unsigned int result_internal_rows, 00226 unsigned int result_internal_cols) { 00227 00228 for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x) { 00229 00230 unsigned int row_start = sp_mat_row_indices[row]; 00231 unsigned int row_end = sp_mat_row_indices[row+1]; 00232 00233 for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) { 00234 00235 T r = 0; 00236 00237 for (unsigned int k = row_start; k < row_end; k++) { 00238 00239 unsigned int j = sp_mat_col_indices[k]; 00240 T x = sp_mat_elements[k]; 00241 T y = d_mat[ DMatIndexT::apply(j, col, 00242 d_mat_row_start, d_mat_row_inc, 00243 d_mat_col_start, d_mat_col_inc, 00244 d_mat_internal_rows, d_mat_internal_cols) ]; 00245 00246 r += x * y; 00247 } 00248 00249 result [ ResultIndexT::apply(row, col, 00250 result_row_start, result_row_inc, 00251 result_col_start, result_col_inc, 00252 result_internal_rows, result_internal_cols) ] = r; 00253 } 00254 00255 } 00256 00257 } 00258 00259 00268 template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2> 00269 void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat, 00270 const viennacl::matrix_base<TYPE, F1> & d_mat, 00271 viennacl::matrix_base<TYPE, F2> & result) { 00272 compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>> 00273 (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()), 00274 detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()), 00275 detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()), 00276 00277 detail::cuda_arg<TYPE>(d_mat), 00278 static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)), 00279 static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)), 00280 static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)), 00281 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)), 00282 00283 detail::cuda_arg<TYPE>(result), 00284 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 00285 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 00286 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 00287 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 00288 ); 00289 VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel"); 00290 } 00291 00292 00293 template <typename DMatIndexT, typename ResultIndexT, typename T> 00294 __global__ void compressed_matrix_d_tr_mat_mul_kernel( 00295 const unsigned int * sp_mat_row_indices, 00296 const unsigned int * sp_mat_col_indices, 00297 const T * sp_mat_elements, 00298 const T * d_mat, 00299 unsigned int d_mat_row_start, 00300 unsigned int d_mat_col_start, 00301 unsigned int d_mat_row_inc, 00302 unsigned int d_mat_col_inc, 00303 unsigned int d_mat_row_size, 00304 unsigned int d_mat_col_size, 00305 unsigned int d_mat_internal_rows, 00306 unsigned int d_mat_internal_cols, 00307 T * result, 00308 unsigned int result_row_start, 00309 unsigned int result_col_start, 00310 unsigned int result_row_inc, 00311 unsigned int result_col_inc, 00312 unsigned int result_row_size, 00313 unsigned int result_col_size, 00314 unsigned int result_internal_rows, 00315 unsigned int result_internal_cols) { 00316 00317 for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x) { 00318 00319 unsigned int row_start = sp_mat_row_indices[row]; 00320 unsigned int row_end = sp_mat_row_indices[row+1]; 00321 00322 for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) { 00323 00324 T r = 0; 00325 00326 for (unsigned int k = row_start; k < row_end; k++) { 00327 00328 unsigned int j = sp_mat_col_indices[k]; 00329 T x = sp_mat_elements[k]; 00330 T y = d_mat[ DMatIndexT::apply(col, j, 00331 d_mat_row_start, d_mat_row_inc, 00332 d_mat_col_start, d_mat_col_inc, 00333 d_mat_internal_rows, d_mat_internal_cols) ]; 00334 00335 r += x * y; 00336 } 00337 00338 result [ ResultIndexT::apply(row, col, 00339 result_row_start, result_row_inc, 00340 result_col_start, result_col_inc, 00341 result_internal_rows, result_internal_cols) ] = r; 00342 } 00343 } 00344 00345 } 00346 00356 template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2> 00357 void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat, 00358 const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>, 00359 const viennacl::matrix_base<TYPE, F1>, 00360 viennacl::op_trans > & d_mat, 00361 viennacl::matrix_base<TYPE, F2> & result) { 00362 00363 compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>> 00364 (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()), 00365 detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()), 00366 detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()), 00367 00368 detail::cuda_arg<TYPE>(d_mat.lhs()), 00369 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())), 00370 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())), 00371 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())), 00372 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())), 00373 00374 detail::cuda_arg<TYPE>(result), 00375 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 00376 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 00377 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 00378 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 00379 ); 00380 VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel"); 00381 } 00382 00383 00384 // 00385 // triangular solves for compressed_matrix 00386 // 00387 00388 template <typename T> 00389 __global__ void compressed_matrix_diagonal_kernel( 00390 const unsigned int * row_indices, 00391 const unsigned int * column_indices, 00392 const T * elements, 00393 T * result, 00394 unsigned int size) 00395 { 00396 for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; 00397 row < size; 00398 row += gridDim.x * blockDim.x) 00399 { 00400 T diag = (T)0; 00401 unsigned int row_end = row_indices[row+1]; 00402 for (unsigned int i = row_indices[row]; i < row_end; ++i) 00403 { 00404 unsigned int col_index = column_indices[i]; 00405 if (col_index == row) 00406 { 00407 diag = elements[i]; 00408 break; 00409 } 00410 } 00411 result[row] = diag; 00412 } 00413 } 00414 00415 00421 template<typename SparseMatrixType, class ScalarType> 00422 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00423 inplace_solve(const SparseMatrixType & mat, 00424 viennacl::vector_base<ScalarType> & vec, 00425 viennacl::linalg::unit_lower_tag) 00426 { 00427 csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00428 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00429 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00430 detail::cuda_arg<ScalarType>(vec), 00431 static_cast<unsigned int>(mat.size1()) 00432 ); 00433 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel"); 00434 } 00435 00436 00442 template<typename SparseMatrixType, class ScalarType> 00443 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00444 inplace_solve(const SparseMatrixType & mat, 00445 viennacl::vector_base<ScalarType> & vec, 00446 viennacl::linalg::lower_tag) 00447 { 00448 csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00449 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00450 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00451 detail::cuda_arg<ScalarType>(vec), 00452 static_cast<unsigned int>(mat.size1()) 00453 ); 00454 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel"); 00455 } 00456 00457 00458 00464 template<typename SparseMatrixType, class ScalarType> 00465 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00466 inplace_solve(const SparseMatrixType & mat, 00467 viennacl::vector_base<ScalarType> & vec, 00468 viennacl::linalg::unit_upper_tag) 00469 { 00470 csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00471 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00472 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00473 detail::cuda_arg<ScalarType>(vec), 00474 static_cast<unsigned int>(mat.size1()) 00475 ); 00476 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel"); 00477 } 00478 00479 00485 template<typename SparseMatrixType, class ScalarType> 00486 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00487 inplace_solve(const SparseMatrixType & mat, 00488 viennacl::vector_base<ScalarType> & vec, 00489 viennacl::linalg::upper_tag) 00490 { 00491 csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00492 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00493 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00494 detail::cuda_arg<ScalarType>(vec), 00495 static_cast<unsigned int>(mat.size1()) 00496 ); 00497 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel"); 00498 } 00499 00500 00501 00502 // transposed 00503 00509 template<typename SparseMatrixType, class ScalarType> 00510 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00511 inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat, 00512 viennacl::vector_base<ScalarType> & vec, 00513 viennacl::linalg::unit_lower_tag) 00514 { 00515 csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()), 00516 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()), 00517 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()), 00518 detail::cuda_arg<ScalarType>(vec), 00519 static_cast<unsigned int>(mat.lhs().size1()) 00520 ); 00521 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel"); 00522 } 00523 00524 00530 template<typename SparseMatrixType, class ScalarType> 00531 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00532 inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat, 00533 viennacl::vector_base<ScalarType> & vec, 00534 viennacl::linalg::lower_tag) 00535 { 00536 viennacl::vector<ScalarType> diagonal(vec.size()); 00537 00538 compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()), 00539 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()), 00540 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()), 00541 detail::cuda_arg<ScalarType>(diagonal), 00542 static_cast<unsigned int>(mat.size1()) 00543 ); 00544 00545 csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()), 00546 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()), 00547 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()), 00548 detail::cuda_arg<ScalarType>(diagonal), 00549 detail::cuda_arg<ScalarType>(vec), 00550 static_cast<unsigned int>(mat.lhs().size1()) 00551 ); 00552 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel"); 00553 } 00554 00555 00561 template<typename SparseMatrixType, class ScalarType> 00562 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00563 inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat, 00564 viennacl::vector_base<ScalarType> & vec, 00565 viennacl::linalg::unit_upper_tag) 00566 { 00567 csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()), 00568 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()), 00569 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()), 00570 detail::cuda_arg<ScalarType>(vec), 00571 static_cast<unsigned int>(mat.lhs().size1()) 00572 ); 00573 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel"); 00574 } 00575 00576 00582 template<typename SparseMatrixType, class ScalarType> 00583 typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type 00584 inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat, 00585 viennacl::vector_base<ScalarType> & vec, 00586 viennacl::linalg::upper_tag) 00587 { 00588 viennacl::vector<ScalarType> diagonal(vec.size()); 00589 00590 compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()), 00591 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()), 00592 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()), 00593 detail::cuda_arg<ScalarType>(diagonal), 00594 static_cast<unsigned int>(mat.size1()) 00595 ); 00596 00597 csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()), 00598 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()), 00599 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()), 00600 detail::cuda_arg<ScalarType>(diagonal), 00601 detail::cuda_arg<ScalarType>(vec), 00602 static_cast<unsigned int>(mat.lhs().size1()) 00603 ); 00604 VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel"); 00605 } 00606 00607 namespace detail 00608 { 00609 // 00610 // block solves 00611 // 00612 template<typename ScalarType, unsigned int MAT_ALIGNMENT> 00613 void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>, 00614 const compressed_matrix<ScalarType, MAT_ALIGNMENT>, 00615 op_trans> & L, 00616 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks, 00617 vector_base<ScalarType> const & /* L_diagonal */, //ignored 00618 vector_base<ScalarType> & vec, 00619 viennacl::linalg::unit_lower_tag) 00620 { 00621 csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()), 00622 detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()), 00623 detail::cuda_arg<ScalarType>(L.lhs().handle().cuda_handle()), 00624 detail::cuda_arg<unsigned int>(block_indices.cuda_handle()), 00625 detail::cuda_arg<ScalarType>(vec), 00626 static_cast<unsigned int>(L.lhs().size1()) 00627 ); 00628 } 00629 00630 00631 template<typename ScalarType, unsigned int MAT_ALIGNMENT> 00632 void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>, 00633 const compressed_matrix<ScalarType, MAT_ALIGNMENT>, 00634 op_trans> & U, 00635 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks, 00636 vector_base<ScalarType> const & U_diagonal, 00637 vector_base<ScalarType> & vec, 00638 viennacl::linalg::upper_tag) 00639 { 00640 csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()), 00641 detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()), 00642 detail::cuda_arg<ScalarType>(U.lhs().handle().cuda_handle()), 00643 detail::cuda_arg<ScalarType>(U_diagonal.handle().cuda_handle()), 00644 detail::cuda_arg<unsigned int>(block_indices.cuda_handle()), 00645 detail::cuda_arg<ScalarType>(vec), 00646 static_cast<unsigned int>(U.lhs().size1()) 00647 ); 00648 } 00649 00650 00651 } 00652 00653 00654 // 00655 // Compressed Compressed Matrix 00656 // 00657 00658 template <typename T> 00659 __global__ void compressed_compressed_matrix_vec_mul_kernel( 00660 const unsigned int * row_jumper, 00661 const unsigned int * row_indices, 00662 const unsigned int * column_indices, 00663 const T * elements, 00664 unsigned int nonzero_rows, 00665 const T * x, 00666 unsigned int start_x, 00667 unsigned int inc_x, 00668 T * result, 00669 unsigned int start_result, 00670 unsigned int inc_result, 00671 unsigned int size_result) 00672 { 00673 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00674 i < size_result; 00675 i += gridDim.x * blockDim.x) 00676 { 00677 result[i * inc_result + start_result] = 0; 00678 } 00679 00680 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00681 i < nonzero_rows; 00682 i += gridDim.x * blockDim.x) 00683 { 00684 T dot_prod = (T)0; 00685 unsigned int row_end = row_jumper[i+1]; 00686 for (unsigned int j = row_jumper[i]; j < row_end; ++j) 00687 dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x]; 00688 result[row_indices[i] * inc_result + start_result] = dot_prod; 00689 } 00690 } 00691 00692 00701 template<class ScalarType> 00702 void prod_impl(const viennacl::compressed_compressed_matrix<ScalarType> & mat, 00703 const viennacl::vector_base<ScalarType> & vec, 00704 viennacl::vector_base<ScalarType> & result) 00705 { 00706 compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()), 00707 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()), 00708 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 00709 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00710 static_cast<unsigned int>(mat.nnz1()), 00711 detail::cuda_arg<ScalarType>(vec), 00712 static_cast<unsigned int>(vec.start()), 00713 static_cast<unsigned int>(vec.stride()), 00714 detail::cuda_arg<ScalarType>(result), 00715 static_cast<unsigned int>(result.start()), 00716 static_cast<unsigned int>(result.stride()), 00717 static_cast<unsigned int>(result.size()) 00718 ); 00719 VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel"); 00720 } 00721 00722 // 00723 // Coordinate Matrix 00724 // 00725 00726 00727 namespace detail 00728 { 00729 00730 template <typename T> 00731 __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index) 00732 const T * elements, 00733 const unsigned int * group_boundaries, 00734 T * result, 00735 unsigned int option) 00736 { 00737 __shared__ unsigned int shared_rows[128]; 00738 __shared__ T inter_results[128]; 00739 00740 uint2 tmp; 00741 T val; 00742 unsigned int last_index = blockDim.x - 1; 00743 unsigned int group_start = group_boundaries[blockIdx.x]; 00744 unsigned int group_end = group_boundaries[blockIdx.x + 1]; 00745 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x 00746 00747 unsigned int local_index = 0; 00748 00749 for (unsigned int k = 0; k < k_end; ++k) 00750 { 00751 local_index = group_start + k * blockDim.x + threadIdx.x; 00752 00753 tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0); 00754 val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; 00755 00756 //check for carry from previous loop run: 00757 if (threadIdx.x == 0 && k > 0) 00758 { 00759 if (tmp.x == shared_rows[last_index]) 00760 { 00761 switch (option) 00762 { 00763 case 0: //inf-norm 00764 case 3: //diagonal entry 00765 val = max(val, fabs(inter_results[last_index])); 00766 break; 00767 00768 case 1: //1-norm 00769 val = fabs(val) + inter_results[last_index]; 00770 break; 00771 00772 case 2: //2-norm 00773 val = sqrt(val * val + inter_results[last_index]); 00774 break; 00775 00776 default: 00777 break; 00778 } 00779 } 00780 else 00781 { 00782 switch (option) 00783 { 00784 case 0: //inf-norm 00785 case 1: //1-norm 00786 case 3: //diagonal entry 00787 result[shared_rows[last_index]] = inter_results[last_index]; 00788 break; 00789 00790 case 2: //2-norm 00791 result[shared_rows[last_index]] = sqrt(inter_results[last_index]); 00792 default: 00793 break; 00794 } 00795 } 00796 } 00797 00798 //segmented parallel reduction begin 00799 __syncthreads(); 00800 shared_rows[threadIdx.x] = tmp.x; 00801 switch (option) 00802 { 00803 case 0: 00804 case 3: 00805 inter_results[threadIdx.x] = val; 00806 break; 00807 case 1: 00808 inter_results[threadIdx.x] = fabs(val); 00809 break; 00810 case 2: 00811 inter_results[threadIdx.x] = val * val; 00812 default: 00813 break; 00814 } 00815 T left = 0; 00816 __syncthreads(); 00817 00818 for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 00819 { 00820 left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0; 00821 __syncthreads(); 00822 switch (option) 00823 { 00824 case 0: //inf-norm 00825 case 3: //diagonal entry 00826 inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left); 00827 break; 00828 00829 case 1: //1-norm 00830 inter_results[threadIdx.x] += left; 00831 break; 00832 00833 case 2: //2-norm 00834 inter_results[threadIdx.x] += left; 00835 break; 00836 00837 default: 00838 break; 00839 } 00840 __syncthreads(); 00841 } 00842 //segmented parallel reduction end 00843 00844 if (threadIdx.x != last_index && 00845 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] && 00846 inter_results[threadIdx.x] != 0) 00847 { 00848 result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x]; 00849 } 00850 00851 __syncthreads(); 00852 } //for k 00853 00854 if (threadIdx.x == last_index && inter_results[last_index] != 0) 00855 result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index]; 00856 } 00857 00858 template<typename ScalarType, unsigned int MAT_ALIGNMENT> 00859 void row_info(coordinate_matrix<ScalarType, MAT_ALIGNMENT> const & mat, 00860 vector_base<ScalarType> & vec, 00861 viennacl::linalg::detail::row_info_types info_selector) 00862 { 00863 coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()), 00864 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00865 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()), 00866 detail::cuda_arg<ScalarType>(vec), 00867 static_cast<unsigned int>(info_selector) 00868 ); 00869 VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor"); 00870 } 00871 00872 } //namespace detail 00873 00874 00875 template <typename T> 00876 __global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index) 00877 const T * elements, 00878 const unsigned int * group_boundaries, 00879 const T * x, 00880 unsigned int start_x, 00881 unsigned int inc_x, 00882 T * result, 00883 unsigned int start_result, 00884 unsigned int inc_result 00885 ) 00886 { 00887 __shared__ unsigned int shared_rows[128]; 00888 __shared__ T inter_results[128]; 00889 00890 uint2 tmp; 00891 T val; 00892 unsigned int group_start = group_boundaries[blockIdx.x]; 00893 unsigned int group_end = group_boundaries[blockIdx.x + 1]; 00894 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x 00895 00896 unsigned int local_index = 0; 00897 00898 for (unsigned int k = 0; k < k_end; ++k) 00899 { 00900 local_index = group_start + k * blockDim.x + threadIdx.x; 00901 00902 tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0); 00903 val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0; 00904 00905 //check for carry from previous loop run: 00906 if (threadIdx.x == 0 && k > 0) 00907 { 00908 if (tmp.x == shared_rows[blockDim.x-1]) 00909 val += inter_results[blockDim.x-1]; 00910 else 00911 result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1]; 00912 } 00913 00914 //segmented parallel reduction begin 00915 __syncthreads(); 00916 shared_rows[threadIdx.x] = tmp.x; 00917 inter_results[threadIdx.x] = val; 00918 T left = 0; 00919 __syncthreads(); 00920 00921 for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 00922 { 00923 left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0; 00924 __syncthreads(); 00925 inter_results[threadIdx.x] += left; 00926 __syncthreads(); 00927 } 00928 //segmented parallel reduction end 00929 00930 if (local_index < group_end && threadIdx.x < blockDim.x-1 && 00931 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1]) 00932 { 00933 result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x]; 00934 } 00935 00936 __syncthreads(); 00937 } //for k 00938 00939 if (local_index + 1 == group_end) 00940 result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x]; 00941 } 00942 00943 00952 template<class ScalarType, unsigned int ALIGNMENT> 00953 void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & mat, 00954 const viennacl::vector_base<ScalarType> & vec, 00955 viennacl::vector_base<ScalarType> & result) 00956 { 00957 result.clear(); 00958 00959 coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()), 00960 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 00961 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()), 00962 detail::cuda_arg<ScalarType>(vec), 00963 static_cast<unsigned int>(vec.start()), 00964 static_cast<unsigned int>(vec.stride()), 00965 detail::cuda_arg<ScalarType>(result), 00966 static_cast<unsigned int>(result.start()), 00967 static_cast<unsigned int>(result.stride()) 00968 ); 00969 VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel"); 00970 } 00971 00972 00973 00974 00975 template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT> 00976 __global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index) 00977 const ScalarType * elements, 00978 const unsigned int * group_boundaries, 00979 const NumericT * d_mat, 00980 unsigned int d_mat_row_start, 00981 unsigned int d_mat_col_start, 00982 unsigned int d_mat_row_inc, 00983 unsigned int d_mat_col_inc, 00984 unsigned int d_mat_row_size, 00985 unsigned int d_mat_col_size, 00986 unsigned int d_mat_internal_rows, 00987 unsigned int d_mat_internal_cols, 00988 NumericT * result, 00989 unsigned int result_row_start, 00990 unsigned int result_col_start, 00991 unsigned int result_row_inc, 00992 unsigned int result_col_inc, 00993 unsigned int result_row_size, 00994 unsigned int result_col_size, 00995 unsigned int result_internal_rows, 00996 unsigned int result_internal_cols) 00997 { 00998 __shared__ unsigned int shared_rows[128]; 00999 __shared__ NumericT inter_results[128]; 01000 01001 uint2 tmp; 01002 NumericT val; 01003 unsigned int group_start = group_boundaries[blockIdx.x]; 01004 unsigned int group_end = group_boundaries[blockIdx.x + 1]; 01005 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x 01006 01007 unsigned int local_index = 0; 01008 01009 for (unsigned int result_col = 0; result_col < result_col_size; ++result_col) 01010 { 01011 for (unsigned int k = 0; k < k_end; ++k) 01012 { 01013 local_index = group_start + k * blockDim.x + threadIdx.x; 01014 01015 tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0); 01016 val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col, 01017 d_mat_row_start, d_mat_row_inc, 01018 d_mat_col_start, d_mat_col_inc, 01019 d_mat_internal_rows, d_mat_internal_cols) ] : 0; 01020 01021 //check for carry from previous loop run: 01022 if (threadIdx.x == 0 && k > 0) 01023 { 01024 if (tmp.x == shared_rows[blockDim.x-1]) 01025 val += inter_results[blockDim.x-1]; 01026 else 01027 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col, 01028 result_row_start, result_row_inc, 01029 result_col_start, result_col_inc, 01030 result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1]; 01031 } 01032 01033 //segmented parallel reduction begin 01034 __syncthreads(); 01035 shared_rows[threadIdx.x] = tmp.x; 01036 inter_results[threadIdx.x] = val; 01037 NumericT left = 0; 01038 __syncthreads(); 01039 01040 for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 01041 { 01042 left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0; 01043 __syncthreads(); 01044 inter_results[threadIdx.x] += left; 01045 __syncthreads(); 01046 } 01047 //segmented parallel reduction end 01048 01049 if (local_index < group_end && threadIdx.x < blockDim.x-1 && 01050 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1]) 01051 { 01052 result[ResultIndexT::apply(tmp.x, result_col, 01053 result_row_start, result_row_inc, 01054 result_col_start, result_col_inc, 01055 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x]; 01056 } 01057 01058 __syncthreads(); 01059 } //for k 01060 01061 if (local_index + 1 == group_end) 01062 result[ResultIndexT::apply(tmp.x, result_col, 01063 result_row_start, result_row_inc, 01064 result_col_start, result_col_inc, 01065 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x]; 01066 } 01067 } 01068 01069 01078 template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2> 01079 void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & sp_mat, 01080 const viennacl::matrix_base<NumericT, F1> & d_mat, 01081 viennacl::matrix_base<NumericT, F2> & result) { 01082 01083 coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>> 01084 (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()), 01085 detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()), 01086 detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()), 01087 01088 detail::cuda_arg<NumericT>(d_mat), 01089 static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)), 01090 static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)), 01091 static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)), 01092 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)), 01093 01094 detail::cuda_arg<NumericT>(result), 01095 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 01096 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 01097 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 01098 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 01099 ); 01100 01101 } 01102 01103 template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT> 01104 __global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index) 01105 const ScalarType * elements, 01106 const unsigned int * group_boundaries, 01107 const NumericT * d_mat, 01108 unsigned int d_mat_row_start, 01109 unsigned int d_mat_col_start, 01110 unsigned int d_mat_row_inc, 01111 unsigned int d_mat_col_inc, 01112 unsigned int d_mat_row_size, 01113 unsigned int d_mat_col_size, 01114 unsigned int d_mat_internal_rows, 01115 unsigned int d_mat_internal_cols, 01116 NumericT * result, 01117 unsigned int result_row_start, 01118 unsigned int result_col_start, 01119 unsigned int result_row_inc, 01120 unsigned int result_col_inc, 01121 unsigned int result_row_size, 01122 unsigned int result_col_size, 01123 unsigned int result_internal_rows, 01124 unsigned int result_internal_cols) 01125 { 01126 __shared__ unsigned int shared_rows[128]; 01127 __shared__ NumericT inter_results[128]; 01128 01129 uint2 tmp; 01130 NumericT val; 01131 unsigned int group_start = group_boundaries[blockIdx.x]; 01132 unsigned int group_end = group_boundaries[blockIdx.x + 1]; 01133 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x 01134 01135 unsigned int local_index = 0; 01136 01137 for (unsigned int result_col = 0; result_col < result_col_size; ++result_col) 01138 { 01139 for (unsigned int k = 0; k < k_end; ++k) 01140 { 01141 local_index = group_start + k * blockDim.x + threadIdx.x; 01142 01143 tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0); 01144 val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y, 01145 d_mat_row_start, d_mat_row_inc, 01146 d_mat_col_start, d_mat_col_inc, 01147 d_mat_internal_rows, d_mat_internal_cols)] : 0; 01148 01149 //check for carry from previous loop run: 01150 if (threadIdx.x == 0 && k > 0) 01151 { 01152 if (tmp.x == shared_rows[blockDim.x-1]) 01153 val += inter_results[blockDim.x-1]; 01154 else 01155 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col, 01156 result_row_start, result_row_inc, 01157 result_col_start, result_col_inc, 01158 result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1]; 01159 } 01160 01161 //segmented parallel reduction begin 01162 __syncthreads(); 01163 shared_rows[threadIdx.x] = tmp.x; 01164 inter_results[threadIdx.x] = val; 01165 NumericT left = 0; 01166 __syncthreads(); 01167 01168 for (unsigned int stride = 1; stride < blockDim.x; stride *= 2) 01169 { 01170 left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0; 01171 __syncthreads(); 01172 inter_results[threadIdx.x] += left; 01173 __syncthreads(); 01174 } 01175 //segmented parallel reduction end 01176 01177 if (local_index < group_end && threadIdx.x < blockDim.x-1 && 01178 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1]) 01179 { 01180 result[ ResultIndexT::apply(tmp.x, result_col, 01181 result_row_start, result_row_inc, 01182 result_col_start, result_col_inc, 01183 result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x]; 01184 } 01185 01186 __syncthreads(); 01187 } //for k 01188 01189 if (local_index + 1 == group_end) 01190 result[ ResultIndexT::apply(tmp.x, result_col, 01191 result_row_start, result_row_inc, 01192 result_col_start, result_col_inc, 01193 result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x]; 01194 } 01195 } 01196 01205 template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2> 01206 void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat, 01207 const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>, 01208 const viennacl::matrix_base<NumericT, F1>, 01209 viennacl::op_trans > & d_mat, 01210 viennacl::matrix_base<NumericT, F2> & result) { 01211 01212 coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>> 01213 (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()), 01214 detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()), 01215 detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()), 01216 01217 detail::cuda_arg<NumericT>(d_mat.lhs()), 01218 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())), 01219 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())), 01220 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())), 01221 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())), 01222 01223 detail::cuda_arg<NumericT>(result), 01224 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 01225 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 01226 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 01227 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 01228 ); 01229 01230 VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel"); 01231 } 01232 01233 01234 // 01235 // ELL Matrix 01236 // 01237 01238 template <typename T> 01239 __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords, 01240 const T * elements, 01241 const T * x, 01242 unsigned int start_x, 01243 unsigned int inc_x, 01244 T * result, 01245 unsigned int start_result, 01246 unsigned int inc_result, 01247 unsigned int row_num, 01248 unsigned int col_num, 01249 unsigned int internal_row_num, 01250 unsigned int items_per_row, 01251 unsigned int aligned_items_per_row 01252 ) 01253 { 01254 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; 01255 unsigned int glb_sz = gridDim.x * blockDim.x; 01256 01257 for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz) 01258 { 01259 T sum = 0; 01260 01261 unsigned int offset = row_id; 01262 for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) 01263 { 01264 T val = elements[offset]; 01265 01266 if(val != (T)0) 01267 { 01268 int col = coords[offset]; 01269 sum += (x[col * inc_x + start_x] * val); 01270 } 01271 } 01272 01273 result[row_id * inc_result + start_result] = sum; 01274 } 01275 } 01276 01277 01286 template<class ScalarType, unsigned int ALIGNMENT> 01287 void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & mat, 01288 const viennacl::vector_base<ScalarType> & vec, 01289 viennacl::vector_base<ScalarType> & result) 01290 { 01291 ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 01292 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 01293 detail::cuda_arg<ScalarType>(vec), 01294 static_cast<unsigned int>(vec.start()), 01295 static_cast<unsigned int>(vec.stride()), 01296 detail::cuda_arg<ScalarType>(result), 01297 static_cast<unsigned int>(result.start()), 01298 static_cast<unsigned int>(result.stride()), 01299 static_cast<unsigned int>(mat.size1()), 01300 static_cast<unsigned int>(mat.size2()), 01301 static_cast<unsigned int>(mat.internal_size1()), 01302 static_cast<unsigned int>(mat.maxnnz()), 01303 static_cast<unsigned int>(mat.internal_maxnnz()) 01304 ); 01305 VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel"); 01306 } 01307 01308 template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT > 01309 __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords, 01310 const ScalarType * sp_mat_elements, 01311 unsigned int sp_mat_row_num, 01312 unsigned int sp_mat_col_num, 01313 unsigned int sp_mat_internal_row_num, 01314 unsigned int sp_mat_items_per_row, 01315 unsigned int sp_mat_aligned_items_per_row, 01316 const NumericT * d_mat, 01317 unsigned int d_mat_row_start, 01318 unsigned int d_mat_col_start, 01319 unsigned int d_mat_row_inc, 01320 unsigned int d_mat_col_inc, 01321 unsigned int d_mat_row_size, 01322 unsigned int d_mat_col_size, 01323 unsigned int d_mat_internal_rows, 01324 unsigned int d_mat_internal_cols, 01325 NumericT * result, 01326 unsigned int result_row_start, 01327 unsigned int result_col_start, 01328 unsigned int result_row_inc, 01329 unsigned int result_col_inc, 01330 unsigned int result_row_size, 01331 unsigned int result_col_size, 01332 unsigned int result_internal_rows, 01333 unsigned int result_internal_cols) { 01334 01335 01336 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; 01337 unsigned int glb_sz = gridDim.x * blockDim.x; 01338 for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz) { 01339 unsigned int row = rc % sp_mat_row_num; 01340 unsigned int col = rc / sp_mat_row_num; 01341 01342 unsigned int offset = row; 01343 NumericT r = (NumericT)0; 01344 01345 for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) { 01346 01347 unsigned int j = sp_mat_coords[offset]; 01348 NumericT x = static_cast<NumericT>(sp_mat_elements[offset]); 01349 01350 if(x != (NumericT)0) { 01351 01352 NumericT y = d_mat[ DMatIndexT::apply(j, col, 01353 d_mat_row_start, d_mat_row_inc, 01354 d_mat_col_start, d_mat_col_inc, 01355 d_mat_internal_rows, d_mat_internal_cols) ]; 01356 01357 r += x*y; 01358 } 01359 } 01360 result [ ResultIndexT::apply(row, col, 01361 result_row_start, result_row_inc, 01362 result_col_start, result_col_inc, 01363 result_internal_rows, result_internal_cols) ] = r; 01364 } 01365 01366 } 01367 01377 template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 > 01378 void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat, 01379 const viennacl::matrix_base<NumericT, F1> & d_mat, 01380 viennacl::matrix_base<NumericT, F2> & result) { 01381 01382 ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>> 01383 (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()), 01384 detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()), 01385 static_cast<unsigned int>(sp_mat.size1()), 01386 static_cast<unsigned int>(sp_mat.size2()), 01387 static_cast<unsigned int>(sp_mat.internal_size1()), 01388 static_cast<unsigned int>(sp_mat.maxnnz()), 01389 static_cast<unsigned int>(sp_mat.internal_maxnnz()), 01390 detail::cuda_arg<NumericT>(d_mat), 01391 static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)), 01392 static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)), 01393 static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)), 01394 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)), 01395 01396 detail::cuda_arg<NumericT>(result), 01397 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 01398 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 01399 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 01400 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 01401 ); 01402 VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel"); 01403 } 01404 01405 template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT > 01406 __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords, 01407 const ScalarType * sp_mat_elements, 01408 unsigned int sp_mat_row_num, 01409 unsigned int sp_mat_col_num, 01410 unsigned int sp_mat_internal_row_num, 01411 unsigned int sp_mat_items_per_row, 01412 unsigned int sp_mat_aligned_items_per_row, 01413 const NumericT * d_mat, 01414 unsigned int d_mat_row_start, 01415 unsigned int d_mat_col_start, 01416 unsigned int d_mat_row_inc, 01417 unsigned int d_mat_col_inc, 01418 unsigned int d_mat_row_size, 01419 unsigned int d_mat_col_size, 01420 unsigned int d_mat_internal_rows, 01421 unsigned int d_mat_internal_cols, 01422 NumericT * result, 01423 unsigned int result_row_start, 01424 unsigned int result_col_start, 01425 unsigned int result_row_inc, 01426 unsigned int result_col_inc, 01427 unsigned int result_row_size, 01428 unsigned int result_col_size, 01429 unsigned int result_internal_rows, 01430 unsigned int result_internal_cols) { 01431 01432 01433 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; 01434 unsigned int glb_sz = gridDim.x * blockDim.x; 01435 for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz) { 01436 unsigned int row = rc % sp_mat_row_num; 01437 unsigned int col = rc / sp_mat_row_num; 01438 01439 unsigned int offset = row; 01440 NumericT r = (NumericT)0; 01441 01442 for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) { 01443 01444 unsigned int j = sp_mat_coords[offset]; 01445 NumericT x = static_cast<NumericT>(sp_mat_elements[offset]); 01446 01447 if(x != (NumericT)0) { 01448 01449 NumericT y = d_mat[ DMatIndexT::apply(col, j, 01450 d_mat_row_start, d_mat_row_inc, 01451 d_mat_col_start, d_mat_col_inc, 01452 d_mat_internal_rows, d_mat_internal_cols) ]; 01453 01454 r += x*y; 01455 } 01456 } 01457 result [ ResultIndexT::apply(row, col, 01458 result_row_start, result_row_inc, 01459 result_col_start, result_col_inc, 01460 result_internal_rows, result_internal_cols) ] = r; 01461 } 01462 01463 } 01464 01474 template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 > 01475 void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat, 01476 const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>, 01477 const viennacl::matrix_base<NumericT, F1>, 01478 viennacl::op_trans > & d_mat, 01479 viennacl::matrix_base<NumericT, F2> & result) { 01480 01481 ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>> 01482 (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()), 01483 detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()), 01484 static_cast<unsigned int>(sp_mat.size1()), 01485 static_cast<unsigned int>(sp_mat.size2()), 01486 static_cast<unsigned int>(sp_mat.internal_size1()), 01487 static_cast<unsigned int>(sp_mat.maxnnz()), 01488 static_cast<unsigned int>(sp_mat.internal_maxnnz()), 01489 01490 detail::cuda_arg<NumericT>(d_mat.lhs()), 01491 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())), 01492 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())), 01493 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())), 01494 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())), 01495 01496 detail::cuda_arg<NumericT>(result), 01497 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 01498 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 01499 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 01500 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 01501 ); 01502 01503 VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel"); 01504 } 01505 01506 // 01507 // Hybrid Matrix 01508 // 01509 01510 01511 template <typename T> 01512 __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords, 01513 const T * ell_elements, 01514 const unsigned int * csr_rows, 01515 const unsigned int * csr_cols, 01516 const T * csr_elements, 01517 const T * x, 01518 unsigned int start_x, 01519 unsigned int inc_x, 01520 T * result, 01521 unsigned int start_result, 01522 unsigned int inc_result, 01523 unsigned int row_num, 01524 unsigned int internal_row_num, 01525 unsigned int items_per_row, 01526 unsigned int aligned_items_per_row 01527 ) 01528 { 01529 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; 01530 unsigned int glb_sz = gridDim.x * blockDim.x; 01531 01532 for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz) 01533 { 01534 T sum = 0; 01535 01536 unsigned int offset = row_id; 01537 for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) 01538 { 01539 T val = ell_elements[offset]; 01540 01541 01542 if(val != 0.0f) 01543 { 01544 int col = ell_coords[offset]; 01545 sum += (x[col * inc_x + start_x] * val); 01546 } 01547 } 01548 01549 unsigned int col_begin = csr_rows[row_id]; 01550 unsigned int col_end = csr_rows[row_id + 1]; 01551 01552 for(unsigned int item_id = col_begin; item_id < col_end; item_id++) 01553 { 01554 sum += (x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id]); 01555 } 01556 01557 result[row_id * inc_result + start_result] = sum; 01558 } 01559 } 01560 01561 01562 01571 template<class ScalarType, unsigned int ALIGNMENT> 01572 void prod_impl(const viennacl::hyb_matrix<ScalarType, ALIGNMENT> & mat, 01573 const viennacl::vector_base<ScalarType> & vec, 01574 viennacl::vector_base<ScalarType> & result) 01575 { 01576 hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 01577 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()), 01578 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()), 01579 detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()), 01580 detail::cuda_arg<ScalarType>(mat.handle5().cuda_handle()), 01581 detail::cuda_arg<ScalarType>(vec), 01582 static_cast<unsigned int>(vec.start()), 01583 static_cast<unsigned int>(vec.stride()), 01584 detail::cuda_arg<ScalarType>(result), 01585 static_cast<unsigned int>(result.start()), 01586 static_cast<unsigned int>(result.stride()), 01587 static_cast<unsigned int>(mat.size1()), 01588 static_cast<unsigned int>(mat.internal_size1()), 01589 static_cast<unsigned int>(mat.ell_nnz()), 01590 static_cast<unsigned int>(mat.internal_ellnnz()) 01591 ); 01592 VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel"); 01593 } 01594 01595 01596 01597 template <typename DMatIndexT, typename ResultIndexT, typename NumericT> 01598 __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords, 01599 const NumericT * ell_elements, 01600 const unsigned int * csr_rows, 01601 const unsigned int * csr_cols, 01602 const NumericT * csr_elements, 01603 unsigned int row_num, 01604 unsigned int internal_row_num, 01605 unsigned int items_per_row, 01606 unsigned int aligned_items_per_row, 01607 const NumericT * d_mat, 01608 unsigned int d_mat_row_start, 01609 unsigned int d_mat_col_start, 01610 unsigned int d_mat_row_inc, 01611 unsigned int d_mat_col_inc, 01612 unsigned int d_mat_row_size, 01613 unsigned int d_mat_col_size, 01614 unsigned int d_mat_internal_rows, 01615 unsigned int d_mat_internal_cols, 01616 NumericT * result, 01617 unsigned int result_row_start, 01618 unsigned int result_col_start, 01619 unsigned int result_row_inc, 01620 unsigned int result_col_inc, 01621 unsigned int result_row_size, 01622 unsigned int result_col_size, 01623 unsigned int result_internal_rows, 01624 unsigned int result_internal_cols) 01625 { 01626 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; 01627 unsigned int glb_sz = gridDim.x * blockDim.x; 01628 01629 for(unsigned int result_col = 0; result_col < result_col_size; ++result_col) 01630 { 01631 for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz) 01632 { 01633 NumericT sum = 0; 01634 01635 unsigned int offset = row_id; 01636 for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) 01637 { 01638 NumericT val = ell_elements[offset]; 01639 01640 if(val != 0.0f) 01641 { 01642 sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col, 01643 d_mat_row_start, d_mat_row_inc, 01644 d_mat_col_start, d_mat_col_inc, 01645 d_mat_internal_rows, d_mat_internal_cols)] * val; 01646 } 01647 } 01648 01649 unsigned int col_begin = csr_rows[row_id]; 01650 unsigned int col_end = csr_rows[row_id + 1]; 01651 01652 for(unsigned int item_id = col_begin; item_id < col_end; item_id++) 01653 { 01654 sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col, 01655 d_mat_row_start, d_mat_row_inc, 01656 d_mat_col_start, d_mat_col_inc, 01657 d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id]; 01658 } 01659 01660 result[ResultIndexT::apply(row_id, result_col, 01661 result_row_start, result_row_inc, 01662 result_col_start, result_col_inc, 01663 result_internal_rows, result_internal_cols)] = sum; 01664 } 01665 } 01666 } 01667 01668 01669 01678 template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2> 01679 void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat, 01680 const viennacl::matrix_base<NumericT, F1> & d_mat, 01681 viennacl::matrix_base<NumericT, F2> & result) 01682 { 01683 hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>( 01684 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 01685 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()), 01686 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()), 01687 detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()), 01688 detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()), 01689 static_cast<unsigned int>(mat.size1()), 01690 static_cast<unsigned int>(mat.internal_size1()), 01691 static_cast<unsigned int>(mat.ell_nnz()), 01692 static_cast<unsigned int>(mat.internal_ellnnz()), 01693 01694 detail::cuda_arg<NumericT>(d_mat), 01695 static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)), 01696 static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)), 01697 static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)), 01698 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)), 01699 01700 detail::cuda_arg<NumericT>(result), 01701 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 01702 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 01703 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 01704 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 01705 ); 01706 VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel"); 01707 } 01708 01709 01710 01711 template <typename DMatIndexT, typename ResultIndexT, typename NumericT> 01712 __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords, 01713 const NumericT * ell_elements, 01714 const unsigned int * csr_rows, 01715 const unsigned int * csr_cols, 01716 const NumericT * csr_elements, 01717 unsigned int row_num, 01718 unsigned int internal_row_num, 01719 unsigned int items_per_row, 01720 unsigned int aligned_items_per_row, 01721 const NumericT * d_mat, 01722 unsigned int d_mat_row_start, 01723 unsigned int d_mat_col_start, 01724 unsigned int d_mat_row_inc, 01725 unsigned int d_mat_col_inc, 01726 unsigned int d_mat_row_size, 01727 unsigned int d_mat_col_size, 01728 unsigned int d_mat_internal_rows, 01729 unsigned int d_mat_internal_cols, 01730 NumericT * result, 01731 unsigned int result_row_start, 01732 unsigned int result_col_start, 01733 unsigned int result_row_inc, 01734 unsigned int result_col_inc, 01735 unsigned int result_row_size, 01736 unsigned int result_col_size, 01737 unsigned int result_internal_rows, 01738 unsigned int result_internal_cols) 01739 { 01740 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x; 01741 unsigned int glb_sz = gridDim.x * blockDim.x; 01742 01743 for(unsigned int result_col = 0; result_col < result_col_size; ++result_col) 01744 { 01745 for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz) 01746 { 01747 NumericT sum = 0; 01748 01749 unsigned int offset = row_id; 01750 for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) 01751 { 01752 NumericT val = ell_elements[offset]; 01753 01754 if(val != 0.0f) 01755 { 01756 sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset], 01757 d_mat_row_start, d_mat_row_inc, 01758 d_mat_col_start, d_mat_col_inc, 01759 d_mat_internal_rows, d_mat_internal_cols)] * val; 01760 } 01761 } 01762 01763 unsigned int col_begin = csr_rows[row_id]; 01764 unsigned int col_end = csr_rows[row_id + 1]; 01765 01766 for(unsigned int item_id = col_begin; item_id < col_end; item_id++) 01767 { 01768 sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id], 01769 d_mat_row_start, d_mat_row_inc, 01770 d_mat_col_start, d_mat_col_inc, 01771 d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id]; 01772 } 01773 01774 result[ResultIndexT::apply(row_id, result_col, 01775 result_row_start, result_row_inc, 01776 result_col_start, result_col_inc, 01777 result_internal_rows, result_internal_cols)] = sum; 01778 } 01779 } 01780 } 01781 01782 01783 01792 template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2> 01793 void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat, 01794 const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>, 01795 const viennacl::matrix_base<NumericT, F1>, 01796 viennacl::op_trans > & d_mat, 01797 viennacl::matrix_base<NumericT, F2> & result) 01798 { 01799 hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>( 01800 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()), 01801 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()), 01802 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()), 01803 detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()), 01804 detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()), 01805 static_cast<unsigned int>(mat.size1()), 01806 static_cast<unsigned int>(mat.internal_size1()), 01807 static_cast<unsigned int>(mat.ell_nnz()), 01808 static_cast<unsigned int>(mat.internal_ellnnz()), 01809 01810 detail::cuda_arg<NumericT>(d_mat.lhs()), 01811 static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())), 01812 static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())), 01813 static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())), 01814 static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())), 01815 01816 detail::cuda_arg<NumericT>(result), 01817 static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)), 01818 static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)), 01819 static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)), 01820 static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result)) 01821 ); 01822 VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel"); 01823 } 01824 01825 01826 } // namespace cuda 01827 } //namespace linalg 01828 } //namespace viennacl 01829 01830 01831 #endif