ViennaCL - The Vienna Computing Library
1.5.2
|
00001 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_ 00002 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00025 #include "viennacl/forwards.h" 00026 #include "viennacl/scalar.hpp" 00027 #include "viennacl/vector.hpp" 00028 #include "viennacl/vector_proxy.hpp" 00029 #include "viennacl/tools/tools.hpp" 00030 #include "viennacl/meta/enable_if.hpp" 00031 #include "viennacl/meta/predicate.hpp" 00032 #include "viennacl/meta/result_of.hpp" 00033 #include "viennacl/traits/size.hpp" 00034 #include "viennacl/traits/start.hpp" 00035 #include "viennacl/traits/handle.hpp" 00036 #include "viennacl/traits/stride.hpp" 00037 00038 #include "viennacl/linalg/cuda/common.hpp" 00039 00040 #include "viennacl/linalg/cuda/vector_operations.hpp" 00041 #include "viennacl/linalg/cuda/matrix_operations_row.hpp" 00042 #include "viennacl/linalg/cuda/matrix_operations_col.hpp" 00043 #include "viennacl/linalg/cuda/matrix_operations_prod.hpp" 00044 #include "viennacl/linalg/cuda/matrix_operations_prod.hpp" 00045 00046 namespace viennacl 00047 { 00048 namespace linalg 00049 { 00050 namespace cuda 00051 { 00052 // 00053 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! 
00054 // 00055 00056 template <typename NumericT, typename F, 00057 typename ScalarType1> 00058 void am(matrix_base<NumericT, F> & mat1, 00059 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) 00060 { 00061 typedef NumericT value_type; 00062 00063 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00064 00065 value_type temporary_alpha = 0; 00066 if (viennacl::is_cpu_scalar<ScalarType1>::value) 00067 temporary_alpha = alpha; 00068 00069 if (viennacl::is_row_major<F>::value) 00070 { 00071 am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 00072 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)), 00073 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 00074 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 00075 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 00076 00077 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00078 options_alpha, 00079 detail::cuda_arg<value_type>(mat2), 00080 static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)), 00081 static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)), 00082 static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)) 00083 ); 00084 VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel"); 00085 } 00086 else 00087 { 00088 am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 00089 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned 
int>(viennacl::traits::start2(mat1)), 00090 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 00091 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 00092 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 00093 00094 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00095 options_alpha, 00096 detail::cuda_arg<value_type>(mat2), 00097 static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)), 00098 static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)), 00099 static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)) 00100 ); 00101 VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel"); 00102 } 00103 } 00104 00105 00106 template <typename NumericT, typename F, 00107 typename ScalarType1, typename ScalarType2> 00108 void ambm(matrix_base<NumericT, F> & mat1, 00109 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 00110 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) 00111 { 00112 typedef NumericT value_type; 00113 00114 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00115 00116 value_type temporary_alpha = 0; 00117 if (viennacl::is_cpu_scalar<ScalarType1>::value) 00118 temporary_alpha = alpha; 00119 00120 00121 unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); 00122 00123 value_type temporary_beta = 0; 00124 if (viennacl::is_cpu_scalar<ScalarType2>::value) 00125 temporary_beta = beta; 
00126 00127 00128 if (viennacl::is_row_major<F>::value) 00129 { 00130 ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 00131 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)), 00132 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 00133 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 00134 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 00135 00136 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00137 options_alpha, 00138 detail::cuda_arg<value_type>(mat2), 00139 static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)), 00140 static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)), 00141 static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)), 00142 00143 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), 00144 options_beta, 00145 detail::cuda_arg<value_type>(mat3), 00146 static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)), 00147 static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)), 00148 static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3)) 00149 ); 00150 VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel"); 00151 } 00152 else 00153 { 00154 ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 00155 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned 
int>(viennacl::traits::start2(mat1)), 00156 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 00157 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 00158 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 00159 00160 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00161 options_alpha, 00162 detail::cuda_arg<value_type>(mat2), 00163 static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)), 00164 static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)), 00165 static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)), 00166 00167 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), 00168 options_beta, 00169 detail::cuda_arg<value_type>(mat3), 00170 static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)), 00171 static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)), 00172 static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3)) 00173 ); 00174 VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel"); 00175 } 00176 00177 } 00178 00179 00180 template <typename NumericT, typename F, 00181 typename ScalarType1, typename ScalarType2> 00182 void ambm_m(matrix_base<NumericT, F> & mat1, 00183 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 00184 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool 
reciprocal_beta, bool flip_sign_beta) 00185 { 00186 typedef NumericT value_type; 00187 00188 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00189 00190 value_type temporary_alpha = 0; 00191 if (viennacl::is_cpu_scalar<ScalarType1>::value) 00192 temporary_alpha = alpha; 00193 00194 00195 unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); 00196 00197 value_type temporary_beta = 0; 00198 if (viennacl::is_cpu_scalar<ScalarType2>::value) 00199 temporary_beta = beta; 00200 00201 00202 if (viennacl::is_row_major<F>::value) 00203 { 00204 ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 00205 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)), 00206 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 00207 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 00208 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 00209 00210 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00211 options_alpha, 00212 detail::cuda_arg<value_type>(mat2), 00213 static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)), 00214 static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)), 00215 static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)), 00216 00217 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), 00218 options_beta, 00219 detail::cuda_arg<value_type>(mat3), 00220 static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned 
int>(viennacl::traits::start2(mat3)), 00221 static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)), 00222 static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat3)) 00223 ); 00224 VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel"); 00225 } 00226 else 00227 { 00228 ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 00229 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)), 00230 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 00231 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 00232 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 00233 00234 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00235 options_alpha, 00236 detail::cuda_arg<value_type>(mat2), 00237 static_cast<unsigned int>(viennacl::traits::start1(mat2)), static_cast<unsigned int>(viennacl::traits::start2(mat2)), 00238 static_cast<unsigned int>(viennacl::traits::stride1(mat2)), static_cast<unsigned int>(viennacl::traits::stride2(mat2)), 00239 static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)), 00240 00241 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), 00242 options_beta, 00243 detail::cuda_arg<value_type>(mat3), 00244 static_cast<unsigned int>(viennacl::traits::start1(mat3)), static_cast<unsigned int>(viennacl::traits::start2(mat3)), 00245 static_cast<unsigned int>(viennacl::traits::stride1(mat3)), static_cast<unsigned int>(viennacl::traits::stride2(mat3)), 00246 static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)), 
static_cast<unsigned int>(viennacl::traits::internal_size2(mat3)) 00247 ); 00248 VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel"); 00249 } 00250 00251 } 00252 00253 00254 00255 00256 template <typename NumericT, typename F> 00257 void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false) 00258 { 00259 typedef NumericT value_type; 00260 value_type alpha = s; 00261 00262 unsigned int s1 = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat); 00263 unsigned int s2 = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat); 00264 00265 if (viennacl::is_row_major<F>::value) 00266 { 00267 00268 matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat), 00269 static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), 00270 static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), 00271 s1, s2, 00272 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), 00273 alpha); 00274 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel"); 00275 } 00276 else 00277 { 00278 matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat), 00279 static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), 00280 static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), 00281 s1, s2, 00282 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), 00283 alpha); 00284 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel"); 00285 } 00286 } 00287 00288 template <typename NumericT, typename F> 00289 void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s) 00290 { 00291 typedef NumericT value_type; 00292 
value_type alpha = s; 00293 00294 if (viennacl::is_row_major<F>::value) 00295 { 00296 matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat), 00297 static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), 00298 static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), 00299 static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)), 00300 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), 00301 alpha); 00302 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel"); 00303 } 00304 else 00305 { 00306 matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat), 00307 static_cast<unsigned int>(viennacl::traits::start1(mat)), static_cast<unsigned int>(viennacl::traits::start2(mat)), 00308 static_cast<unsigned int>(viennacl::traits::stride1(mat)), static_cast<unsigned int>(viennacl::traits::stride2(mat)), 00309 static_cast<unsigned int>(viennacl::traits::size1(mat)), static_cast<unsigned int>(viennacl::traits::size2(mat)), 00310 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)), 00311 alpha); 00312 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel"); 00313 } 00314 } 00315 00316 00317 template <typename NumericT, typename F> 00318 void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat) 00319 { 00320 typedef NumericT value_type; 00321 00322 // Step 1: assign zero matrix: 00323 matrix_assign(mat, NumericT(0)); 00324 00325 // Step 2: Assign diagonal: 00326 unsigned int options_alpha = 0; 00327 00328 vcl_size_t mat_start = 0; 00329 vcl_size_t mat_stride = 0; 00330 vcl_size_t mat_size = viennacl::traits::size(vec); 00331 if 
(viennacl::is_row_major<F>::value) 00332 { 00333 vcl_size_t first_row_index = 0; 00334 vcl_size_t first_col_index = 0; 00335 if (k < 0) 00336 first_row_index = vcl_size_t(-k); 00337 else 00338 first_col_index = vcl_size_t(k); 00339 mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) 00340 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat); 00341 mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat); 00342 } 00343 else 00344 { 00345 vcl_size_t first_row_index = 0; 00346 vcl_size_t first_col_index = 0; 00347 if (k < 0) 00348 first_row_index = vcl_size_t(-k); 00349 else 00350 first_col_index = vcl_size_t(k); 00351 mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat) 00352 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat); 00353 mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat); 00354 } 00355 00356 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat), 00357 static_cast<unsigned int>(mat_start), 00358 static_cast<unsigned int>(mat_stride), 00359 static_cast<unsigned int>(mat_size), 00360 00361 detail::cuda_arg<value_type>(NumericT(1)), 00362 options_alpha, 00363 detail::cuda_arg<value_type>(vec), 00364 static_cast<unsigned int>(viennacl::traits::start(vec)), 00365 static_cast<unsigned int>(viennacl::traits::stride(vec)) ); 00366 VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel"); 00367 } 00368 00369 template <typename NumericT, typename F> 00370 void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec) 00371 { 00372 typedef NumericT value_type; 00373 00374 unsigned int options_alpha = 0; 00375 00376 vcl_size_t mat_start = 0; 00377 vcl_size_t mat_stride = 0; 00378 if 
(viennacl::is_row_major<F>::value) 00379 { 00380 vcl_size_t first_row_index = 0; 00381 vcl_size_t first_col_index = 0; 00382 if (k < 0) 00383 first_row_index = vcl_size_t(-k); 00384 else 00385 first_col_index = vcl_size_t(k); 00386 mat_start = (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) 00387 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat); 00388 mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat); 00389 } 00390 else 00391 { 00392 vcl_size_t first_row_index = 0; 00393 vcl_size_t first_col_index = 0; 00394 if (k < 0) 00395 first_row_index = vcl_size_t(-k); 00396 else 00397 first_col_index = vcl_size_t(k); 00398 mat_start = viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat) 00399 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat); 00400 mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat); 00401 } 00402 00403 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec), 00404 static_cast<unsigned int>(viennacl::traits::start(vec)), 00405 static_cast<unsigned int>(viennacl::traits::stride(vec)), 00406 static_cast<unsigned int>(viennacl::traits::size(vec)), 00407 00408 detail::cuda_arg<value_type>(NumericT(1)), 00409 options_alpha, 00410 detail::cuda_arg<value_type>(mat), 00411 static_cast<unsigned int>(mat_start), 00412 static_cast<unsigned int>(mat_stride)); 00413 VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel"); 00414 } 00415 00416 template <typename NumericT, typename F> 00417 void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec) 00418 { 00419 typedef NumericT value_type; 00420 00421 unsigned int options_alpha = 0; 00422 00423 vcl_size_t mat_start = 0; 00424 vcl_size_t mat_stride = 0; 
00425 if (viennacl::is_row_major<F>::value) 00426 { 00427 mat_start = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat); 00428 mat_stride = viennacl::traits::stride2(mat); 00429 } 00430 else 00431 { 00432 mat_start = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat); 00433 mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat); 00434 } 00435 00436 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec), 00437 static_cast<unsigned int>(viennacl::traits::start(vec)), 00438 static_cast<unsigned int>(viennacl::traits::stride(vec)), 00439 static_cast<unsigned int>(viennacl::traits::size(vec)), 00440 00441 detail::cuda_arg<value_type>(NumericT(1)), 00442 options_alpha, 00443 detail::cuda_arg<value_type>(mat), 00444 static_cast<unsigned int>(mat_start), 00445 static_cast<unsigned int>(mat_stride)); 00446 VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel"); 00447 } 00448 00449 template <typename NumericT, typename F> 00450 void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec) 00451 { 00452 typedef NumericT value_type; 00453 00454 unsigned int options_alpha = 0; 00455 00456 vcl_size_t mat_start = 0; 00457 vcl_size_t mat_stride = 0; 00458 if (viennacl::is_row_major<F>::value) 00459 { 00460 mat_start = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat); 00461 mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat); 00462 } 00463 else 00464 { 00465 mat_start = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat); 00466 mat_stride = viennacl::traits::stride2(mat); 00467 } 00468 00469 av_kernel<<<128, 
128>>>(detail::cuda_arg<value_type>(vec), 00470 static_cast<unsigned int>(viennacl::traits::start(vec)), 00471 static_cast<unsigned int>(viennacl::traits::stride(vec)), 00472 static_cast<unsigned int>(viennacl::traits::size(vec)), 00473 00474 detail::cuda_arg<value_type>(NumericT(1)), 00475 options_alpha, 00476 detail::cuda_arg<value_type>(mat), 00477 static_cast<unsigned int>(mat_start), 00478 static_cast<unsigned int>(mat_stride)); 00479 VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel"); 00480 } 00481 00482 00483 // 00485 // 00486 00487 00488 template <typename T, typename F, typename OP> 00489 void element_op(matrix_base<T, F> & A, 00490 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy) 00491 { 00492 typedef T value_type; 00493 00494 unsigned int op_type = 2; //0: product, 1: division, 2: power 00495 if (viennacl::is_division<OP>::value) 00496 op_type = 1; 00497 else if (viennacl::is_product<OP>::value) 00498 op_type = 0; 00499 00500 if (viennacl::is_row_major<F>::value) 00501 { 00502 element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00503 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00504 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00505 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00506 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00507 00508 detail::cuda_arg<value_type>(proxy.lhs()), 00509 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00510 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00511 static_cast<unsigned 
int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), 00512 00513 detail::cuda_arg<value_type>(proxy.rhs()), 00514 static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), 00515 static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), 00516 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), 00517 00518 op_type 00519 ); 00520 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel"); 00521 } 00522 else 00523 { 00524 element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00525 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00526 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00527 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00528 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00529 00530 detail::cuda_arg<value_type>(proxy.lhs()), 00531 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00532 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00533 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), 00534 00535 detail::cuda_arg<value_type>(proxy.rhs()), 00536 static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), 00537 
static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), 00538 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), 00539 00540 op_type 00541 ); 00542 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel"); 00543 } 00544 } 00545 00546 template <typename F, typename OP> 00547 void element_op(matrix_base<float, F> & A, 00548 matrix_expression<const matrix_base<float, F>, const matrix_base<float, F>, op_element_binary<OP> > const & proxy) 00549 { 00550 typedef float value_type; 00551 00552 unsigned int op_type = 2; //0: product, 1: division, 2: power 00553 if (viennacl::is_division<OP>::value) 00554 op_type = 1; 00555 else if (viennacl::is_product<OP>::value) 00556 op_type = 0; 00557 00558 if (viennacl::is_row_major<F>::value) 00559 { 00560 element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00561 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00562 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00563 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00564 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00565 00566 detail::cuda_arg<value_type>(proxy.lhs()), 00567 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00568 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00569 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), 00570 00571 
detail::cuda_arg<value_type>(proxy.rhs()), 00572 static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), 00573 static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), 00574 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), 00575 00576 op_type 00577 ); 00578 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel"); 00579 } 00580 else 00581 { 00582 element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00583 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00584 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00585 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00586 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00587 00588 detail::cuda_arg<value_type>(proxy.lhs()), 00589 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00590 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00591 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), 00592 00593 detail::cuda_arg<value_type>(proxy.rhs()), 00594 static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), 00595 static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), 00596 
static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), 00597 00598 op_type 00599 ); 00600 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel"); 00601 } 00602 } 00603 00604 template <typename F, typename OP> 00605 void element_op(matrix_base<double, F> & A, 00606 matrix_expression<const matrix_base<double, F>, const matrix_base<double, F>, op_element_binary<OP> > const & proxy) 00607 { 00608 typedef double value_type; 00609 00610 unsigned int op_type = 2; //0: product, 1: division, 2: power 00611 if (viennacl::is_division<OP>::value) 00612 op_type = 1; 00613 else if (viennacl::is_product<OP>::value) 00614 op_type = 0; 00615 00616 if (viennacl::is_row_major<F>::value) 00617 { 00618 element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00619 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00620 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00621 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00622 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00623 00624 detail::cuda_arg<value_type>(proxy.lhs()), 00625 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00626 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00627 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), 00628 00629 detail::cuda_arg<value_type>(proxy.rhs()), 00630 static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned 
int>(viennacl::traits::start2(proxy.rhs())), 00631 static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), 00632 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), 00633 00634 op_type 00635 ); 00636 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel"); 00637 } 00638 else 00639 { 00640 element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00641 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00642 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00643 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00644 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00645 00646 detail::cuda_arg<value_type>(proxy.lhs()), 00647 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00648 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00649 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())), 00650 00651 detail::cuda_arg<value_type>(proxy.rhs()), 00652 static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())), 00653 static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())), 00654 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())), 
00655 00656 op_type 00657 ); 00658 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel"); 00659 } 00660 } 00661 00662 // 00664 // 00665 00666 // Note: Due to CUDA vs C-preprocessor interference (concatenation seems to be broken in at least CUDA 4.2), 00667 // we could not find a more 'automatic' way of generating the overloads below... 00668 00669 // abs 00670 template <typename T, typename F> 00671 void element_op(matrix_base<T, F> & A, 00672 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_abs> > const & proxy) 00673 { 00674 typedef T value_type; 00675 00676 if (viennacl::is_row_major<F>::value) 00677 { 00678 matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00679 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00680 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00681 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00682 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00683 00684 detail::cuda_arg<value_type>(proxy.lhs()), 00685 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00686 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00687 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00688 ); 00689 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel"); 00690 } 00691 else 00692 { 00693 matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00694 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned 
int>(viennacl::traits::start2(A)), 00695 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00696 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00697 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00698 00699 detail::cuda_arg<value_type>(proxy.lhs()), 00700 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00701 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00702 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00703 ); 00704 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel"); 00705 } 00706 } 00707 00708 00709 // acos 00710 template <typename T, typename F> 00711 void element_op(matrix_base<T, F> & A, 00712 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_acos> > const & proxy) 00713 { 00714 typedef T value_type; 00715 00716 if (viennacl::is_row_major<F>::value) 00717 { 00718 matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00719 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00720 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00721 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00722 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00723 00724 detail::cuda_arg<value_type>(proxy.lhs()), 00725 static_cast<unsigned 
int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00726 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00727 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00728 ); 00729 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel"); 00730 } 00731 else 00732 { 00733 matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00734 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00735 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00736 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00737 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00738 00739 detail::cuda_arg<value_type>(proxy.lhs()), 00740 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00741 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00742 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00743 ); 00744 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel"); 00745 } 00746 } 00747 00748 00749 // asin 00750 template <typename T, typename F> 00751 void element_op(matrix_base<T, F> & A, 00752 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_asin> > const & proxy) 00753 { 00754 typedef T value_type; 00755 00756 if (viennacl::is_row_major<F>::value) 00757 { 00758 
matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00759 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00760 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00761 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00762 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00763 00764 detail::cuda_arg<value_type>(proxy.lhs()), 00765 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00766 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00767 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00768 ); 00769 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel"); 00770 } 00771 else 00772 { 00773 matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00774 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00775 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00776 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00777 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00778 00779 detail::cuda_arg<value_type>(proxy.lhs()), 00780 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00781 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), 
static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00782 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00783 ); 00784 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel"); 00785 } 00786 } 00787 00788 00789 // atan 00790 template <typename T, typename F> 00791 void element_op(matrix_base<T, F> & A, 00792 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_atan> > const & proxy) 00793 { 00794 typedef T value_type; 00795 00796 if (viennacl::is_row_major<F>::value) 00797 { 00798 matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00799 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00800 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00801 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00802 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00803 00804 detail::cuda_arg<value_type>(proxy.lhs()), 00805 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00806 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00807 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00808 ); 00809 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel"); 00810 } 00811 else 00812 { 00813 matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00814 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned 
int>(viennacl::traits::start2(A)), 00815 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00816 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00817 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00818 00819 detail::cuda_arg<value_type>(proxy.lhs()), 00820 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00821 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00822 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00823 ); 00824 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel"); 00825 } 00826 } 00827 00828 00829 // ceil 00830 template <typename T, typename F> 00831 void element_op(matrix_base<T, F> & A, 00832 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_ceil> > const & proxy) 00833 { 00834 typedef T value_type; 00835 00836 if (viennacl::is_row_major<F>::value) 00837 { 00838 matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00839 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00840 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00841 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00842 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00843 00844 detail::cuda_arg<value_type>(proxy.lhs()), 00845 static_cast<unsigned 
int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00846 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00847 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00848 ); 00849 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel"); 00850 } 00851 else 00852 { 00853 matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00854 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00855 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00856 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00857 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00858 00859 detail::cuda_arg<value_type>(proxy.lhs()), 00860 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00861 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00862 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00863 ); 00864 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel"); 00865 } 00866 } 00867 00868 00869 // cos 00870 template <typename T, typename F> 00871 void element_op(matrix_base<T, F> & A, 00872 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_cos> > const & proxy) 00873 { 00874 typedef T value_type; 00875 00876 if (viennacl::is_row_major<F>::value) 00877 { 00878 
matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00879 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00880 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00881 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00882 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00883 00884 detail::cuda_arg<value_type>(proxy.lhs()), 00885 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00886 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00887 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00888 ); 00889 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel"); 00890 } 00891 else 00892 { 00893 matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00894 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00895 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00896 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00897 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00898 00899 detail::cuda_arg<value_type>(proxy.lhs()), 00900 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00901 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), 
static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00902 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00903 ); 00904 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel"); 00905 } 00906 } 00907 00908 00909 // cosh 00910 template <typename T, typename F> 00911 void element_op(matrix_base<T, F> & A, 00912 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_cosh> > const & proxy) 00913 { 00914 typedef T value_type; 00915 00916 if (viennacl::is_row_major<F>::value) 00917 { 00918 matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00919 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00920 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00921 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00922 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00923 00924 detail::cuda_arg<value_type>(proxy.lhs()), 00925 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00926 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00927 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00928 ); 00929 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel"); 00930 } 00931 else 00932 { 00933 matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00934 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned 
int>(viennacl::traits::start2(A)), 00935 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00936 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00937 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00938 00939 detail::cuda_arg<value_type>(proxy.lhs()), 00940 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00941 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00942 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00943 ); 00944 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel"); 00945 } 00946 } 00947 00948 00949 // exp 00950 template <typename T, typename F> 00951 void element_op(matrix_base<T, F> & A, 00952 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_exp> > const & proxy) 00953 { 00954 typedef T value_type; 00955 00956 if (viennacl::is_row_major<F>::value) 00957 { 00958 matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00959 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00960 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00961 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00962 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00963 00964 detail::cuda_arg<value_type>(proxy.lhs()), 00965 static_cast<unsigned 
int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00966 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00967 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00968 ); 00969 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel"); 00970 } 00971 else 00972 { 00973 matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00974 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 00975 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 00976 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 00977 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 00978 00979 detail::cuda_arg<value_type>(proxy.lhs()), 00980 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 00981 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 00982 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 00983 ); 00984 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel"); 00985 } 00986 } 00987 00988 00989 // fabs 00990 template <typename T, typename F> 00991 void element_op(matrix_base<T, F> & A, 00992 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_fabs> > const & proxy) 00993 { 00994 typedef T value_type; 00995 00996 if (viennacl::is_row_major<F>::value) 00997 { 00998 
matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 00999 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01000 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01001 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01002 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01003 01004 detail::cuda_arg<value_type>(proxy.lhs()), 01005 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 01006 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 01007 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 01008 ); 01009 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel"); 01010 } 01011 else 01012 { 01013 matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 01014 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01015 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01016 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01017 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01018 01019 detail::cuda_arg<value_type>(proxy.lhs()), 01020 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 01021 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), 
static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 01022 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 01023 ); 01024 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel"); 01025 } 01026 } 01027 01028 01029 // floor 01030 template <typename T, typename F> 01031 void element_op(matrix_base<T, F> & A, 01032 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_floor> > const & proxy) 01033 { 01034 typedef T value_type; 01035 01036 if (viennacl::is_row_major<F>::value) 01037 { 01038 matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 01039 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01040 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01041 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01042 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01043 01044 detail::cuda_arg<value_type>(proxy.lhs()), 01045 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 01046 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 01047 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 01048 ); 01049 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel"); 01050 } 01051 else 01052 { 01053 matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 01054 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned 
int>(viennacl::traits::start2(A)), 01055 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01056 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01057 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01058 01059 detail::cuda_arg<value_type>(proxy.lhs()), 01060 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 01061 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 01062 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 01063 ); 01064 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel"); 01065 } 01066 } 01067 01068 01069 // log 01070 template <typename T, typename F> 01071 void element_op(matrix_base<T, F> & A, 01072 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_log> > const & proxy) 01073 { 01074 typedef T value_type; 01075 01076 if (viennacl::is_row_major<F>::value) 01077 { 01078 matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 01079 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01080 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01081 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01082 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01083 01084 detail::cuda_arg<value_type>(proxy.lhs()), 01085 static_cast<unsigned 
int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 01086 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 01087 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 01088 ); 01089 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel"); 01090 } 01091 else 01092 { 01093 matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A), 01094 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01095 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01096 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01097 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01098 01099 detail::cuda_arg<value_type>(proxy.lhs()), 01100 static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())), 01101 static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())), 01102 static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())) 01103 ); 01104 VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel"); 01105 } 01106 }


      /** @brief Dispatches an element-wise op_log10 expression to the CUDA kernel matching the storage layout F.
      *
      * Launches matrix_row_element_log10_kernel if F is row-major and matrix_col_element_log10_kernel otherwise,
      * each with a fixed <<<128, 128>>> configuration, and runs VIENNACL_CUDA_LAST_ERROR_CHECK afterwards.
      *
      * @param A      Matrix passed to the kernel first, together with its start, stride, size and internal size
      * @param proxy  Expression object; only proxy.lhs() is read (its start, stride and internal size are forwarded)
      */
      template <typename T, typename F>
      void element_op(matrix_base<T, F> & A,
                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_log10> > const & proxy)
      {
        matrix_base<T, F> const & B = proxy.lhs();   // operand of the unary expression

        // layout of A
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the operand (no logical sizes are passed for it)
        unsigned int const B_start1         = static_cast<unsigned int>(viennacl::traits::start1(B));
        unsigned int const B_start2         = static_cast<unsigned int>(viennacl::traits::start2(B));
        unsigned int const B_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(B));
        unsigned int const B_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(B));
        unsigned int const B_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(B));
        unsigned int const B_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(B));

        if (viennacl::is_row_major<F>::value)
        {
          matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
        }
        else
        {
          matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
        }
      }


      /** @brief Dispatches an element-wise op_sin expression to the CUDA kernel matching the storage layout F.
      *
      * Launches matrix_row_element_sin_kernel (row-major F) or matrix_col_element_sin_kernel (otherwise)
      * with <<<128, 128>>>, followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
      *
      * @param A      Matrix passed to the kernel first, together with its start, stride, size and internal size
      * @param proxy  Expression object; only proxy.lhs() is read
      */
      template <typename T, typename F>
      void element_op(matrix_base<T, F> & A,
                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sin> > const & proxy)
      {
        matrix_base<T, F> const & B = proxy.lhs();   // operand of the unary expression

        // layout of A
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the operand
        unsigned int const B_start1         = static_cast<unsigned int>(viennacl::traits::start1(B));
        unsigned int const B_start2         = static_cast<unsigned int>(viennacl::traits::start2(B));
        unsigned int const B_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(B));
        unsigned int const B_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(B));
        unsigned int const B_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(B));
        unsigned int const B_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(B));

        if (viennacl::is_row_major<F>::value)
        {
          matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
        }
        else
        {
          matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
        }
      }


      /** @brief Dispatches an element-wise op_sinh expression to the CUDA kernel matching the storage layout F.
      *
      * Launches matrix_row_element_sinh_kernel (row-major F) or matrix_col_element_sinh_kernel (otherwise)
      * with <<<128, 128>>>, followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
      *
      * @param A      Matrix passed to the kernel first, together with its start, stride, size and internal size
      * @param proxy  Expression object; only proxy.lhs() is read
      */
      template <typename T, typename F>
      void element_op(matrix_base<T, F> & A,
                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sinh> > const & proxy)
      {
        matrix_base<T, F> const & B = proxy.lhs();   // operand of the unary expression

        // layout of A
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the operand
        unsigned int const B_start1         = static_cast<unsigned int>(viennacl::traits::start1(B));
        unsigned int const B_start2         = static_cast<unsigned int>(viennacl::traits::start2(B));
        unsigned int const B_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(B));
        unsigned int const B_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(B));
        unsigned int const B_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(B));
        unsigned int const B_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(B));

        if (viennacl::is_row_major<F>::value)
        {
          matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
        }
        else
        {
          matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
        }
      }


      /** @brief Dispatches an element-wise op_sqrt expression to the CUDA kernel matching the storage layout F.
      *
      * Launches matrix_row_element_sqrt_kernel (row-major F) or matrix_col_element_sqrt_kernel (otherwise)
      * with <<<128, 128>>>, followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
      *
      * @param A      Matrix passed to the kernel first, together with its start, stride, size and internal size
      * @param proxy  Expression object; only proxy.lhs() is read
      */
      template <typename T, typename F>
      void element_op(matrix_base<T, F> & A,
                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sqrt> > const & proxy)
      {
        matrix_base<T, F> const & B = proxy.lhs();   // operand of the unary expression

        // layout of A
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the operand
        unsigned int const B_start1         = static_cast<unsigned int>(viennacl::traits::start1(B));
        unsigned int const B_start2         = static_cast<unsigned int>(viennacl::traits::start2(B));
        unsigned int const B_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(B));
        unsigned int const B_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(B));
        unsigned int const B_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(B));
        unsigned int const B_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(B));

        if (viennacl::is_row_major<F>::value)
        {
          matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sqrt_kernel");
        }
        else
        {
          matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sqrt_kernel");
        }
      }


      /** @brief Dispatches an element-wise op_tan expression to the CUDA kernel matching the storage layout F.
      *
      * Launches matrix_row_element_tan_kernel (row-major F) or matrix_col_element_tan_kernel (otherwise)
      * with <<<128, 128>>>, followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
      *
      * @param A      Matrix passed to the kernel first, together with its start, stride, size and internal size
      * @param proxy  Expression object; only proxy.lhs() is read
      */
      template <typename T, typename F>
      void element_op(matrix_base<T, F> & A,
                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_tan> > const & proxy)
      {
        matrix_base<T, F> const & B = proxy.lhs();   // operand of the unary expression

        // layout of A
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the operand
        unsigned int const B_start1         = static_cast<unsigned int>(viennacl::traits::start1(B));
        unsigned int const B_start2         = static_cast<unsigned int>(viennacl::traits::start2(B));
        unsigned int const B_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(B));
        unsigned int const B_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(B));
        unsigned int const B_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(B));
        unsigned int const B_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(B));

        if (viennacl::is_row_major<F>::value)
        {
          matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tan_kernel");
        }
        else
        {
          matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tan_kernel");
        }
      }


      /** @brief Dispatches an element-wise op_tanh expression to the CUDA kernel matching the storage layout F.
      *
      * Launches matrix_row_element_tanh_kernel (row-major F) or matrix_col_element_tanh_kernel (otherwise)
      * with <<<128, 128>>>, followed by VIENNACL_CUDA_LAST_ERROR_CHECK.
      *
      * @param A      Matrix passed to the kernel first, together with its start, stride, size and internal size
      * @param proxy  Expression object; only proxy.lhs() is read
      */
      template <typename T, typename F>
      void element_op(matrix_base<T, F> & A,
                      matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_tanh> > const & proxy)
      {
        matrix_base<T, F> const & B = proxy.lhs();   // operand of the unary expression

        // layout of A
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the operand
        unsigned int const B_start1         = static_cast<unsigned int>(viennacl::traits::start1(B));
        unsigned int const B_start2         = static_cast<unsigned int>(viennacl::traits::start2(B));
        unsigned int const B_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(B));
        unsigned int const B_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(B));
        unsigned int const B_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(B));
        unsigned int const B_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(B));

        if (viennacl::is_row_major<F>::value)
        {
          matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tanh_kernel");
        }
        else
        {
          matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<T>(B),
              B_start1, B_start2, B_inc1, B_inc2, B_internal_size1, B_internal_size2);
          VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tanh_kernel");
        }
      }


      //
      // Matrix-vector products
      //

      // A * x

      /** @brief Matrix-vector product dispatch: launches vec_mul_row_kernel or vec_mul_col_kernel depending on the storage layout F.
      *
      * In-place use (vec and result sharing one handle) is rejected by an assert; use a temporary instead.
      * No size checks are done here.
      *
      * @param mat     The matrix operand (start, stride, size and internal size are forwarded to the kernel)
      * @param vec     The vector operand (start, stride and size are forwarded)
      * @param result  The vector handed to the kernel as last operand (start, stride and size are forwarded)
      */
      template <typename NumericT, typename F>
      void prod_impl(const matrix_base<NumericT, F> & mat,
                     const vector_base<NumericT> & vec,
                           vector_base<NumericT> & result)
      {
        assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));

        // layout of the matrix
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(mat));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(mat));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(mat));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(mat));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(mat));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(mat));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(mat));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(mat));

        // layout of the input vector
        unsigned int const v_start = static_cast<unsigned int>(viennacl::traits::start(vec));
        unsigned int const v_inc   = static_cast<unsigned int>(viennacl::traits::stride(vec));
        unsigned int const v_size  = static_cast<unsigned int>(viennacl::traits::size(vec));

        // layout of the result vector
        unsigned int const r_start = static_cast<unsigned int>(viennacl::traits::start(result));
        unsigned int const r_inc   = static_cast<unsigned int>(viennacl::traits::stride(result));
        unsigned int const r_size  = static_cast<unsigned int>(viennacl::traits::size(result));

        if (viennacl::is_row_major<F>::value)
        {
          vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(mat),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<NumericT>(vec),
              v_start, v_inc, v_size,
              detail::cuda_arg<NumericT>(result),
              r_start, r_inc, r_size);
          VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_row_kernel");
        }
        else
        {
          vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(mat),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<NumericT>(vec),
              v_start, v_inc, v_size,
              detail::cuda_arg<NumericT>(result),
              r_start, r_inc, r_size);
          VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_col_kernel");
        }
      }


      // trans(A) * x

      /** @brief Transposed matrix-vector product dispatch: launches trans_vec_mul_row_kernel or trans_vec_mul_col_kernel depending on F.
      *
      * The kernel receives the layout of the untransposed operand mat_trans.lhs().
      * Asserts (in this order): size1(mat_trans) == size(result), size2(mat_trans) == size(vec), and vec/result not sharing a handle.
      *
      * @param mat_trans  Transposition expression; only mat_trans.lhs() is passed to the kernel
      * @param vec        The vector operand (start, stride and size are forwarded)
      * @param result     The vector handed to the kernel as last operand (start, stride and size are forwarded)
      */
      template <typename NumericT, typename F>
      void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
                     const vector_base<NumericT> & vec,
                           vector_base<NumericT> & result)
      {
        // remember: mat_trans is transposed, so its dimensions are compared against result/vec accordingly
        assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)"));
        assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)"));

        // Inplace products like x = prod(trans(A), x) are illegal: use y = prod(trans(A), x); x = y; instead
        assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));

        matrix_base<NumericT, F> const & A = mat_trans.lhs();   // the untransposed matrix

        // layout of the untransposed matrix
        unsigned int const A_start1         = static_cast<unsigned int>(viennacl::traits::start1(A));
        unsigned int const A_start2         = static_cast<unsigned int>(viennacl::traits::start2(A));
        unsigned int const A_inc1           = static_cast<unsigned int>(viennacl::traits::stride1(A));
        unsigned int const A_inc2           = static_cast<unsigned int>(viennacl::traits::stride2(A));
        unsigned int const A_size1          = static_cast<unsigned int>(viennacl::traits::size1(A));
        unsigned int const A_size2          = static_cast<unsigned int>(viennacl::traits::size2(A));
        unsigned int const A_internal_size1 = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
        unsigned int const A_internal_size2 = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

        // layout of the input vector
        unsigned int const v_start = static_cast<unsigned int>(viennacl::traits::start(vec));
        unsigned int const v_inc   = static_cast<unsigned int>(viennacl::traits::stride(vec));
        unsigned int const v_size  = static_cast<unsigned int>(viennacl::traits::size(vec));

        // layout of the result vector
        unsigned int const r_start = static_cast<unsigned int>(viennacl::traits::start(result));
        unsigned int const r_inc   = static_cast<unsigned int>(viennacl::traits::stride(result));
        unsigned int const r_size  = static_cast<unsigned int>(viennacl::traits::size(result));

        if (viennacl::is_row_major<F>::value)
        {
          trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<NumericT>(vec),
              v_start, v_inc, v_size,
              detail::cuda_arg<NumericT>(result),
              r_start, r_inc, r_size);
          VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_row_kernel");
        }
        else
        {
          trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
              A_start1, A_start2, A_inc1, A_inc2, A_size1, A_size2, A_internal_size1, A_internal_size2,
              detail::cuda_arg<NumericT>(vec),
              v_start, v_inc, v_size,
              detail::cuda_arg<NumericT>(result),
              r_start, r_inc, r_size);
          VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_col_kernel");
        }
      }


01482 // 01484 // 01485 01486 namespace detail 01487 { 01488 // C = A * B and possibly transposed variants 01489 template <typename T1, typename T2, typename T3, typename ScalarType > 01490 void prod_slow_kernel(const T1 & A, bool transposed_A, 01491 const T2 & B, bool transposed_B, 01492 T3 & C, 01493 ScalarType alpha, 01494 ScalarType beta) 01495 { 01496 typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type cpu_value_type; 01497 01498 cpu_value_type converted_alpha = static_cast<cpu_value_type>(alpha); 01499 cpu_value_type converted_beta = static_cast<cpu_value_type>(beta); 01500 01501 dim3 threads(16, 16); 01502 dim3 grid( (viennacl::traits::size1(C) - 1) / 16 + 1, 01503 (viennacl::traits::size2(C) - 1) / 16 + 1); 01504 01505 bool row_major_A = viennacl::is_row_major<T1>::value; 01506 bool row_major_B = viennacl::is_row_major<T2>::value; 01507 bool row_major_C = viennacl::is_row_major<T3>::value; 01508 01509 01510 if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B) 01511 { 01512 matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>> 01513 (converted_alpha, 01514 detail::cuda_arg<cpu_value_type>(A), 01515 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01516 static_cast<unsigned
int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01517 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01518 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01519 01520 detail::cuda_arg<cpu_value_type>(B), 01521 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01522 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01523 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01524 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01525 01526 converted_beta, 01527 detail::cuda_arg<cpu_value_type>(C), 01528 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01529 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01530 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01531 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01532 } 01533 else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B) 01534 { 01535 matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>> 01536 (converted_alpha, 01537 detail::cuda_arg<cpu_value_type>(A), 01538 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01539 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01540 static_cast<unsigned int>(viennacl::traits::size1(A)), 
static_cast<unsigned int>(viennacl::traits::size2(A)), 01541 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01542 01543 detail::cuda_arg<cpu_value_type>(B), 01544 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01545 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01546 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01547 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01548 01549 converted_beta, 01550 detail::cuda_arg<cpu_value_type>(C), 01551 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01552 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01553 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01554 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01555 } 01556 else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B) 01557 { 01558 matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>> 01559 (converted_alpha, 01560 detail::cuda_arg<cpu_value_type>(A), 01561 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01562 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01563 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01564 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned 
int>(viennacl::traits::internal_size2(A)), 01565 01566 detail::cuda_arg<cpu_value_type>(B), 01567 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01568 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01569 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01570 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01571 01572 converted_beta, 01573 detail::cuda_arg<cpu_value_type>(C), 01574 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01575 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01576 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01577 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01578 } 01579 else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B) 01580 { 01581 matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>> 01582 (converted_alpha, 01583 detail::cuda_arg<cpu_value_type>(A), 01584 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01585 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01586 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01587 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01588 01589 detail::cuda_arg<cpu_value_type>(B), 01590 static_cast<unsigned int>(viennacl::traits::start1(B)), 
static_cast<unsigned int>(viennacl::traits::start2(B)), 01591 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01592 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01593 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01594 01595 converted_beta, 01596 detail::cuda_arg<cpu_value_type>(C), 01597 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01598 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01599 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01600 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01601 } 01603 01604 else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B) 01605 { 01606 matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>> 01607 (converted_alpha, 01608 detail::cuda_arg<cpu_value_type>(A), 01609 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01610 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01611 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01612 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01613 01614 detail::cuda_arg<cpu_value_type>(B), 01615 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01616 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned 
int>(viennacl::traits::stride2(B)), 01617 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01618 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01619 01620 converted_beta, 01621 detail::cuda_arg<cpu_value_type>(C), 01622 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01623 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01624 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01625 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01626 } 01627 else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B) 01628 { 01629 matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>> 01630 (converted_alpha, 01631 detail::cuda_arg<cpu_value_type>(A), 01632 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01633 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01634 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01635 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01636 01637 detail::cuda_arg<cpu_value_type>(B), 01638 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01639 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01640 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01641 
static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01642 01643 converted_beta, 01644 detail::cuda_arg<cpu_value_type>(C), 01645 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01646 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01647 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01648 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01649 } 01650 else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B) 01651 { 01652 matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>> 01653 (converted_alpha, 01654 detail::cuda_arg<cpu_value_type>(A), 01655 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01656 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01657 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01658 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01659 01660 detail::cuda_arg<cpu_value_type>(B), 01661 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01662 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01663 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01664 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01665 01666 converted_beta, 01667 
detail::cuda_arg<cpu_value_type>(C), 01668 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01669 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01670 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01671 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01672 } 01673 else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B) 01674 { 01675 matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>> 01676 (converted_alpha, 01677 detail::cuda_arg<cpu_value_type>(A), 01678 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01679 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01680 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01681 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01682 01683 detail::cuda_arg<cpu_value_type>(B), 01684 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01685 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01686 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01687 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01688 01689 converted_beta, 01690 detail::cuda_arg<cpu_value_type>(C), 01691 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01692 
static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01693 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01694 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01695 } 01697 01698 else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B) 01699 { 01700 matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>> 01701 (converted_alpha, 01702 detail::cuda_arg<cpu_value_type>(A), 01703 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01704 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01705 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01706 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01707 01708 detail::cuda_arg<cpu_value_type>(B), 01709 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01710 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01711 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01712 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01713 01714 converted_beta, 01715 detail::cuda_arg<cpu_value_type>(C), 01716 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01717 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01718 static_cast<unsigned 
int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01719 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01720 } 01721 else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B) 01722 { 01723 matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>> 01724 (converted_alpha, 01725 detail::cuda_arg<cpu_value_type>(A), 01726 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01727 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01728 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01729 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01730 01731 detail::cuda_arg<cpu_value_type>(B), 01732 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01733 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01734 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01735 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01736 01737 converted_beta, 01738 detail::cuda_arg<cpu_value_type>(C), 01739 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01740 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01741 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01742 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), 
static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01743 } 01744 else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B) 01745 { 01746 matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>> 01747 (converted_alpha, 01748 detail::cuda_arg<cpu_value_type>(A), 01749 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01750 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01751 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01752 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01753 01754 detail::cuda_arg<cpu_value_type>(B), 01755 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01756 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01757 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01758 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01759 01760 converted_beta, 01761 detail::cuda_arg<cpu_value_type>(C), 01762 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01763 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01764 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01765 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01766 } 01767 else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B) 01768 
{ 01769 matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>> 01770 (converted_alpha, 01771 detail::cuda_arg<cpu_value_type>(A), 01772 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01773 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01774 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01775 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01776 01777 detail::cuda_arg<cpu_value_type>(B), 01778 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01779 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01780 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01781 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01782 01783 converted_beta, 01784 detail::cuda_arg<cpu_value_type>(C), 01785 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01786 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01787 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01788 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01789 } 01791 01792 else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B) 01793 { 01794 matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>> 01795 (converted_alpha, 01796 detail::cuda_arg<cpu_value_type>(A), 01797 static_cast<unsigned 
int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01798 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01799 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01800 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01801 01802 detail::cuda_arg<cpu_value_type>(B), 01803 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01804 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01805 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01806 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01807 01808 converted_beta, 01809 detail::cuda_arg<cpu_value_type>(C), 01810 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01811 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01812 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01813 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01814 } 01815 else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B) 01816 { 01817 matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>> 01818 (converted_alpha, 01819 detail::cuda_arg<cpu_value_type>(A), 01820 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01821 static_cast<unsigned int>(viennacl::traits::stride1(A)), 
static_cast<unsigned int>(viennacl::traits::stride2(A)), 01822 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01823 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01824 01825 detail::cuda_arg<cpu_value_type>(B), 01826 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01827 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01828 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01829 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01830 01831 converted_beta, 01832 detail::cuda_arg<cpu_value_type>(C), 01833 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01834 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01835 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01836 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01837 } 01838 else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B) 01839 { 01840 matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>> 01841 (converted_alpha, 01842 detail::cuda_arg<cpu_value_type>(A), 01843 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01844 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01845 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 
01846 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01847 01848 detail::cuda_arg<cpu_value_type>(B), 01849 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01850 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01851 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01852 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01853 01854 converted_beta, 01855 detail::cuda_arg<cpu_value_type>(C), 01856 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01857 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01858 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01859 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01860 } 01861 else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B) 01862 { 01863 matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>> 01864 (converted_alpha, 01865 detail::cuda_arg<cpu_value_type>(A), 01866 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01867 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01868 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01869 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01870 01871 
detail::cuda_arg<cpu_value_type>(B), 01872 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01873 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01874 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01875 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01876 01877 converted_beta, 01878 detail::cuda_arg<cpu_value_type>(C), 01879 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01880 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01881 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01882 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01883 } 01885 01886 else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B) 01887 { 01888 matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>> 01889 (converted_alpha, 01890 detail::cuda_arg<cpu_value_type>(A), 01891 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01892 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01893 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01894 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01895 01896 detail::cuda_arg<cpu_value_type>(B), 01897 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01898 
static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01899 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01900 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01901 01902 converted_beta, 01903 detail::cuda_arg<cpu_value_type>(C), 01904 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01905 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01906 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01907 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01908 } 01909 else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B) 01910 { 01911 matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>> 01912 (converted_alpha, 01913 detail::cuda_arg<cpu_value_type>(A), 01914 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01915 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01916 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01917 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01918 01919 detail::cuda_arg<cpu_value_type>(B), 01920 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01921 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01922 static_cast<unsigned 
int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01923 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01924 01925 converted_beta, 01926 detail::cuda_arg<cpu_value_type>(C), 01927 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01928 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01929 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01930 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01931 } 01932 else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B) 01933 { 01934 matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>> 01935 (converted_alpha, 01936 detail::cuda_arg<cpu_value_type>(A), 01937 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01938 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01939 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01940 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01941 01942 detail::cuda_arg<cpu_value_type>(B), 01943 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01944 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01945 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01946 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), 
static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01947 01948 converted_beta, 01949 detail::cuda_arg<cpu_value_type>(C), 01950 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01951 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01952 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01953 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01954 } 01955 else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B) 01956 { 01957 matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>> 01958 (converted_alpha, 01959 detail::cuda_arg<cpu_value_type>(A), 01960 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01961 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01962 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01963 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01964 01965 detail::cuda_arg<cpu_value_type>(B), 01966 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01967 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01968 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01969 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01970 01971 converted_beta, 01972 detail::cuda_arg<cpu_value_type>(C), 01973 static_cast<unsigned 
int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01974 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 01975 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 01976 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 01977 } 01979 01980 else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B) 01981 { 01982 matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>> 01983 (converted_alpha, 01984 detail::cuda_arg<cpu_value_type>(A), 01985 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 01986 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 01987 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 01988 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 01989 01990 detail::cuda_arg<cpu_value_type>(B), 01991 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 01992 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 01993 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 01994 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 01995 01996 converted_beta, 01997 detail::cuda_arg<cpu_value_type>(C), 01998 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 01999 static_cast<unsigned int>(viennacl::traits::stride1(C)), 
static_cast<unsigned int>(viennacl::traits::stride2(C)), 02000 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02001 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02002 } 02003 else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B) 02004 { 02005 matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>> 02006 (converted_alpha, 02007 detail::cuda_arg<cpu_value_type>(A), 02008 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02009 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02010 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02011 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02012 02013 detail::cuda_arg<cpu_value_type>(B), 02014 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02015 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02016 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02017 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02018 02019 converted_beta, 02020 detail::cuda_arg<cpu_value_type>(C), 02021 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02022 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02023 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 
02024 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02025 } 02026 else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B) 02027 { 02028 matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>> 02029 (converted_alpha, 02030 detail::cuda_arg<cpu_value_type>(A), 02031 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02032 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02033 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02034 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02035 02036 detail::cuda_arg<cpu_value_type>(B), 02037 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02038 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02039 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02040 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02041 02042 converted_beta, 02043 detail::cuda_arg<cpu_value_type>(C), 02044 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02045 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02046 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02047 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02048 } 02049 else if (row_major_C 
&& !row_major_A && row_major_B && transposed_A && transposed_B) 02050 { 02051 matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>> 02052 (converted_alpha, 02053 detail::cuda_arg<cpu_value_type>(A), 02054 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02055 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02056 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02057 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02058 02059 detail::cuda_arg<cpu_value_type>(B), 02060 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02061 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02062 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02063 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02064 02065 converted_beta, 02066 detail::cuda_arg<cpu_value_type>(C), 02067 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02068 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02069 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02070 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02071 } 02073 02074 else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B) 02075 { 02076 matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>> 02077 (converted_alpha, 
02078 detail::cuda_arg<cpu_value_type>(A), 02079 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02080 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02081 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02082 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02083 02084 detail::cuda_arg<cpu_value_type>(B), 02085 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02086 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02087 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02088 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02089 02090 converted_beta, 02091 detail::cuda_arg<cpu_value_type>(C), 02092 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02093 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02094 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02095 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02096 } 02097 else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B) 02098 { 02099 matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>> 02100 (converted_alpha, 02101 detail::cuda_arg<cpu_value_type>(A), 02102 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02103 
static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02104 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02105 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02106 02107 detail::cuda_arg<cpu_value_type>(B), 02108 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02109 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02110 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02111 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02112 02113 converted_beta, 02114 detail::cuda_arg<cpu_value_type>(C), 02115 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02116 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02117 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02118 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02119 } 02120 else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B) 02121 { 02122 matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>> 02123 (converted_alpha, 02124 detail::cuda_arg<cpu_value_type>(A), 02125 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02126 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02127 static_cast<unsigned 
int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02128 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02129 02130 detail::cuda_arg<cpu_value_type>(B), 02131 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02132 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02133 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02134 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02135 02136 converted_beta, 02137 detail::cuda_arg<cpu_value_type>(C), 02138 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02139 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02140 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02141 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02142 } 02143 else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B) 02144 { 02145 matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>> 02146 (converted_alpha, 02147 detail::cuda_arg<cpu_value_type>(A), 02148 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02149 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02150 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02151 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), 
static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02152 02153 detail::cuda_arg<cpu_value_type>(B), 02154 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02155 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02156 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02157 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02158 02159 converted_beta, 02160 detail::cuda_arg<cpu_value_type>(C), 02161 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02162 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02163 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02164 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02165 } 02166 02167 02169 02170 else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B) 02171 { 02172 matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>> 02173 (converted_alpha, 02174 detail::cuda_arg<cpu_value_type>(A), 02175 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02176 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02177 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02178 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02179 02180 detail::cuda_arg<cpu_value_type>(B), 02181 static_cast<unsigned 
int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02182 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02183 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02184 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02185 02186 converted_beta, 02187 detail::cuda_arg<cpu_value_type>(C), 02188 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02189 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02190 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02191 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02192 } 02193 else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B) 02194 { 02195 matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>> 02196 (converted_alpha, 02197 detail::cuda_arg<cpu_value_type>(A), 02198 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02199 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02200 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02201 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02202 02203 detail::cuda_arg<cpu_value_type>(B), 02204 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02205 static_cast<unsigned int>(viennacl::traits::stride1(B)), 
static_cast<unsigned int>(viennacl::traits::stride2(B)), 02206 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02207 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02208 02209 converted_beta, 02210 detail::cuda_arg<cpu_value_type>(C), 02211 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02212 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02213 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02214 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02215 } 02216 else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B) 02217 { 02218 matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>> 02219 (converted_alpha, 02220 detail::cuda_arg<cpu_value_type>(A), 02221 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02222 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02223 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02224 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02225 02226 detail::cuda_arg<cpu_value_type>(B), 02227 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02228 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02229 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 
02230 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02231 02232 converted_beta, 02233 detail::cuda_arg<cpu_value_type>(C), 02234 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02235 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02236 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02237 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02238 } 02239 else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B) 02240 { 02241 matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>> 02242 (converted_alpha, 02243 detail::cuda_arg<cpu_value_type>(A), 02244 static_cast<unsigned int>(viennacl::traits::start1(A)), static_cast<unsigned int>(viennacl::traits::start2(A)), 02245 static_cast<unsigned int>(viennacl::traits::stride1(A)), static_cast<unsigned int>(viennacl::traits::stride2(A)), 02246 static_cast<unsigned int>(viennacl::traits::size1(A)), static_cast<unsigned int>(viennacl::traits::size2(A)), 02247 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)), 02248 02249 detail::cuda_arg<cpu_value_type>(B), 02250 static_cast<unsigned int>(viennacl::traits::start1(B)), static_cast<unsigned int>(viennacl::traits::start2(B)), 02251 static_cast<unsigned int>(viennacl::traits::stride1(B)), static_cast<unsigned int>(viennacl::traits::stride2(B)), 02252 static_cast<unsigned int>(viennacl::traits::size1(B)), static_cast<unsigned int>(viennacl::traits::size2(B)), 02253 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)), 02254 02255 converted_beta, 02256 
detail::cuda_arg<cpu_value_type>(C), 02257 static_cast<unsigned int>(viennacl::traits::start1(C)), static_cast<unsigned int>(viennacl::traits::start2(C)), 02258 static_cast<unsigned int>(viennacl::traits::stride1(C)), static_cast<unsigned int>(viennacl::traits::stride2(C)), 02259 static_cast<unsigned int>(viennacl::traits::size1(C)), static_cast<unsigned int>(viennacl::traits::size2(C)), 02260 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) ); 02261 } 02262 02263 } 02264 02265 // C = A * B, using fast kernel 02266 template <typename T1, typename T2, typename T3, typename ScalarType > 02267 void prod_fast_kernel(const T1 & A, 02268 const T2 & B, 02269 T3 & C, 02270 ScalarType alpha, 02271 ScalarType beta, 02272 std::string kernel_name) 02273 { 02274 typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type cpu_value_type; 02275 02276 cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha); 02277 cpu_value_type cl_beta = static_cast<cpu_value_type>(beta); 02278 02279 /*viennacl::ocl::enqueue(k(cl_alpha, 02280 viennacl::traits::opencl_handle(A), 02281 cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)), 02282 cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)), 02283 cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)), 02284 cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)), 02285 02286 viennacl::traits::opencl_handle(B), 02287 cl_uint(viennacl::traits::start1(B)), cl_uint(viennacl::traits::start2(B)), 02288 cl_uint(viennacl::traits::stride1(B)), cl_uint(viennacl::traits::stride2(B)), 02289 cl_uint(viennacl::traits::size1(B)), cl_uint(viennacl::traits::size2(B)), 02290 cl_uint(viennacl::traits::internal_size1(B)), cl_uint(viennacl::traits::internal_size2(B)), 02291 02292 cl_beta, 02293 viennacl::traits::opencl_handle(C), 02294 
cl_uint(viennacl::traits::start1(C)), cl_uint(viennacl::traits::start2(C)), 02295 cl_uint(viennacl::traits::stride1(C)), cl_uint(viennacl::traits::stride2(C)), 02296 cl_uint(viennacl::traits::size1(C)), cl_uint(viennacl::traits::size2(C)), 02297 cl_uint(viennacl::traits::internal_size1(C)), cl_uint(viennacl::traits::internal_size2(C)) 02298 ) 02299 );*/ 02300 02301 throw "not implemented yet"; 02302 } 02303 02304 template <typename T1, typename T2, typename T3, typename ScalarType > 02305 void prod(const T1 & A, bool transposed_A, 02306 const T2 & B, bool transposed_B, 02307 T3 & C, 02308 ScalarType alpha, 02309 ScalarType beta) 02310 { 02311 if ( (viennacl::traits::size1(A) < 64) 02312 || (viennacl::traits::size2(A) < 64) 02313 || (viennacl::traits::size1(B) < 64) ) //there is most likely not enough to compute, rendering kernel launch overhead considerable 02314 { 02315 prod_slow_kernel(A, transposed_A, 02316 B, transposed_B, 02317 C, alpha, beta); 02318 } 02319 /*else if ( (viennacl::traits::size1(A) % 64 == 0) 02320 && (viennacl::traits::size2(A) % 64 == 0) 02321 && (viennacl::traits::size1(B) % 64 == 0) ) // allows the use of the fast kernel only 02322 { 02323 prod_fast_kernel(A, B, C, alpha, beta); 02324 //prod_slow_kernel(A, B, C, slow_kernel_name); 02325 }*/ 02326 else //TODO: use four kernels 02327 { 02328 prod_slow_kernel(A, transposed_A, 02329 B, transposed_B, 02330 C, alpha, beta); 02331 } 02332 02333 } 02334 } // namespace detail 02335 02336 02342 template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType > 02343 void prod_impl(const matrix_base<NumericT, F1> & A, 02344 const matrix_base<NumericT, F2> & B, 02345 matrix_base<NumericT, F3> & C, 02346 ScalarType alpha, 02347 ScalarType beta) 02348 { 02349 assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)")); 02350 assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size 
mismatch in C = prod(A, B): size2(A) != size1(B)")); 02351 assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)")); 02352 02353 // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead 02354 /*assert( (viennacl::traits::handle(C) != viennacl::traits::handle(A)) 02355 && (viennacl::traits::handle(C) != viennacl::traits::handle(B)) 02356 && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/ 02357 02358 02359 detail::prod(A, false, 02360 B, false, 02361 C, alpha, beta); 02362 } 02363 02364 02365 02371 template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType > 02372 void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, 02373 const matrix_base<NumericT, F1>, 02374 op_trans> & A, 02375 const matrix_base<NumericT, F2> & B, 02376 matrix_base<NumericT, F3> & C, 02377 ScalarType alpha, 02378 ScalarType beta) 02379 { 02380 //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl; 02381 //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl; 02382 assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)")); 02383 assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)")); 02384 assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)")); 02385 02386 // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead 02387 assert( (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs())) 02388 && (viennacl::traits::handle(C) != viennacl::traits::handle(B)) 02389 && bool("No 
direct inplace matrix-matrix product possible. Introduce a temporary!")); 02390 02391 detail::prod(A.lhs(), true, 02392 B, false, 02393 C, alpha, beta); 02394 } 02395 02396 02397 02398 02404 template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType > 02405 void prod_impl(const matrix_base<NumericT, F1> & A, 02406 const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B, 02407 matrix_base<NumericT, F3> & C, 02408 ScalarType alpha, 02409 ScalarType beta) 02410 { 02411 assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)")); 02412 assert( (viennacl::traits::size2(A) == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)")); 02413 assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)")); 02414 02415 // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead 02416 detail::prod(A, false, 02417 B.lhs(), true, 02418 C, alpha, beta); 02419 } 02420 02421 02422 02428 template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType > 02429 void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A, 02430 const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B, 02431 matrix_base<NumericT, F3> & C, 02432 ScalarType alpha, 02433 ScalarType beta) 02434 { 02435 assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C) && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)")); 02436 assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): 
size1(A) != size2(B)")); 02437 assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)")); 02438 02439 // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead 02440 assert( (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs())) 02441 && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs())) 02442 && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!")); 02443 02444 detail::prod(A.lhs(), true, 02445 B.lhs(), true, 02446 C, alpha, beta); 02447 } 02448 02449 02450 02451 02452 // 02454 // 02455 02456 02469 template <typename NumericT, typename F, typename S1> 02470 void scaled_rank_1_update(matrix_base<NumericT, F> & mat1, 02471 S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 02472 const vector_base<NumericT> & vec1, 02473 const vector_base<NumericT> & vec2) 02474 { 02475 assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)")); 02476 assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)")); 02477 02478 typedef NumericT value_type; 02479 02480 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 02481 02482 value_type temporary_alpha = 0; 02483 if (viennacl::is_cpu_scalar<S1>::value) 02484 temporary_alpha = alpha; 02485 02486 if (viennacl::is_row_major<F>::value) 02487 { 02488 scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 02489 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)), 02490 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 
02491 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 02492 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 02493 02494 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 02495 options_alpha, 02496 02497 detail::cuda_arg<value_type>(vec1), 02498 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02499 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02500 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02501 02502 detail::cuda_arg<value_type>(vec2), 02503 static_cast<unsigned int>(viennacl::traits::start(vec2)), 02504 static_cast<unsigned int>(viennacl::traits::stride(vec2)), 02505 static_cast<unsigned int>(viennacl::traits::size(vec2)) 02506 ); 02507 VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_row_kernel"); 02508 } 02509 else 02510 { 02511 scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1), 02512 static_cast<unsigned int>(viennacl::traits::start1(mat1)), static_cast<unsigned int>(viennacl::traits::start2(mat1)), 02513 static_cast<unsigned int>(viennacl::traits::stride1(mat1)), static_cast<unsigned int>(viennacl::traits::stride2(mat1)), 02514 static_cast<unsigned int>(viennacl::traits::size1(mat1)), static_cast<unsigned int>(viennacl::traits::size2(mat1)), 02515 static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)), 02516 02517 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 02518 options_alpha, 02519 02520 detail::cuda_arg<value_type>(vec1), 02521 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02522 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02523 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02524 02525 detail::cuda_arg<value_type>(vec2), 02526 static_cast<unsigned 
int>(viennacl::traits::start(vec2)), 02527 static_cast<unsigned int>(viennacl::traits::stride(vec2)), 02528 static_cast<unsigned int>(viennacl::traits::size(vec2)) 02529 ); 02530 VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_col_kernel"); 02531 } 02532 } 02533 02534 } // namespace opencl 02535 } //namespace linalg 02536 } //namespace viennacl 02537 02538 02539 #endif