ViennaCL - The Vienna Computing Library
1.5.2
|
00001 #ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_ 00002 #define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00025 #include <cmath> 00026 #include "viennacl/forwards.h" 00027 #include "viennacl/scalar.hpp" 00028 #include "viennacl/tools/tools.hpp" 00029 #include "viennacl/meta/predicate.hpp" 00030 #include "viennacl/meta/enable_if.hpp" 00031 #include "viennacl/traits/size.hpp" 00032 #include "viennacl/traits/start.hpp" 00033 #include "viennacl/traits/stride.hpp" 00034 00035 namespace viennacl 00036 { 00037 namespace linalg 00038 { 00039 namespace cuda 00040 { 00041 00042 // 00043 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here! 00044 // 00045 00046 00048 00049 // gpu scalar 00050 template <typename T> 00051 __global__ void av_kernel(T * vec1, 00052 unsigned int start1, 00053 unsigned int inc1, 00054 unsigned int size1, 00055 00056 const T * fac2, 00057 unsigned int options2, 00058 const T * vec2, 00059 unsigned int start2, 00060 unsigned int inc2) 00061 { 00062 T alpha = *fac2; 00063 if (options2 & (1 << 0)) 00064 alpha = -alpha; 00065 00066 if (options2 & (1 << 1)) 00067 { 00068 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00069 i < size1; 00070 i += gridDim.x * blockDim.x) 00071 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha; 00072 } 00073 else 00074 { 00075 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00076 i < size1; 00077 i += gridDim.x * blockDim.x) 00078 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha; 00079 } 00080 } 00081 00082 // cpu scalar 00083 template <typename T> 00084 __global__ void av_kernel(T * vec1, 00085 unsigned int start1, 00086 unsigned int inc1, 00087 unsigned int size1, 00088 00089 T fac2, 00090 unsigned int options2, 00091 const T * vec2, 00092 unsigned int start2, 00093 unsigned int inc2) 00094 { 00095 T alpha = fac2; 00096 if (options2 & (1 << 0)) 00097 alpha = -alpha; 00098 00099 if (options2 & (1 << 1)) 00100 { 00101 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00102 i < size1; 00103 i += gridDim.x * blockDim.x) 00104 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha; 00105 } 00106 else 00107 { 00108 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00109 i < size1; 00110 i += gridDim.x * blockDim.x) 00111 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha; 00112 } 00113 } 00114 00115 00116 00117 template <typename T, typename ScalarType1> 00118 void av(vector_base<T> & vec1, 00119 vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) 00120 { 00121 typedef T value_type; 00122 00123 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00124 00125 value_type data_alpha = alpha; 00126 if (flip_sign_alpha) 00127 
data_alpha = -data_alpha; 00128 if (reciprocal_alpha) 00129 data_alpha = static_cast<value_type>(1) / data_alpha; 00130 00131 value_type temporary_alpha = 0; 00132 if (viennacl::is_cpu_scalar<ScalarType1>::value) 00133 temporary_alpha = alpha; 00134 00135 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00136 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00137 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00138 static_cast<unsigned int>(viennacl::traits::size(vec1)), 00139 00140 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00141 options_alpha, 00142 detail::cuda_arg<value_type>(vec2), 00143 static_cast<unsigned int>(viennacl::traits::start(vec2)), 00144 static_cast<unsigned int>(viennacl::traits::stride(vec2)) ); 00145 VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel"); 00146 } 00147 00148 00150 00151 // alpha and beta on GPU 00152 template <typename T> 00153 __global__ void avbv_kernel(T * vec1, 00154 unsigned int start1, 00155 unsigned int inc1, 00156 unsigned int size1, 00157 00158 const T * fac2, 00159 unsigned int options2, 00160 const T * vec2, 00161 unsigned int start2, 00162 unsigned int inc2, 00163 00164 const T * fac3, 00165 unsigned int options3, 00166 const T * vec3, 00167 unsigned int start3, 00168 unsigned int inc3) 00169 { 00170 T alpha = *fac2; 00171 if (options2 & (1 << 0)) 00172 alpha = -alpha; 00173 00174 T beta = *fac3; 00175 if (options3 & (1 << 0)) 00176 beta = -beta; 00177 00178 if (options2 & (1 << 1)) 00179 { 00180 if (options3 & (1 << 1)) 00181 { 00182 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00183 i < size1; 00184 i += gridDim.x * blockDim.x) 00185 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00186 } 00187 else 00188 { 00189 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00190 i < size1; 00191 i += gridDim.x * blockDim.x) 00192 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00193 } 00194 } 00195 else 00196 { 00197 if (options3 & (1 << 1)) 00198 { 00199 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00200 i < size1; 00201 i += gridDim.x * blockDim.x) 00202 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00203 } 00204 else 00205 { 00206 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00207 i < size1; 00208 i += gridDim.x * blockDim.x) 00209 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00210 } 00211 } 00212 } 00213 00214 // alpha on CPU, beta on GPU 00215 template <typename T> 00216 __global__ void avbv_kernel(T * vec1, 00217 unsigned int start1, 00218 unsigned int inc1, 00219 unsigned int size1, 00220 00221 T fac2, 00222 unsigned int options2, 00223 const T * vec2, 00224 unsigned int start2, 00225 unsigned int inc2, 00226 00227 const T * fac3, 00228 unsigned int options3, 00229 const T * vec3, 00230 unsigned int start3, 00231 unsigned int inc3) 00232 { 00233 T alpha = fac2; 00234 if (options2 & (1 << 0)) 00235 alpha = -alpha; 00236 00237 T beta = *fac3; 00238 if (options3 & (1 << 0)) 00239 beta = -beta; 00240 00241 if (options2 & (1 << 1)) 00242 { 00243 if (options3 & (1 << 1)) 00244 { 00245 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00246 i < size1; 00247 i += gridDim.x * blockDim.x) 00248 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00249 } 00250 else 00251 { 00252 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00253 i < size1; 00254 i += 
gridDim.x * blockDim.x) 00255 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00256 } 00257 } 00258 else 00259 { 00260 if (options3 & (1 << 1)) 00261 { 00262 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00263 i < size1; 00264 i += gridDim.x * blockDim.x) 00265 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00266 } 00267 else 00268 { 00269 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00270 i < size1; 00271 i += gridDim.x * blockDim.x) 00272 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00273 } 00274 } 00275 } 00276 00277 // alpha on GPU, beta on CPU 00278 template <typename T> 00279 __global__ void avbv_kernel(T * vec1, 00280 unsigned int start1, 00281 unsigned int inc1, 00282 unsigned int size1, 00283 00284 const T * fac2, 00285 unsigned int options2, 00286 const T * vec2, 00287 unsigned int start2, 00288 unsigned int inc2, 00289 00290 T fac3, 00291 unsigned int options3, 00292 const T * vec3, 00293 unsigned int start3, 00294 unsigned int inc3) 00295 { 00296 T alpha = *fac2; 00297 if (options2 & (1 << 0)) 00298 alpha = -alpha; 00299 00300 T beta = fac3; 00301 if (options3 & (1 << 0)) 00302 beta = -beta; 00303 00304 if (options2 & (1 << 1)) 00305 { 00306 if (options3 & (1 << 1)) 00307 { 00308 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00309 i < size1; 00310 i += gridDim.x * blockDim.x) 00311 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00312 } 00313 else 00314 { 00315 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00316 i < size1; 00317 i += gridDim.x * blockDim.x) 00318 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00319 } 00320 } 00321 else 00322 { 00323 if (options3 & (1 << 1)) 00324 { 00325 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00326 i < size1; 00327 i += gridDim.x * blockDim.x) 00328 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00329 } 00330 else 00331 { 00332 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00333 i < size1; 00334 i += gridDim.x * blockDim.x) 00335 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00336 } 00337 } 00338 } 00339 00340 // alpha and beta on CPU 00341 template <typename T> 00342 __global__ void avbv_kernel(T * vec1, 00343 unsigned int start1, 00344 unsigned int inc1, 00345 unsigned int size1, 00346 00347 T fac2, 00348 unsigned int options2, 00349 const T * vec2, 00350 unsigned int start2, 00351 unsigned int inc2, 00352 00353 T fac3, 00354 unsigned int options3, 00355 const T * vec3, 00356 unsigned int start3, 00357 unsigned int inc3) 00358 { 00359 T alpha = fac2; 00360 if (options2 & (1 << 0)) 00361 alpha = -alpha; 00362 00363 T beta = fac3; 00364 if (options3 & (1 << 0)) 00365 beta = -beta; 00366 00367 if (options2 & (1 << 1)) 00368 { 00369 if (options3 & (1 << 1)) 00370 { 00371 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00372 i < size1; 00373 i += gridDim.x * blockDim.x) 00374 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00375 } 00376 else 00377 { 00378 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00379 i < size1; 00380 i += gridDim.x * blockDim.x) 00381 vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00382 } 00383 } 00384 else 00385 { 00386 if (options3 & (1 << 1)) 00387 { 00388 for (unsigned int i = blockDim.x * blockIdx.x + 
threadIdx.x; 00389 i < size1; 00390 i += gridDim.x * blockDim.x) 00391 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00392 } 00393 else 00394 { 00395 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00396 i < size1; 00397 i += gridDim.x * blockDim.x) 00398 vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00399 } 00400 } 00401 } 00402 00403 00404 00405 00406 template <typename T, typename ScalarType1, typename ScalarType2> 00407 void avbv(vector_base<T> & vec1, 00408 vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 00409 vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) 00410 { 00411 typedef T value_type; 00412 00413 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00414 00415 value_type data_alpha = alpha; 00416 if (flip_sign_alpha) 00417 data_alpha = -data_alpha; 00418 if (reciprocal_alpha) 00419 data_alpha = static_cast<value_type>(1) / data_alpha; 00420 00421 value_type temporary_alpha = 0; 00422 if (viennacl::is_cpu_scalar<ScalarType1>::value) 00423 temporary_alpha = alpha; 00424 00425 unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); 00426 00427 value_type temporary_beta = 0; 00428 if (viennacl::is_cpu_scalar<ScalarType2>::value) 00429 temporary_beta = beta; 00430 00431 00432 avbv_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00433 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00434 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00435 static_cast<unsigned int>(viennacl::traits::size(vec1)), 00436 00437 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00438 options_alpha, 00439 detail::cuda_arg<value_type>(vec2), 00440 static_cast<unsigned int>(viennacl::traits::start(vec2)), 00441 static_cast<unsigned int>(viennacl::traits::stride(vec2)), 00442 00443 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), 00444 options_beta, 00445 detail::cuda_arg<value_type>(vec3), 00446 static_cast<unsigned int>(viennacl::traits::start(vec3)), 00447 static_cast<unsigned int>(viennacl::traits::stride(vec3)) ); 00448 VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel"); 00449 } 00450 00451 00453 00454 00455 // alpha and beta on GPU 00456 template <typename T> 00457 __global__ void avbv_v_kernel(T * vec1, 00458 unsigned int start1, 00459 unsigned int inc1, 00460 unsigned int size1, 00461 00462 const T * fac2, 00463 unsigned int options2, 00464 const T * vec2, 00465 unsigned int start2, 00466 unsigned int inc2, 00467 00468 const T * fac3, 00469 unsigned int options3, 00470 const T * vec3, 00471 unsigned int start3, 00472 unsigned int inc3) 00473 { 00474 T alpha = *fac2; 00475 if (options2 & (1 << 0)) 00476 alpha = -alpha; 00477 00478 T beta = *fac3; 00479 if (options3 & (1 << 0)) 00480 beta = -beta; 00481 00482 if (options2 & (1 << 1)) 00483 { 00484 if (options3 & (1 << 1)) 00485 { 00486 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00487 i < size1; 00488 i += gridDim.x * blockDim.x) 00489 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00490 } 00491 else 00492 { 00493 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00494 i < size1; 00495 i += gridDim.x * blockDim.x) 00496 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00497 } 
00498 } 00499 else 00500 { 00501 if (options3 & (1 << 1)) 00502 { 00503 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00504 i < size1; 00505 i += gridDim.x * blockDim.x) 00506 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00507 } 00508 else 00509 { 00510 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00511 i < size1; 00512 i += gridDim.x * blockDim.x) 00513 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00514 } 00515 } 00516 } 00517 00518 // alpha on CPU, beta on GPU 00519 template <typename T> 00520 __global__ void avbv_v_kernel(T * vec1, 00521 unsigned int start1, 00522 unsigned int inc1, 00523 unsigned int size1, 00524 00525 T fac2, 00526 unsigned int options2, 00527 const T * vec2, 00528 unsigned int start2, 00529 unsigned int inc2, 00530 00531 const T * fac3, 00532 unsigned int options3, 00533 const T * vec3, 00534 unsigned int start3, 00535 unsigned int inc3) 00536 { 00537 T alpha = fac2; 00538 if (options2 & (1 << 0)) 00539 alpha = -alpha; 00540 00541 T beta = *fac3; 00542 if (options3 & (1 << 0)) 00543 beta = -beta; 00544 00545 if (options2 & (1 << 1)) 00546 { 00547 if (options3 & (1 << 1)) 00548 { 00549 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00550 i < size1; 00551 i += gridDim.x * blockDim.x) 00552 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00553 } 00554 else 00555 { 00556 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00557 i < size1; 00558 i += gridDim.x * blockDim.x) 00559 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00560 } 00561 } 00562 else 00563 { 00564 if (options3 & (1 << 1)) 00565 { 00566 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00567 i < size1; 00568 i += gridDim.x * blockDim.x) 00569 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00570 } 00571 else 00572 { 00573 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00574 i < size1; 00575 i += gridDim.x * blockDim.x) 00576 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00577 } 00578 } 00579 } 00580 00581 // alpha on GPU, beta on CPU 00582 template <typename T> 00583 __global__ void avbv_v_kernel(T * vec1, 00584 unsigned int start1, 00585 unsigned int inc1, 00586 unsigned int size1, 00587 00588 const T * fac2, 00589 unsigned int options2, 00590 const T * vec2, 00591 unsigned int start2, 00592 unsigned int inc2, 00593 00594 T fac3, 00595 unsigned int options3, 00596 const T * vec3, 00597 unsigned int start3, 00598 unsigned int inc3) 00599 { 00600 T alpha = *fac2; 00601 if (options2 & (1 << 0)) 00602 alpha = -alpha; 00603 00604 T beta = fac3; 00605 if (options3 & (1 << 0)) 00606 beta = -beta; 00607 00608 if (options2 & (1 << 1)) 00609 { 00610 if (options3 & (1 << 1)) 00611 { 00612 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00613 i < size1; 00614 i += gridDim.x * blockDim.x) 00615 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00616 } 00617 else 00618 { 00619 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00620 i < size1; 00621 i += gridDim.x * blockDim.x) 00622 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00623 } 00624 } 00625 else 00626 { 00627 if (options3 & (1 << 1)) 00628 { 00629 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00630 i < size1; 00631 i += gridDim.x * blockDim.x) 00632 vec1[i*inc1+start1] += 
vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00633 } 00634 else 00635 { 00636 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00637 i < size1; 00638 i += gridDim.x * blockDim.x) 00639 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00640 } 00641 } 00642 } 00643 00644 // alpha and beta on CPU 00645 template <typename T> 00646 __global__ void avbv_v_kernel(T * vec1, 00647 unsigned int start1, 00648 unsigned int inc1, 00649 unsigned int size1, 00650 00651 T fac2, 00652 unsigned int options2, 00653 const T * vec2, 00654 unsigned int start2, 00655 unsigned int inc2, 00656 00657 T fac3, 00658 unsigned int options3, 00659 const T * vec3, 00660 unsigned int start3, 00661 unsigned int inc3) 00662 { 00663 T alpha = fac2; 00664 if (options2 & (1 << 0)) 00665 alpha = -alpha; 00666 00667 T beta = fac3; 00668 if (options3 & (1 << 0)) 00669 beta = -beta; 00670 00671 if (options2 & (1 << 1)) 00672 { 00673 if (options3 & (1 << 1)) 00674 { 00675 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00676 i < size1; 00677 i += gridDim.x * blockDim.x) 00678 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta; 00679 } 00680 else 00681 { 00682 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00683 i < size1; 00684 i += gridDim.x * blockDim.x) 00685 vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta; 00686 } 00687 } 00688 else 00689 { 00690 if (options3 & (1 << 1)) 00691 { 00692 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00693 i < size1; 00694 i += gridDim.x * blockDim.x) 00695 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta; 00696 } 00697 else 00698 { 00699 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00700 i < size1; 00701 i += gridDim.x * blockDim.x) 00702 vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta; 00703 } 00704 } 00705 } 00706 00707 00708 template <typename T, typename ScalarType1, typename ScalarType2> 00709 void avbv_v(vector_base<T> & vec1, 00710 vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 00711 vector_base<T> const & vec3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) 00712 { 00713 typedef T value_type; 00714 00715 unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00716 00717 value_type data_alpha = alpha; 00718 if (flip_sign_alpha) 00719 data_alpha = -data_alpha; 00720 if (reciprocal_alpha) 00721 data_alpha = static_cast<value_type>(1) / data_alpha; 00722 00723 value_type temporary_alpha = 0; 00724 if (viennacl::is_cpu_scalar<ScalarType1>::value) 00725 temporary_alpha = alpha; 00726 00727 unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); 00728 00729 value_type temporary_beta = 0; 00730 if (viennacl::is_cpu_scalar<ScalarType2>::value) 00731 temporary_beta = beta; 00732 00733 00734 avbv_v_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00735 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00736 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00737 static_cast<unsigned int>(viennacl::traits::size(vec1)), 00738 00739 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 00740 options_alpha, 00741 detail::cuda_arg<value_type>(vec2), 00742 static_cast<unsigned int>(viennacl::traits::start(vec2)), 00743 
static_cast<unsigned int>(viennacl::traits::stride(vec2)), 00744 00745 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)), 00746 options_beta, 00747 detail::cuda_arg<value_type>(vec3), 00748 static_cast<unsigned int>(viennacl::traits::start(vec3)), 00749 static_cast<unsigned int>(viennacl::traits::stride(vec3)) ); 00750 } 00751 00752 00754 00755 template <typename T> 00756 __global__ void vector_assign_kernel(T * vec1, 00757 unsigned int start1, 00758 unsigned int inc1, 00759 unsigned int size1, 00760 unsigned int internal_size1, 00761 00762 T alpha) 00763 { 00764 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00765 i < size1; 00766 i += gridDim.x * blockDim.x) 00767 vec1[i*inc1+start1] = (i < size1) ? alpha : 0; 00768 } 00769 00776 template <typename T, typename S1> 00777 void vector_assign(vector_base<T> & vec1, const S1 & alpha, bool up_to_internal_size = false) 00778 { 00779 typedef T value_type; 00780 00781 value_type temporary_alpha = 0; 00782 if (viennacl::is_cpu_scalar<S1>::value) 00783 temporary_alpha = alpha; 00784 00785 unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1)); 00786 00787 vector_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00788 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00789 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00790 size, 00791 static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding. 00792 00793 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) ); 00794 VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel"); 00795 } 00796 00798 00799 template <typename T> 00800 __global__ void vector_swap_kernel(T * vec1, 00801 unsigned int start1, 00802 unsigned int inc1, 00803 unsigned int size1, 00804 00805 T * vec2, 00806 unsigned int start2, 00807 unsigned int inc2) 00808 { 00809 T tmp; 00810 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00811 i < size1; 00812 i += gridDim.x * blockDim.x) 00813 { 00814 tmp = vec2[i*inc2+start2]; 00815 vec2[i*inc2+start2] = vec1[i*inc1+start1]; 00816 vec1[i*inc1+start1] = tmp; 00817 } 00818 } 00819 00820 00826 template <typename T> 00827 void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2) 00828 { 00829 typedef T value_type; 00830 00831 vector_swap_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00832 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00833 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00834 static_cast<unsigned int>(viennacl::traits::size(vec1)), 00835 00836 detail::cuda_arg<value_type>(vec2), 00837 static_cast<unsigned int>(viennacl::traits::start(vec2)), 00838 static_cast<unsigned int>(viennacl::traits::stride(vec2)) ); 00839 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel"); 00840 } 00841 00843 00844 template <typename T> 00845 __global__ void element_op_kernel(T * vec1, 00846 unsigned int start1, 00847 unsigned int inc1, 00848 unsigned int size1, 00849 00850 T const * vec2, 00851 unsigned int start2, 00852 unsigned int inc2, 00853 00854 T const * vec3, 00855 unsigned int start3, 00856 unsigned int inc3, 00857 00858 unsigned int op_type 00859 ) 00860 { 00861 if (op_type == 2) 00862 { 00863 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00864 i < size1; 00865 i += gridDim.x * blockDim.x) 00866 { 00867 vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], 
vec3[i*inc3+start3]); 00868 } 00869 } 00870 else if (op_type == 1) 00871 { 00872 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00873 i < size1; 00874 i += gridDim.x * blockDim.x) 00875 { 00876 vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3]; 00877 } 00878 } 00879 else if (op_type == 0) 00880 { 00881 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00882 i < size1; 00883 i += gridDim.x * blockDim.x) 00884 { 00885 vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3]; 00886 } 00887 } 00888 } 00889 00890 template <typename T> 00891 __global__ void element_op_int_kernel(T * vec1, 00892 unsigned int start1, 00893 unsigned int inc1, 00894 unsigned int size1, 00895 00896 T const * vec2, 00897 unsigned int start2, 00898 unsigned int inc2, 00899 00900 T const * vec3, 00901 unsigned int start3, 00902 unsigned int inc3, 00903 00904 unsigned int op_type 00905 ) 00906 { 00907 if (op_type == 1) 00908 { 00909 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00910 i < size1; 00911 i += gridDim.x * blockDim.x) 00912 { 00913 vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3]; 00914 } 00915 } 00916 else if (op_type == 0) 00917 { 00918 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; 00919 i < size1; 00920 i += gridDim.x * blockDim.x) 00921 { 00922 vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3]; 00923 } 00924 } 00925 } 00926 00932 template <typename T, typename OP> 00933 void element_op(vector_base<T> & vec1, 00934 vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy) 00935 { 00936 typedef T value_type; 00937 00938 unsigned int op_type = 2; //0: product, 1: division, 2: power 00939 if (viennacl::is_division<OP>::value) 00940 op_type = 1; 00941 else if (viennacl::is_product<OP>::value) 00942 op_type = 0; 00943 00944 element_op_int_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00945 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00946 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00947 static_cast<unsigned int>(viennacl::traits::size(vec1)), 00948 00949 detail::cuda_arg<value_type>(proxy.lhs()), 00950 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 00951 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())), 00952 00953 detail::cuda_arg<value_type>(proxy.rhs()), 00954 static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), 00955 static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())), 00956 00957 op_type 00958 ); 00959 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel"); 00960 } 00961 00962 template <typename OP> 00963 void element_op(vector_base<float> & vec1, 00964 vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OP> > const & proxy) 00965 { 00966 typedef float value_type; 00967 00968 unsigned int op_type = 2; //0: product, 1: division, 2: power 00969 if (viennacl::is_division<OP>::value) 00970 op_type = 1; 00971 else if (viennacl::is_product<OP>::value) 00972 op_type = 0; 00973 00974 element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 00975 static_cast<unsigned int>(viennacl::traits::start(vec1)), 00976 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 00977 static_cast<unsigned int>(viennacl::traits::size(vec1)), 00978 00979 detail::cuda_arg<value_type>(proxy.lhs()), 00980 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 00981 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())), 00982 
00983 detail::cuda_arg<value_type>(proxy.rhs()), 00984 static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), 00985 static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())), 00986 00987 op_type 00988 ); 00989 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel"); 00990 } 00991 00992 template <typename OP> 00993 void element_op(vector_base<double> & vec1, 00994 vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OP> > const & proxy) 00995 { 00996 typedef double value_type; 00997 00998 unsigned int op_type = 2; //0: product, 1: division, 2: power 00999 if (viennacl::is_division<OP>::value) 01000 op_type = 1; 01001 else if (viennacl::is_product<OP>::value) 01002 op_type = 0; 01003 01004 element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01005 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01006 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01007 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01008 01009 detail::cuda_arg<value_type>(proxy.lhs()), 01010 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01011 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())), 01012 01013 detail::cuda_arg<value_type>(proxy.rhs()), 01014 static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())), 01015 static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())), 01016 01017 op_type 01018 ); 01019 VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel"); 01020 } 01021 01023 01024 // Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code 01025 // Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context? 
01026 01027 // acos 01028 template <typename T> __global__ void vec_element_acos_kernel( 01029 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01030 T const * vec2, unsigned int start2, unsigned int inc2) 01031 { 01032 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01033 vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]); 01034 } 01035 01036 template <typename T> 01037 void element_op(vector_base<T> & vec1, 01038 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_acos> > const & proxy) 01039 { 01040 typedef T value_type; 01041 01042 vec_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01043 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01044 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01045 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01046 detail::cuda_arg<value_type>(proxy.lhs()), 01047 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01048 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01049 ); 01050 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel"); 01051 } 01052 01053 // asin 01054 template <typename T> __global__ void vec_element_asin_kernel( 01055 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01056 T const * vec2, unsigned int start2, unsigned int inc2) 01057 { 01058 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01059 vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]); 01060 } 01061 01062 template <typename T> 01063 void element_op(vector_base<T> & vec1, 01064 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_asin> > const & proxy) 01065 { 01066 typedef T value_type; 01067 01068 vec_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01069 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01070 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01071 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01072 detail::cuda_arg<value_type>(proxy.lhs()), 01073 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01074 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01075 ); 01076 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel"); 01077 } 01078 01079 01080 // atan 01081 template <typename T> __global__ void vec_element_atan_kernel( 01082 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01083 T const * vec2, unsigned int start2, unsigned int inc2) 01084 { 01085 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01086 vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]); 01087 } 01088 01089 template <typename T> 01090 void element_op(vector_base<T> & vec1, 01091 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_atan> > const & proxy) 01092 { 01093 typedef T value_type; 01094 01095 vec_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01096 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01097 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01098 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01099 detail::cuda_arg<value_type>(proxy.lhs()), 01100 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01101 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01102 ); 01103 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel"); 01104 } 
01105 01106 01107 // ceil 01108 template <typename T> __global__ void vec_element_ceil_kernel( 01109 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01110 T const * vec2, unsigned int start2, unsigned int inc2) 01111 { 01112 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01113 vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]); 01114 } 01115 01116 template <typename T> 01117 void element_op(vector_base<T> & vec1, 01118 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_ceil> > const & proxy) 01119 { 01120 typedef T value_type; 01121 01122 vec_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01123 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01124 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01125 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01126 detail::cuda_arg<value_type>(proxy.lhs()), 01127 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01128 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01129 ); 01130 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel"); 01131 } 01132 01133 01134 // cos 01135 template <typename T> __global__ void vec_element_cos_kernel( 01136 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01137 T const * vec2, unsigned int start2, unsigned int inc2) 01138 { 01139 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01140 vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]); 01141 } 01142 01143 template <typename T> 01144 void element_op(vector_base<T> & vec1, 01145 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cos> > const & proxy) 01146 { 01147 typedef T value_type; 01148 01149 vec_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01150 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01151 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01152 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01153 detail::cuda_arg<value_type>(proxy.lhs()), 01154 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01155 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01156 ); 01157 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel"); 01158 } 01159 01160 01161 // cosh 01162 template <typename T> __global__ void vec_element_cosh_kernel( 01163 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01164 T const * vec2, unsigned int start2, unsigned int inc2) 01165 { 01166 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01167 vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]); 01168 } 01169 01170 template <typename T> 01171 void element_op(vector_base<T> & vec1, 01172 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cosh> > const & proxy) 01173 { 01174 typedef T value_type; 01175 01176 vec_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01177 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01178 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01179 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01180 detail::cuda_arg<value_type>(proxy.lhs()), 01181 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01182 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01183 ); 01184 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel"); 
01185 } 01186 01187 01188 // exp 01189 template <typename T> __global__ void vec_element_exp_kernel( 01190 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01191 T const * vec2, unsigned int start2, unsigned int inc2) 01192 { 01193 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01194 vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]); 01195 } 01196 01197 template <typename T> 01198 void element_op(vector_base<T> & vec1, 01199 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_exp> > const & proxy) 01200 { 01201 typedef T value_type; 01202 01203 vec_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01204 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01205 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01206 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01207 detail::cuda_arg<value_type>(proxy.lhs()), 01208 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01209 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01210 ); 01211 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel"); 01212 } 01213 01214 01215 // fabs 01216 template <typename T> __global__ void vec_element_fabs_kernel( 01217 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01218 T const * vec2, unsigned int start2, unsigned int inc2) 01219 { 01220 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01221 vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]); 01222 } 01223 01224 template <typename T> 01225 void element_op(vector_base<T> & vec1, 01226 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_fabs> > const & proxy) 01227 { 01228 typedef T value_type; 01229 01230 vec_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01231 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01232 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01233 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01234 detail::cuda_arg<value_type>(proxy.lhs()), 01235 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01236 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01237 ); 01238 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel"); 01239 } 01240 01241 // abs 01242 template <typename T> __global__ void vec_element_abs_kernel( 01243 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01244 T const * vec2, unsigned int start2, unsigned int inc2) 01245 { 01246 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01247 vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]); 01248 } 01249 01250 template <typename T> 01251 void element_op(vector_base<T> & vec1, 01252 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_abs> > const & proxy) 01253 { 01254 typedef T value_type; 01255 01256 vec_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01257 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01258 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01259 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01260 detail::cuda_arg<value_type>(proxy.lhs()), 01261 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01262 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01263 ); 01264 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel"); 01265 } 
01266 01267 01268 01269 // floor 01270 template <typename T> __global__ void vec_element_floor_kernel( 01271 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01272 T const * vec2, unsigned int start2, unsigned int inc2) 01273 { 01274 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01275 vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]); 01276 } 01277 01278 template <typename T> 01279 void element_op(vector_base<T> & vec1, 01280 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_floor> > const & proxy) 01281 { 01282 typedef T value_type; 01283 01284 vec_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01285 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01286 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01287 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01288 detail::cuda_arg<value_type>(proxy.lhs()), 01289 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01290 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01291 ); 01292 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel"); 01293 } 01294 01295 01296 // log 01297 template <typename T> __global__ void vec_element_log_kernel( 01298 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01299 T const * vec2, unsigned int start2, unsigned int inc2) 01300 { 01301 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01302 vec1[i*inc1+start1] = log(vec2[i*inc2+start2]); 01303 } 01304 01305 template <typename T> 01306 void element_op(vector_base<T> & vec1, 01307 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log> > const & proxy) 01308 { 01309 typedef T value_type; 01310 01311 vec_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01312 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01313 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01314 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01315 detail::cuda_arg<value_type>(proxy.lhs()), 01316 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01317 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01318 ); 01319 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel"); 01320 } 01321 01322 01323 // log10 01324 template <typename T> __global__ void vec_element_log10_kernel( 01325 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01326 T const * vec2, unsigned int start2, unsigned int inc2) 01327 { 01328 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01329 vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]); 01330 } 01331 01332 template <typename T> 01333 void element_op(vector_base<T> & vec1, 01334 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log10> > const & proxy) 01335 { 01336 typedef T value_type; 01337 01338 vec_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01339 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01340 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01341 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01342 detail::cuda_arg<value_type>(proxy.lhs()), 01343 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01344 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01345 ); 01346 
VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel"); 01347 } 01348 01349 01350 // sin 01351 template <typename T> __global__ void vec_element_sin_kernel( 01352 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01353 T const * vec2, unsigned int start2, unsigned int inc2) 01354 { 01355 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01356 vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]); 01357 } 01358 01359 template <typename T> 01360 void element_op(vector_base<T> & vec1, 01361 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sin> > const & proxy) 01362 { 01363 typedef T value_type; 01364 01365 vec_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01366 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01367 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01368 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01369 detail::cuda_arg<value_type>(proxy.lhs()), 01370 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01371 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01372 ); 01373 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel"); 01374 } 01375 01376 01377 // sinh 01378 template <typename T> __global__ void vec_element_sinh_kernel( 01379 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01380 T const * vec2, unsigned int start2, unsigned int inc2) 01381 { 01382 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01383 vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]); 01384 } 01385 01386 template <typename T> 01387 void element_op(vector_base<T> & vec1, 01388 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sinh> > const & proxy) 01389 { 01390 typedef T value_type; 01391 01392 vec_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01393 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01394 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01395 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01396 detail::cuda_arg<value_type>(proxy.lhs()), 01397 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01398 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01399 ); 01400 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel"); 01401 } 01402 01403 01404 // sqrt 01405 template <typename T> __global__ void vec_element_sqrt_kernel( 01406 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01407 T const * vec2, unsigned int start2, unsigned int inc2) 01408 { 01409 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01410 vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]); 01411 } 01412 01413 template <typename T> 01414 void element_op(vector_base<T> & vec1, 01415 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sqrt> > const & proxy) 01416 { 01417 typedef T value_type; 01418 01419 vec_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01420 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01421 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01422 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01423 detail::cuda_arg<value_type>(proxy.lhs()), 01424 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01425 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01426 ); 
01427 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel"); 01428 } 01429 01430 01431 // tan 01432 template <typename T> __global__ void vec_element_tan_kernel( 01433 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01434 T const * vec2, unsigned int start2, unsigned int inc2) 01435 { 01436 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01437 vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]); 01438 } 01439 01440 template <typename T> 01441 void element_op(vector_base<T> & vec1, 01442 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tan> > const & proxy) 01443 { 01444 typedef T value_type; 01445 01446 vec_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01447 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01448 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01449 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01450 detail::cuda_arg<value_type>(proxy.lhs()), 01451 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01452 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01453 ); 01454 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel"); 01455 } 01456 01457 01458 // tanh 01459 template <typename T> __global__ void vec_element_tanh_kernel( 01460 T * vec1, unsigned int start1, unsigned int inc1, unsigned int size1, 01461 T const * vec2, unsigned int start2, unsigned int inc2) 01462 { 01463 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x) 01464 vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]); 01465 } 01466 01467 template <typename T> 01468 void element_op(vector_base<T> & vec1, 01469 vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tanh> > const & proxy) 01470 { 01471 typedef T value_type; 01472 01473 vec_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01474 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01475 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01476 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01477 detail::cuda_arg<value_type>(proxy.lhs()), 01478 static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())), 01479 static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())) 01480 ); 01481 VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel"); 01482 } 01483 01484 01485 01487 01488 01489 template <typename T> 01490 __global__ void inner_prod_kernel(const T * vec1, 01491 unsigned int start1, 01492 unsigned int inc1, 01493 unsigned int size1, 01494 const T * vec2, 01495 unsigned int start2, 01496 unsigned int inc2, 01497 unsigned int size2, 01498 T * group_buffer) 01499 { 01500 __shared__ T tmp_buffer[128]; 01501 unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1; 01502 unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2; 01503 01504 unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x) 01505 - ( blockIdx.x * size1) / (gridDim.x); 01506 01507 01508 T tmp = 0; 01509 for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x) 01510 tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2]; 01511 tmp_buffer[threadIdx.x] = tmp; 01512 01513 // parallel reduction 01514 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 01515 { 01516 __syncthreads(); 01517 if (threadIdx.x < stride) 01518 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride]; 01519 } 
01520 01521 if (threadIdx.x == 0) 01522 group_buffer[blockIdx.x] = tmp_buffer[0]; 01523 01524 } 01525 01526 01527 01528 // sums the array 'vec1' and writes to result. Makes use of a single work-group only. 01529 template <typename T> 01530 __global__ void vector_sum_kernel_floats( 01531 const T * vec1, 01532 unsigned int start1, 01533 unsigned int inc1, 01534 unsigned int size1, 01535 unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum 01536 T * result) 01537 { 01538 __shared__ T tmp_buffer[128]; 01539 T thread_sum = 0; 01540 for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x) 01541 { 01542 if (option > 0) 01543 thread_sum += vec1[i*inc1+start1]; 01544 else 01545 thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1])); 01546 } 01547 01548 tmp_buffer[threadIdx.x] = thread_sum; 01549 01550 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 01551 { 01552 __syncthreads(); 01553 if (threadIdx.x < stride) 01554 { 01555 if (option > 0) 01556 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride]; 01557 else 01558 tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]); 01559 } 01560 } 01561 01562 if (threadIdx.x == 0) 01563 { 01564 if (option == 2) 01565 *result = sqrt(tmp_buffer[0]); 01566 else 01567 *result = tmp_buffer[0]; 01568 } 01569 } 01570 01571 template <typename T> 01572 __global__ void vector_sum_kernel_integers( 01573 const T * vec1, 01574 unsigned int start1, 01575 unsigned int inc1, 01576 unsigned int size1, 01577 unsigned int option, //0: use max, 1: just sum 01578 T * result) 01579 { 01580 __shared__ T tmp_buffer[128]; 01581 T thread_sum = 0; 01582 for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x) 01583 { 01584 if (option > 0) 01585 thread_sum += vec1[i*inc1+start1]; 01586 else 01587 thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]); 01588 } 01589 01590 tmp_buffer[threadIdx.x] = thread_sum; 01591 01592 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 01593 { 01594 __syncthreads(); 01595 if (threadIdx.x < stride) 01596 { 01597 if (option > 0) 01598 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride]; 01599 else 01600 tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride]; 01601 } 01602 } 01603 01604 if (threadIdx.x == 0) 01605 *result = tmp_buffer[0]; 01606 } 01607 01608 template <typename T> 01609 __global__ void vector_sum_kernel_unsigned_integers( 01610 const T * vec1, 01611 unsigned int start1, 01612 unsigned int inc1, 01613 unsigned int size1, 01614 unsigned int option, //0: use max, 1: just sum 01615 T * result) 01616 { 01617 __shared__ T tmp_buffer[128]; 01618 T thread_sum = 0; 01619 for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x) 01620 { 01621 if (option > 0) 01622 thread_sum += vec1[i*inc1+start1]; 01623 else 01624 thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1]; 01625 } 01626 01627 tmp_buffer[threadIdx.x] = thread_sum; 01628 01629 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 01630 { 01631 __syncthreads(); 01632 if (threadIdx.x < stride) 01633 { 01634 if (option > 0) 01635 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride]; 01636 else 01637 tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? 
tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride]; 01638 } 01639 } 01640 01641 if (threadIdx.x == 0) 01642 *result = tmp_buffer[0]; 01643 } 01644 01645 namespace detail 01646 { 01648 struct vector_sum_kernel_launcher_integers 01649 { 01650 template <typename T, typename S3> 01651 static void apply(vector_base<T> const & temp, 01652 unsigned int option, 01653 S3 & result) 01654 { 01655 typedef T value_type; 01656 vector_sum_kernel_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp), 01657 static_cast<unsigned int>(viennacl::traits::start(temp)), 01658 static_cast<unsigned int>(viennacl::traits::stride(temp)), 01659 static_cast<unsigned int>(viennacl::traits::size(temp)), 01660 static_cast<unsigned int>(option), 01661 detail::cuda_arg<value_type>(result) ); 01662 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel"); 01663 } 01664 }; 01665 01666 struct vector_sum_kernel_launcher_unsigned_integers 01667 { 01668 template <typename T, typename S3> 01669 static void apply(vector_base<T> const & temp, 01670 unsigned int option, 01671 S3 & result) 01672 { 01673 typedef T value_type; 01674 vector_sum_kernel_unsigned_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp), 01675 static_cast<unsigned int>(viennacl::traits::start(temp)), 01676 static_cast<unsigned int>(viennacl::traits::stride(temp)), 01677 static_cast<unsigned int>(viennacl::traits::size(temp)), 01678 static_cast<unsigned int>(option), 01679 detail::cuda_arg<value_type>(result) ); 01680 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel"); 01681 } 01682 }; 01683 01684 struct vector_sum_kernel_launcher_floats 01685 { 01686 template <typename T, typename S3> 01687 static void apply(vector_base<T> const & temp, 01688 unsigned int option, 01689 S3 & result) 01690 { 01691 typedef T value_type; 01692 vector_sum_kernel_floats<<<1, 128>>>(detail::cuda_arg<value_type>(temp), 01693 static_cast<unsigned int>(viennacl::traits::start(temp)), 01694 static_cast<unsigned int>(viennacl::traits::stride(temp)), 01695 static_cast<unsigned int>(viennacl::traits::size(temp)), 01696 static_cast<unsigned int>(option), 01697 detail::cuda_arg<value_type>(result) ); 01698 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel"); 01699 } 01700 }; 01701 01702 template <typename T> 01703 struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {}; 01704 01705 template <> 01706 struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {}; 01707 01708 template <> 01709 struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {}; 01710 01711 template <> 01712 struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {}; 01713 01714 template <> 01715 struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {}; 01716 01717 template <> 01718 struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {}; 01719 01720 template <> 01721 struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {}; 01722 01724 } 01725 01726 01727 //implementation of inner product: 01728 //namespace { 01735 template <typename T, typename S3> 01736 void inner_prod_impl(vector_base<T> const & vec1, 01737 vector_base<T> const & vec2, 01738 S3 & result) 01739 { 01740 typedef T value_type; 01741 01742 static const unsigned int work_groups = 128; 01743 static viennacl::vector<value_type> temp(work_groups); 01744 01745 
inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01746 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01747 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01748 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01749 detail::cuda_arg<value_type>(vec2), 01750 static_cast<unsigned int>(viennacl::traits::start(vec2)), 01751 static_cast<unsigned int>(viennacl::traits::stride(vec2)), 01752 static_cast<unsigned int>(viennacl::traits::size(vec2)), 01753 detail::cuda_arg<value_type>(temp) 01754 ); 01755 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel"); 01756 01757 detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result); 01758 } 01759 01760 01767 template <typename T> 01768 void inner_prod_cpu(vector_base<T> const & vec1, 01769 vector_base<T> const & vec2, 01770 T & result) 01771 { 01772 typedef T value_type; 01773 01774 const unsigned int work_groups = 128; 01775 viennacl::vector<value_type> temp(work_groups); 01776 01777 inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 01778 static_cast<unsigned int>(viennacl::traits::start(vec1)), 01779 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 01780 static_cast<unsigned int>(viennacl::traits::size(vec1)), 01781 detail::cuda_arg<value_type>(vec2), 01782 static_cast<unsigned int>(viennacl::traits::start(vec2)), 01783 static_cast<unsigned int>(viennacl::traits::stride(vec2)), 01784 static_cast<unsigned int>(viennacl::traits::size(vec2)), 01785 detail::cuda_arg<value_type>(temp) 01786 ); 01787 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel"); 01788 01789 // Now copy partial results from GPU back to CPU and run reduction there: 01790 std::vector<value_type> temp_cpu(work_groups); 01791 viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin()); 01792 01793 result = 0; 01794 for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it) 01795 result += *it; 01796 } 01797 01799 01800 #define VIENNACL_MDOT_WORKGROUP_SIZE 128 01801 #define VIENNACL_MDOT_WORKGROUP_NUM 128 01802 // M = 2: 01803 template <typename NumericT> 01804 __global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, 01805 const NumericT *y0, unsigned int start0, unsigned int stride0, 01806 const NumericT *y1, unsigned int start1, unsigned int stride1, 01807 NumericT *group_results) 01808 { 01809 __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE]; 01810 unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1; 01811 unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread; 01812 unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x 01813 01814 NumericT entry_x = 0; 01815 NumericT group_sum0 = 0; 01816 NumericT group_sum1 = 0; 01817 for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) { 01818 entry_x = x[i * stridex + startx]; // load only once from global memory! 
01819 group_sum0 += entry_x * y0[i * stride0 + start0]; 01820 group_sum1 += entry_x * y1[i * stride1 + start1]; 01821 } 01822 tmp_buffer[threadIdx.x] = group_sum0; 01823 tmp_buffer[threadIdx.x + blockDim.x] = group_sum1; 01824 01825 // parallel reduction 01826 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) { 01827 __syncthreads(); 01828 if (threadIdx.x < stride) { 01829 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ]; 01830 tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x]; 01831 } 01832 } 01833 01834 // write result of group to group_results 01835 if (threadIdx.x == 0) { 01836 group_results[blockIdx.x] = tmp_buffer[0]; 01837 group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x]; 01838 } 01839 } 01840 01841 // M = 3: 01842 template <typename NumericT> 01843 __global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, 01844 const NumericT *y0, unsigned int start0, unsigned int stride0, 01845 const NumericT *y1, unsigned int start1, unsigned int stride1, 01846 const NumericT *y2, unsigned int start2, unsigned int stride2, 01847 NumericT *group_results) 01848 { 01849 __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE]; 01850 unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1; 01851 unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread; 01852 unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size 01853 01854 NumericT entry_x = 0; 01855 NumericT group_sum0 = 0; 01856 NumericT group_sum1 = 0; 01857 NumericT group_sum2 = 0; 01858 for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) { 01859 entry_x = x[i * stridex + startx]; // load only once from global memory! 
01860 group_sum0 += entry_x * y0[i * stride0 + start0]; 01861 group_sum1 += entry_x * y1[i * stride1 + start1]; 01862 group_sum2 += entry_x * y2[i * stride2 + start2]; 01863 } 01864 tmp_buffer[threadIdx.x] = group_sum0; 01865 tmp_buffer[threadIdx.x + blockDim.x] = group_sum1; 01866 tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2; 01867 01868 // parallel reduction 01869 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) { 01870 __syncthreads(); 01871 if (threadIdx.x < stride) { 01872 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ]; 01873 tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x]; 01874 tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x]; 01875 } 01876 } 01877 01878 // write result of group to group_results 01879 if (threadIdx.x == 0) { 01880 group_results[blockIdx.x ] = tmp_buffer[0]; 01881 group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x]; 01882 group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x]; 01883 } 01884 } 01885 01886 // M = 4: 01887 template <typename NumericT> 01888 __global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, 01889 const NumericT *y0, unsigned int start0, unsigned int stride0, 01890 const NumericT *y1, unsigned int start1, unsigned int stride1, 01891 const NumericT *y2, unsigned int start2, unsigned int stride2, 01892 const NumericT *y3, unsigned int start3, unsigned int stride3, 01893 NumericT *group_results) 01894 { 01895 __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE]; 01896 unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1; 01897 unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread; 01898 unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size 01899 01900 NumericT entry_x = 0; 01901 NumericT group_sum0 = 0; 01902 NumericT group_sum1 = 0; 01903 NumericT group_sum2 = 0; 01904 NumericT group_sum3 = 0; 01905 for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) { 01906 entry_x = x[i * stridex + startx]; // load only once from global memory! 
01907 group_sum0 += entry_x * y0[i * stride0 + start0]; 01908 group_sum1 += entry_x * y1[i * stride1 + start1]; 01909 group_sum2 += entry_x * y2[i * stride2 + start2]; 01910 group_sum3 += entry_x * y3[i * stride3 + start3]; 01911 } 01912 tmp_buffer[threadIdx.x] = group_sum0; 01913 tmp_buffer[threadIdx.x + blockDim.x] = group_sum1; 01914 tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2; 01915 tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3; 01916 01917 // parallel reduction 01918 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) { 01919 __syncthreads(); 01920 if (threadIdx.x < stride) { 01921 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ]; 01922 tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x]; 01923 tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x]; 01924 tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x]; 01925 } 01926 } 01927 01928 // write result of group to group_results 01929 if (threadIdx.x == 0) { 01930 group_results[blockIdx.x ] = tmp_buffer[0]; 01931 group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x]; 01932 group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x]; 01933 group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x]; 01934 } 01935 } 01936 01937 // M = 8: 01938 template <typename NumericT> 01939 __global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, 01940 const NumericT *y0, unsigned int start0, unsigned int stride0, 01941 const NumericT *y1, unsigned int start1, unsigned int stride1, 01942 const NumericT *y2, unsigned int start2, unsigned int stride2, 01943 const NumericT *y3, unsigned int start3, unsigned int stride3, 01944 const NumericT *y4, unsigned int start4, unsigned int stride4, 01945 const NumericT *y5, unsigned int start5, unsigned int stride5, 01946 const NumericT *y6, unsigned int start6, unsigned int stride6, 01947 const NumericT *y7, unsigned int start7, unsigned int stride7, 01948 NumericT *group_results) 01949 { 01950 __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE]; 01951 unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1; 01952 unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread; 01953 unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size 01954 01955 NumericT entry_x = 0; 01956 NumericT group_sum0 = 0; 01957 NumericT group_sum1 = 0; 01958 NumericT group_sum2 = 0; 01959 NumericT group_sum3 = 0; 01960 NumericT group_sum4 = 0; 01961 NumericT group_sum5 = 0; 01962 NumericT group_sum6 = 0; 01963 NumericT group_sum7 = 0; 01964 for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) { 01965 entry_x = x[i * stridex + startx]; // load only once from global memory! 
01966 group_sum0 += entry_x * y0[i * stride0 + start0]; 01967 group_sum1 += entry_x * y1[i * stride1 + start1]; 01968 group_sum2 += entry_x * y2[i * stride2 + start2]; 01969 group_sum3 += entry_x * y3[i * stride3 + start3]; 01970 group_sum4 += entry_x * y4[i * stride4 + start4]; 01971 group_sum5 += entry_x * y5[i * stride5 + start5]; 01972 group_sum6 += entry_x * y6[i * stride6 + start6]; 01973 group_sum7 += entry_x * y7[i * stride7 + start7]; 01974 } 01975 tmp_buffer[threadIdx.x] = group_sum0; 01976 tmp_buffer[threadIdx.x + blockDim.x] = group_sum1; 01977 tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2; 01978 tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3; 01979 tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4; 01980 tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5; 01981 tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6; 01982 tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7; 01983 01984 // parallel reduction 01985 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) { 01986 __syncthreads(); 01987 if (threadIdx.x < stride) { 01988 tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ]; 01989 tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x]; 01990 tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x]; 01991 tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x]; 01992 tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x]; 01993 tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x]; 01994 tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x]; 01995 tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x]; 01996 } 01997 } 01998 01999 // write result of group to group_results 02000 if (threadIdx.x == 0) { 02001 group_results[blockIdx.x ] = tmp_buffer[0]; 02002 group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x]; 02003 group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x]; 02004 group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x]; 02005 group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x]; 02006 group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x]; 02007 group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x]; 02008 group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x]; 02009 } 02010 } 02011 02012 // sums the array 'vec1' and writes to result. Makes use of a single work-group only. 
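// More precisely, it is launched below with one block per inner product: block k reduces the
// k-th chunk of VIENNACL_MDOT_WORKGROUP_SIZE partial results in 'vec1' and writes the total to
// result[start_result + k * inc_result].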
02013 template <typename T> 02014 __global__ void vector_multi_sum_kernel( 02015 T const * vec1, 02016 T * result, 02017 unsigned int start_result, 02018 unsigned int inc_result) 02019 { 02020 __shared__ T tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE]; 02021 02022 tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE]; 02023 02024 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02025 { 02026 __syncthreads(); 02027 if (threadIdx.x < stride) 02028 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride]; 02029 } 02030 02031 if (threadIdx.x == 0) 02032 result[start_result + inc_result * blockIdx.x] = tmp_buffer[0]; 02033 } 02034 02035 template <typename T> 02036 void inner_prod_impl(vector_base<T> const & x, 02037 vector_tuple<T> const & vec_tuple, 02038 vector_base<T> & result) 02039 { 02040 typedef T value_type; 02041 02042 static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM); 02043 02044 vcl_size_t current_index = 0; 02045 while (vec_tuple.const_size() > current_index) 02046 { 02047 switch (vec_tuple.const_size() - current_index) 02048 { 02049 case 7: 02050 case 6: 02051 case 5: 02052 case 4: 02053 { 02054 vector_base<T> const & y0 = vec_tuple.const_at(current_index); 02055 vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1); 02056 vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2); 02057 vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3); 02058 02059 inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, 02060 VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x), 02061 static_cast<unsigned int>(viennacl::traits::start(x)), 02062 static_cast<unsigned int>(viennacl::traits::stride(x)), 02063 static_cast<unsigned int>(viennacl::traits::size(x)), 02064 detail::cuda_arg<value_type>(y0), 02065 static_cast<unsigned int>(viennacl::traits::start(y0)), 02066 static_cast<unsigned int>(viennacl::traits::stride(y0)), 02067 detail::cuda_arg<value_type>(y1), 02068 static_cast<unsigned int>(viennacl::traits::start(y1)), 02069 static_cast<unsigned int>(viennacl::traits::stride(y1)), 02070 detail::cuda_arg<value_type>(y2), 02071 static_cast<unsigned int>(viennacl::traits::start(y2)), 02072 static_cast<unsigned int>(viennacl::traits::stride(y2)), 02073 detail::cuda_arg<value_type>(y3), 02074 static_cast<unsigned int>(viennacl::traits::start(y3)), 02075 static_cast<unsigned int>(viennacl::traits::stride(y3)), 02076 detail::cuda_arg<value_type>(temp) 02077 ); 02078 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel"); 02079 vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp), 02080 detail::cuda_arg<value_type>(result), 02081 static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index), 02082 static_cast<unsigned int>(viennacl::traits::stride(result)) 02083 ); 02084 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel"); 02085 } 02086 current_index += 4; 02087 break; 02088 case 3: 02089 { 02090 vector_base<T> const & y0 = vec_tuple.const_at(current_index); 02091 vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1); 02092 vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2); 02093 02094 inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, 02095 VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x), 02096 static_cast<unsigned int>(viennacl::traits::start(x)), 02097 static_cast<unsigned int>(viennacl::traits::stride(x)), 02098 static_cast<unsigned 
int>(viennacl::traits::size(x)), 02099 detail::cuda_arg<value_type>(y0), 02100 static_cast<unsigned int>(viennacl::traits::start(y0)), 02101 static_cast<unsigned int>(viennacl::traits::stride(y0)), 02102 detail::cuda_arg<value_type>(y1), 02103 static_cast<unsigned int>(viennacl::traits::start(y1)), 02104 static_cast<unsigned int>(viennacl::traits::stride(y1)), 02105 detail::cuda_arg<value_type>(y2), 02106 static_cast<unsigned int>(viennacl::traits::start(y2)), 02107 static_cast<unsigned int>(viennacl::traits::stride(y2)), 02108 detail::cuda_arg<value_type>(temp) 02109 ); 02110 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel"); 02111 vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp), 02112 detail::cuda_arg<value_type>(result), 02113 static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index), 02114 static_cast<unsigned int>(viennacl::traits::stride(result)) 02115 ); 02116 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel"); 02117 } 02118 current_index += 3; 02119 break; 02120 case 2: 02121 { 02122 vector_base<T> const & y0 = vec_tuple.const_at(current_index); 02123 vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1); 02124 02125 inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, 02126 VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x), 02127 static_cast<unsigned int>(viennacl::traits::start(x)), 02128 static_cast<unsigned int>(viennacl::traits::stride(x)), 02129 static_cast<unsigned int>(viennacl::traits::size(x)), 02130 detail::cuda_arg<value_type>(y0), 02131 static_cast<unsigned int>(viennacl::traits::start(y0)), 02132 static_cast<unsigned int>(viennacl::traits::stride(y0)), 02133 detail::cuda_arg<value_type>(y1), 02134 static_cast<unsigned int>(viennacl::traits::start(y1)), 02135 static_cast<unsigned int>(viennacl::traits::stride(y1)), 02136 detail::cuda_arg<value_type>(temp) 02137 ); 02138 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel"); 02139 vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp), 02140 detail::cuda_arg<value_type>(result), 02141 static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index), 02142 static_cast<unsigned int>(viennacl::traits::stride(result)) 02143 ); 02144 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel"); 02145 } 02146 current_index += 2; 02147 break; 02148 case 1: 02149 { 02150 vector_base<T> const & y0 = vec_tuple.const_at(current_index); 02151 inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(x), 02152 static_cast<unsigned int>(viennacl::traits::start(x)), 02153 static_cast<unsigned int>(viennacl::traits::stride(x)), 02154 static_cast<unsigned int>(viennacl::traits::size(x)), 02155 detail::cuda_arg<value_type>(y0), 02156 static_cast<unsigned int>(viennacl::traits::start(y0)), 02157 static_cast<unsigned int>(viennacl::traits::stride(y0)), 02158 static_cast<unsigned int>(viennacl::traits::size(y0)), 02159 detail::cuda_arg<value_type>(temp) 02160 ); 02161 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel"); 02162 02163 vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp), 02164 detail::cuda_arg<value_type>(result), 02165 static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index), 02166 static_cast<unsigned int>(viennacl::traits::stride(result)) 02167 ); 02168 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel"); 02169 } 02170 current_index 
+= 1; 02171 break; 02172 02173 default: 02174 { 02175 vector_base<T> const & y0 = vec_tuple.const_at(current_index); 02176 vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1); 02177 vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2); 02178 vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3); 02179 vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4); 02180 vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5); 02181 vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6); 02182 vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7); 02183 02184 inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM, 02185 VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x), 02186 static_cast<unsigned int>(viennacl::traits::start(x)), 02187 static_cast<unsigned int>(viennacl::traits::stride(x)), 02188 static_cast<unsigned int>(viennacl::traits::size(x)), 02189 detail::cuda_arg<value_type>(y0), 02190 static_cast<unsigned int>(viennacl::traits::start(y0)), 02191 static_cast<unsigned int>(viennacl::traits::stride(y0)), 02192 detail::cuda_arg<value_type>(y1), 02193 static_cast<unsigned int>(viennacl::traits::start(y1)), 02194 static_cast<unsigned int>(viennacl::traits::stride(y1)), 02195 detail::cuda_arg<value_type>(y2), 02196 static_cast<unsigned int>(viennacl::traits::start(y2)), 02197 static_cast<unsigned int>(viennacl::traits::stride(y2)), 02198 detail::cuda_arg<value_type>(y3), 02199 static_cast<unsigned int>(viennacl::traits::start(y3)), 02200 static_cast<unsigned int>(viennacl::traits::stride(y3)), 02201 detail::cuda_arg<value_type>(y4), 02202 static_cast<unsigned int>(viennacl::traits::start(y4)), 02203 static_cast<unsigned int>(viennacl::traits::stride(y4)), 02204 detail::cuda_arg<value_type>(y5), 02205 static_cast<unsigned int>(viennacl::traits::start(y5)), 02206 static_cast<unsigned int>(viennacl::traits::stride(y5)), 02207 detail::cuda_arg<value_type>(y6), 02208 static_cast<unsigned int>(viennacl::traits::start(y6)), 02209 static_cast<unsigned int>(viennacl::traits::stride(y6)), 02210 detail::cuda_arg<value_type>(y7), 02211 static_cast<unsigned int>(viennacl::traits::start(y7)), 02212 static_cast<unsigned int>(viennacl::traits::stride(y7)), 02213 detail::cuda_arg<value_type>(temp) 02214 ); 02215 VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel"); 02216 vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp), 02217 detail::cuda_arg<value_type>(result), 02218 static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index), 02219 static_cast<unsigned int>(viennacl::traits::stride(result)) 02220 ); 02221 VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel"); 02222 } 02223 current_index += 8; 02224 break; 02225 } 02226 } 02227 } 02228 02229 #undef VIENNACL_MDOT_WORKGROUP_NUM 02230 #undef VIENNACL_MDOT_WORKGROUP_SIZE 02231 02233 02234 template <typename T> 02235 __global__ void norm_kernel_floats( 02236 const T * vec, 02237 unsigned int start1, 02238 unsigned int inc1, 02239 unsigned int size1, 02240 unsigned int norm_selector, 02241 T * group_buffer) 02242 { 02243 __shared__ T tmp_buffer[128]; 02244 02245 T tmp = 0; 02246 unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1; 02247 unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x; 02248 unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x; 02249 group_stop = (group_stop > size1) ? 
size1 : group_stop; 02250 02251 if (norm_selector == 1) //norm_1 02252 { 02253 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02254 tmp += fabs(vec[i*inc1 + start1]); 02255 } 02256 else if (norm_selector == 2) //norm_2 02257 { 02258 T vec_entry = 0; 02259 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02260 { 02261 vec_entry = vec[i*inc1 + start1]; 02262 tmp += vec_entry * vec_entry; 02263 } 02264 } 02265 else if (norm_selector == 0) //norm_inf 02266 { 02267 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02268 tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); 02269 } 02270 02271 tmp_buffer[threadIdx.x] = tmp; 02272 02273 if (norm_selector > 0) //parallel reduction for norm_1 or norm_2: 02274 { 02275 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02276 { 02277 __syncthreads(); 02278 if (threadIdx.x < stride) 02279 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride]; 02280 } 02281 } 02282 else 02283 { 02284 //norm_inf: 02285 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02286 { 02287 __syncthreads(); 02288 if (threadIdx.x < stride) 02289 tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]); 02290 } 02291 } 02292 02293 if (threadIdx.x == 0) 02294 group_buffer[blockIdx.x] = tmp_buffer[0]; 02295 } 02296 02297 template <typename T> 02298 __global__ void norm_kernel_integers( 02299 const T * vec, 02300 unsigned int start1, 02301 unsigned int inc1, 02302 unsigned int size1, 02303 unsigned int norm_selector, 02304 T * group_buffer) 02305 { 02306 __shared__ T tmp_buffer[128]; 02307 02308 T tmp = 0; 02309 unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1; 02310 unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x; 02311 unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x; 02312 group_stop = (group_stop > size1) ? size1 : group_stop; 02313 02314 if (norm_selector == 1) //norm_1 02315 { 02316 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02317 tmp += abs(vec[i*inc1 + start1]); 02318 } 02319 else if (norm_selector == 0) //norm_inf 02320 { 02321 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02322 tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]); 02323 } 02324 02325 tmp_buffer[threadIdx.x] = tmp; 02326 02327 if (norm_selector > 0) //parallel reduction for norm_1 or norm_2: 02328 { 02329 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02330 { 02331 __syncthreads(); 02332 if (threadIdx.x < stride) 02333 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride]; 02334 } 02335 } 02336 else 02337 { 02338 //norm_inf: 02339 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02340 { 02341 __syncthreads(); 02342 if (threadIdx.x < stride) 02343 tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? 
tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride]; 02344 } 02345 } 02346 02347 if (threadIdx.x == 0) 02348 group_buffer[blockIdx.x] = tmp_buffer[0]; 02349 } 02350 02351 template <typename T> 02352 __global__ void norm_kernel_unsigned_integers( 02353 const T * vec, 02354 unsigned int start1, 02355 unsigned int inc1, 02356 unsigned int size1, 02357 unsigned int norm_selector, 02358 T * group_buffer) 02359 { 02360 __shared__ T tmp_buffer[128]; 02361 02362 T tmp = 0; 02363 unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1; 02364 unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x; 02365 unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x; 02366 group_stop = (group_stop > size1) ? size1 : group_stop; 02367 02368 if (norm_selector == 1) //norm_1 02369 { 02370 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02371 tmp += vec[i*inc1 + start1]; 02372 } 02373 else if (norm_selector == 0) //norm_inf 02374 { 02375 for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x) 02376 tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1]; 02377 } 02378 02379 tmp_buffer[threadIdx.x] = tmp; 02380 02381 if (norm_selector > 0) //parallel reduction for norm_1 or norm_2: 02382 { 02383 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02384 { 02385 __syncthreads(); 02386 if (threadIdx.x < stride) 02387 tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride]; 02388 } 02389 } 02390 else 02391 { 02392 //norm_inf: 02393 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02394 { 02395 __syncthreads(); 02396 if (threadIdx.x < stride) 02397 tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? 
tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride]; 02398 } 02399 } 02400 02401 if (threadIdx.x == 0) 02402 group_buffer[blockIdx.x] = tmp_buffer[0]; 02403 } 02404 02406 namespace detail 02407 { 02408 struct norm_kernel_launcher_integers 02409 { 02410 template <typename T> 02411 static void apply(vector_base<T> const & vec1, 02412 vector_base<T> & temp, 02413 unsigned int option) 02414 { 02415 typedef T value_type; 02416 norm_kernel_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 02417 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02418 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02419 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02420 static_cast<unsigned int>(option), 02421 detail::cuda_arg<value_type>(temp) 02422 ); 02423 VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel"); 02424 } 02425 }; 02426 02427 struct norm_kernel_launcher_unsigned_integers 02428 { 02429 template <typename T> 02430 static void apply(vector_base<T> const & vec1, 02431 vector_base<T> & temp, 02432 unsigned int option) 02433 { 02434 typedef T value_type; 02435 norm_kernel_unsigned_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 02436 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02437 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02438 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02439 static_cast<unsigned int>(option), 02440 detail::cuda_arg<value_type>(temp) 02441 ); 02442 VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel"); 02443 } 02444 }; 02445 02446 02447 struct norm_kernel_launcher_floats 02448 { 02449 template <typename T> 02450 static void apply(vector_base<T> const & vec1, 02451 vector_base<T> & temp, 02452 unsigned int option) 02453 { 02454 typedef T value_type; 02455 norm_kernel_floats<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 02456 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02457 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02458 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02459 static_cast<unsigned int>(option), 02460 detail::cuda_arg<value_type>(temp) 02461 ); 02462 VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel"); 02463 } 02464 }; 02465 02466 template <typename T> 02467 struct norm_kernel_launcher : public norm_kernel_launcher_integers {}; 02468 02469 template <> 02470 struct norm_kernel_launcher<unsigned char> : public norm_kernel_launcher_unsigned_integers {}; 02471 02472 template <> 02473 struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {}; 02474 02475 template <> 02476 struct norm_kernel_launcher<unsigned int> : public norm_kernel_launcher_unsigned_integers {}; 02477 02478 template <> 02479 struct norm_kernel_launcher<unsigned long> : public norm_kernel_launcher_unsigned_integers {}; 02480 02481 template <> 02482 struct norm_kernel_launcher<float> : public norm_kernel_launcher_floats {}; 02483 02484 template <> 02485 struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {}; 02486 02487 } 02496 template <typename T> 02497 void norm_1_impl(vector_base<T> const & vec1, 02498 scalar<T> & result) 02499 { 02500 typedef T value_type; 02501 02502 vcl_size_t work_groups = 128; 02503 viennacl::vector<value_type> temp(work_groups); 02504 02505 detail::norm_kernel_launcher<T>::apply(vec1, temp, 1); 02506 detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result); 02507 } 02508 02514 template <typename T> 02515 void norm_1_cpu(vector_base<T> const & vec1, 02516 T & result) 02517 { 02518 typedef T value_type; 
02519 02520 vcl_size_t work_groups = 128; 02521 viennacl::vector<value_type> temp(work_groups); 02522 02523 detail::norm_kernel_launcher<T>::apply(vec1, temp, 1); 02524 02525 // Now copy partial results from GPU back to CPU and run reduction there: 02526 std::vector<value_type> temp_cpu(work_groups); 02527 viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin()); 02528 02529 result = 0; 02530 for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it) 02531 result += *it; 02532 } 02533 02535 02541 template <typename T> 02542 void norm_2_impl(vector_base<T> const & vec1, 02543 scalar<T> & result) 02544 { 02545 typedef T value_type; 02546 02547 vcl_size_t work_groups = 128; 02548 viennacl::vector<value_type> temp(work_groups); 02549 02550 detail::norm_kernel_launcher<T>::apply(vec1, temp, 2); 02551 02552 detail::vector_sum_kernel_launcher<T>::apply(temp, 2, result); 02553 } 02554 02560 template <typename T> 02561 void norm_2_cpu(vector_base<T> const & vec1, 02562 T & result) 02563 { 02564 typedef T value_type; 02565 02566 vcl_size_t work_groups = 128; 02567 viennacl::vector<value_type> temp(work_groups); 02568 02569 detail::norm_kernel_launcher<T>::apply(vec1, temp, 2); 02570 02571 std::vector<value_type> temp_cpu(work_groups); 02572 viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin()); 02573 02574 result = 0; 02575 for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it) 02576 result += *it; 02577 result = std::sqrt(result); 02578 } 02579 02580 02582 02588 template <typename T> 02589 void norm_inf_impl(vector_base<T> const & vec1, 02590 scalar<T> & result) 02591 { 02592 typedef T value_type; 02593 02594 vcl_size_t work_groups = 128; 02595 viennacl::vector<value_type> temp(work_groups); 02596 02597 detail::norm_kernel_launcher<T>::apply(vec1, temp, 0); 02598 detail::vector_sum_kernel_launcher<T>::apply(temp, 0, result); 02599 } 02600 02601 02602 02608 template <typename T> 02609 void norm_inf_cpu(vector_base<T> const & vec1, 02610 T & result) 02611 { 02612 typedef T value_type; 02613 02614 vcl_size_t work_groups = 128; 02615 viennacl::vector<value_type> temp(work_groups); 02616 02617 detail::norm_kernel_launcher<T>::apply(vec1, temp, 0); 02618 02619 std::vector<value_type> temp_cpu(work_groups); 02620 viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin()); 02621 02622 result = 0; 02623 for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it) 02624 result = std::max(result, *it); 02625 } 02626 02627 02629 02630 02631 02632 //index_norm_inf: 02633 02634 // fixes the problem of not having (f)abs available in a consistent manner 02635 template <typename T> 02636 __device__ T cuda_abs(T val) { return (val < 0) ? 
-val : val; } 02637 __device__ inline unsigned long cuda_abs(unsigned long val) { return val; } 02638 __device__ inline unsigned int cuda_abs(unsigned int val) { return val; } 02639 __device__ inline unsigned short cuda_abs(unsigned short val) { return val; } 02640 __device__ inline unsigned char cuda_abs(unsigned char val) { return val; } 02641 02642 template <typename T> 02643 __global__ void index_norm_inf_kernel(const T * vec, 02644 unsigned int start1, 02645 unsigned int inc1, 02646 unsigned int size1, 02647 unsigned int * result) 02648 { 02649 __shared__ T float_buffer[128]; 02650 __shared__ unsigned int index_buffer[128]; 02651 02652 float_buffer[threadIdx.x] = 0; 02653 index_buffer[threadIdx.x] = 0; 02654 02655 //step 1: fill buffer: 02656 T cur_max = (T)0; 02657 T tmp; 02658 for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x) 02659 { 02660 tmp = vec[i*inc1+start1]; 02661 tmp = cuda_abs(tmp); 02662 if (cur_max < tmp) 02663 { 02664 float_buffer[threadIdx.x] = tmp; 02665 index_buffer[threadIdx.x] = i; 02666 cur_max = tmp; 02667 } 02668 } 02669 02670 //step 2: parallel reduction: 02671 for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) 02672 { 02673 __syncthreads(); 02674 if (threadIdx.x < stride) 02675 { 02676 //find the first occurring index 02677 if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride]) 02678 { 02679 index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride]; 02680 float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride]; 02681 } 02682 } 02683 } 02684 02685 if (threadIdx.x == 0) 02686 *result = index_buffer[0]; 02687 } 02688 02689 //This function should return a CPU scalar, otherwise statements like 02690 // vcl_rhs[index_norm_inf(vcl_rhs)] 02691 // are ambiguous 02697 template <typename T> 02698 vcl_size_t index_norm_inf(vector_base<T> const & vec1) 02699 { 02700 typedef T value_type; 02701 02702 viennacl::backend::mem_handle h; 02703 viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1)); 02704 02705 index_norm_inf_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(vec1), 02706 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02707 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02708 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02709 //detail::cuda_arg<unsigned int>(h.cuda_handle()) 02710 reinterpret_cast<unsigned int *>(h.cuda_handle().get()) 02711 ); 02712 VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel"); 02713 02714 unsigned int ret = 0; 02715 viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret); 02716 return static_cast<vcl_size_t>(ret); 02717 } 02718 02720 02721 template <typename T> 02722 __global__ void plane_rotation_kernel( 02723 T * vec1, 02724 unsigned int start1, 02725 unsigned int inc1, 02726 unsigned int size1, 02727 T * vec2, 02728 unsigned int start2, 02729 unsigned int inc2, 02730 unsigned int size2, 02731 T alpha, 02732 T beta) 02733 { 02734 T tmp1 = 0; 02735 T tmp2 = 0; 02736 02737 for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x) 02738 { 02739 tmp1 = vec1[i*inc1+start1]; 02740 tmp2 = vec2[i*inc2+start2]; 02741 02742 vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; 02743 vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; 02744 } 02745 02746 } 02747 02757 template <typename T> 02758 void plane_rotation(vector_base<T> & vec1, 02759 vector_base<T> & vec2, 02760 T alpha, T beta) 02761 { 02762 typedef T value_type; 02763 02764 value_type temporary_alpha = 0; 02765 if 
(viennacl::is_cpu_scalar<value_type>::value) 02766 temporary_alpha = alpha; 02767 02768 value_type temporary_beta = 0; 02769 if (viennacl::is_cpu_scalar<value_type>::value) 02770 temporary_beta = beta; 02771 02772 plane_rotation_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1), 02773 static_cast<unsigned int>(viennacl::traits::start(vec1)), 02774 static_cast<unsigned int>(viennacl::traits::stride(vec1)), 02775 static_cast<unsigned int>(viennacl::traits::size(vec1)), 02776 detail::cuda_arg<value_type>(vec2), 02777 static_cast<unsigned int>(viennacl::traits::start(vec2)), 02778 static_cast<unsigned int>(viennacl::traits::stride(vec2)), 02779 static_cast<unsigned int>(viennacl::traits::size(vec2)), 02780 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)), 02781 detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) ); 02782 VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel"); 02783 } 02784 02785 } //namespace cuda 02786 } //namespace linalg 02787 } //namespace viennacl 02788 02789 02790 #endif
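Usage note: the CUDA backend routines in this header are not meant to be called directly; they are reached through the type-generic frontend functions in viennacl::linalg. A minimal sketch, assuming a CUDA-enabled build (compiled with nvcc, with VIENNACL_WITH_CUDA typically passed as -DVIENNACL_WITH_CUDA) and only the public headers shown in the includes:

// Minimal usage sketch (assumption: CUDA backend enabled via VIENNACL_WITH_CUDA).
#define VIENNACL_WITH_CUDA
#include <vector>
#include "viennacl/vector.hpp"
#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/norm_2.hpp"

int main()
{
  std::vector<float> host_x(1000, 1.0f), host_y(1000, 2.0f);

  viennacl::vector<float> x(1000), y(1000);
  viennacl::copy(host_x.begin(), host_x.end(), x.begin());  // host -> device
  viennacl::copy(host_y.begin(), host_y.end(), y.begin());

  // Both expressions are evaluated by the inner product / norm routines implemented in this file:
  float dot = viennacl::linalg::inner_prod(x, y);
  float nrm = viennacl::linalg::norm_2(x);

  return (dot > 0.0f && nrm > 0.0f) ? 0 : 1;
}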