ViennaCL - The Vienna Computing Library 1.5.2
viennacl/linalg/cuda/vector_operations.hpp
00001 #ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
00002 #define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include <cmath>
00026 #include "viennacl/forwards.h"
00027 #include "viennacl/scalar.hpp"
00028 #include "viennacl/tools/tools.hpp"
00029 #include "viennacl/meta/predicate.hpp"
00030 #include "viennacl/meta/enable_if.hpp"
00031 #include "viennacl/traits/size.hpp"
00032 #include "viennacl/traits/start.hpp"
00033 #include "viennacl/traits/stride.hpp"
00034 
00035 namespace viennacl
00036 {
00037   namespace linalg
00038   {
00039     namespace cuda
00040     {
00041 
00042       //
00043       // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here.
00044       //
00045 
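      // Shared conventions for the kernels in this file:
      //  - Every kernel uses a grid-stride loop (i starts at the global thread index and
      //    advances by gridDim.x * blockDim.x), so the fixed <<<128, 128>>> launch used by
      //    the host functions below covers vectors of arbitrary size.
      //  - Scalar coefficients arrive together with an 'options' bitfield, presumably
      //    assembled by detail::make_options() in the launcher functions: bit 0 set means
      //    the scalar enters with flipped sign, bit 1 set means the vector entry is
      //    divided by the scalar (reciprocal) instead of multiplied.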
00046 
00048 
00049       // gpu scalar
00050       template <typename T>
00051       __global__ void av_kernel(T * vec1,
00052                                 unsigned int start1,
00053                                 unsigned int inc1,
00054                                 unsigned int size1,
00055 
00056                                 const T * fac2,
00057                                 unsigned int options2,
00058                                 const T * vec2,
00059                                 unsigned int start2,
00060                                 unsigned int inc2)
00061       {
00062         T alpha = *fac2;
00063         if (options2 & (1 << 0))
00064           alpha = -alpha;
00065 
00066         if (options2 & (1 << 1))
00067         {
00068           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00069                             i < size1;
00070                             i += gridDim.x * blockDim.x)
00071             vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
00072         }
00073         else
00074         {
00075           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00076                             i < size1;
00077                             i += gridDim.x * blockDim.x)
00078             vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
00079         }
00080       }
00081 
00082       // cpu scalar
00083       template <typename T>
00084       __global__ void av_kernel(T * vec1,
00085                                 unsigned int start1,
00086                                 unsigned int inc1,
00087                                 unsigned int size1,
00088 
00089                                 T fac2,
00090                                 unsigned int options2,
00091                                 const T * vec2,
00092                                 unsigned int start2,
00093                                 unsigned int inc2)
00094       {
00095         T alpha = fac2;
00096         if (options2 & (1 << 0))
00097           alpha = -alpha;
00098 
00099         if (options2 & (1 << 1))
00100         {
00101           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00102                             i < size1;
00103                             i += gridDim.x * blockDim.x)
00104             vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
00105         }
00106         else
00107         {
00108           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00109                             i < size1;
00110                             i += gridDim.x * blockDim.x)
00111             vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
00112         }
00113       }
00114 
00115 
00116 
00117       template <typename T, typename ScalarType1>
00118       void av(vector_base<T> & vec1,
00119               vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
00120       {
00121         typedef T        value_type;
00122 
00123         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00124 
00131         value_type temporary_alpha = 0;
00132         if (viennacl::is_cpu_scalar<ScalarType1>::value)
00133           temporary_alpha = alpha;
00134 
00135         av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00136                                 static_cast<unsigned int>(viennacl::traits::start(vec1)),
00137                                 static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00138                                 static_cast<unsigned int>(viennacl::traits::size(vec1)),
00139 
00140                                 detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00141                                 options_alpha,
00142                                 detail::cuda_arg<value_type>(vec2),
00143                                 static_cast<unsigned int>(viennacl::traits::start(vec2)),
00144                                 static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
00145         VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
00146       }
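
      // av() implements v1 = alpha * v2 (or v1 = v2 / alpha when the reciprocal flag is set),
      // with alpha either a host value or a viennacl::scalar<T> on the GPU; overload resolution
      // on the argument produced by detail::cuda_arg(detail::arg_reference(...)) picks the
      // matching av_kernel. A rough usage sketch from user code (hypothetical names and sizes;
      // av() is reached via the ViennaCL operation dispatcher rather than called directly):
      //
      //   viennacl::vector<float> x(1000), y(1000);
      //   float alpha = 2.0f;
      //   x = alpha * y;   // eventually ends up in cuda::av(x, y, alpha, ...)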
00147 
00148 
00150 
00151       // alpha and beta on GPU
00152       template <typename T>
00153       __global__ void avbv_kernel(T * vec1,
00154                                   unsigned int start1,
00155                                   unsigned int inc1,
00156                                   unsigned int size1,
00157 
00158                                   const T * fac2,
00159                                   unsigned int options2,
00160                                   const T * vec2,
00161                                   unsigned int start2,
00162                                   unsigned int inc2,
00163 
00164                                   const T * fac3,
00165                                   unsigned int options3,
00166                                   const T * vec3,
00167                                   unsigned int start3,
00168                                   unsigned int inc3)
00169       {
00170         T alpha = *fac2;
00171         if (options2 & (1 << 0))
00172           alpha = -alpha;
00173 
00174         T beta = *fac3;
00175         if (options3 & (1 << 0))
00176           beta = -beta;
00177 
00178         if (options2 & (1 << 1))
00179         {
00180           if (options3 & (1 << 1))
00181           {
00182             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00183                               i < size1;
00184                               i += gridDim.x * blockDim.x)
00185               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00186           }
00187           else
00188           {
00189             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00190                               i < size1;
00191                               i += gridDim.x * blockDim.x)
00192               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00193           }
00194         }
00195         else
00196         {
00197           if (options3 & (1 << 1))
00198           {
00199             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00200                               i < size1;
00201                               i += gridDim.x * blockDim.x)
00202               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00203           }
00204           else
00205           {
00206             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00207                               i < size1;
00208                               i += gridDim.x * blockDim.x)
00209               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00210           }
00211         }
00212       }
00213 
00214       // alpha on CPU, beta on GPU
00215       template <typename T>
00216       __global__ void avbv_kernel(T * vec1,
00217                                   unsigned int start1,
00218                                   unsigned int inc1,
00219                                   unsigned int size1,
00220 
00221                                   T fac2,
00222                                   unsigned int options2,
00223                                   const T * vec2,
00224                                   unsigned int start2,
00225                                   unsigned int inc2,
00226 
00227                                   const T * fac3,
00228                                   unsigned int options3,
00229                                   const T * vec3,
00230                                   unsigned int start3,
00231                                   unsigned int inc3)
00232       {
00233         T alpha = fac2;
00234         if (options2 & (1 << 0))
00235           alpha = -alpha;
00236 
00237         T beta = *fac3;
00238         if (options3 & (1 << 0))
00239           beta = -beta;
00240 
00241         if (options2 & (1 << 1))
00242         {
00243           if (options3 & (1 << 1))
00244           {
00245             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00246                               i < size1;
00247                               i += gridDim.x * blockDim.x)
00248               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00249           }
00250           else
00251           {
00252             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00253                               i < size1;
00254                               i += gridDim.x * blockDim.x)
00255               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00256           }
00257         }
00258         else
00259         {
00260           if (options3 & (1 << 1))
00261           {
00262             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00263                               i < size1;
00264                               i += gridDim.x * blockDim.x)
00265               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00266           }
00267           else
00268           {
00269             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00270                               i < size1;
00271                               i += gridDim.x * blockDim.x)
00272               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00273           }
00274         }
00275       }
00276 
00277       // alpha on GPU, beta on CPU
00278       template <typename T>
00279       __global__ void avbv_kernel(T * vec1,
00280                                   unsigned int start1,
00281                                   unsigned int inc1,
00282                                   unsigned int size1,
00283 
00284                                   const T * fac2,
00285                                   unsigned int options2,
00286                                   const T * vec2,
00287                                   unsigned int start2,
00288                                   unsigned int inc2,
00289 
00290                                   T fac3,
00291                                   unsigned int options3,
00292                                   const T * vec3,
00293                                   unsigned int start3,
00294                                   unsigned int inc3)
00295       {
00296         T alpha = *fac2;
00297         if (options2 & (1 << 0))
00298           alpha = -alpha;
00299 
00300         T beta = fac3;
00301         if (options3 & (1 << 0))
00302           beta = -beta;
00303 
00304         if (options2 & (1 << 1))
00305         {
00306           if (options3 & (1 << 1))
00307           {
00308             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00309                               i < size1;
00310                               i += gridDim.x * blockDim.x)
00311               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00312           }
00313           else
00314           {
00315             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00316                               i < size1;
00317                               i += gridDim.x * blockDim.x)
00318               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00319           }
00320         }
00321         else
00322         {
00323           if (options3 & (1 << 1))
00324           {
00325             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00326                               i < size1;
00327                               i += gridDim.x * blockDim.x)
00328               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00329           }
00330           else
00331           {
00332             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00333                               i < size1;
00334                               i += gridDim.x * blockDim.x)
00335               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00336           }
00337         }
00338       }
00339 
00340       // alpha and beta on CPU
00341       template <typename T>
00342       __global__ void avbv_kernel(T * vec1,
00343                                   unsigned int start1,
00344                                   unsigned int inc1,
00345                                   unsigned int size1,
00346 
00347                                   T fac2,
00348                                   unsigned int options2,
00349                                   const T * vec2,
00350                                   unsigned int start2,
00351                                   unsigned int inc2,
00352 
00353                                   T fac3,
00354                                   unsigned int options3,
00355                                   const T * vec3,
00356                                   unsigned int start3,
00357                                   unsigned int inc3)
00358       {
00359         T alpha = fac2;
00360         if (options2 & (1 << 0))
00361           alpha = -alpha;
00362 
00363         T beta = fac3;
00364         if (options3 & (1 << 0))
00365           beta = -beta;
00366 
00367         if (options2 & (1 << 1))
00368         {
00369           if (options3 & (1 << 1))
00370           {
00371             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00372                               i < size1;
00373                               i += gridDim.x * blockDim.x)
00374               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00375           }
00376           else
00377           {
00378             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00379                               i < size1;
00380                               i += gridDim.x * blockDim.x)
00381               vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00382           }
00383         }
00384         else
00385         {
00386           if (options3 & (1 << 1))
00387           {
00388             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00389                               i < size1;
00390                               i += gridDim.x * blockDim.x)
00391               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00392           }
00393           else
00394           {
00395             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00396                               i < size1;
00397                               i += gridDim.x * blockDim.x)
00398               vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00399           }
00400         }
00401       }
00402 
00403 
00404 
00405 
00406       template <typename T, typename ScalarType1, typename ScalarType2>
00407       void avbv(vector_base<T> & vec1,
00408                 vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00409                 vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00410       {
00411         typedef T        value_type;
00412 
00413         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00414 
00421         value_type temporary_alpha = 0;
00422         if (viennacl::is_cpu_scalar<ScalarType1>::value)
00423           temporary_alpha = alpha;
00424 
00425         unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00426 
00427         value_type temporary_beta = 0;
00428         if (viennacl::is_cpu_scalar<ScalarType2>::value)
00429           temporary_beta = beta;
00430 
00431 
00432         avbv_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00433                                   static_cast<unsigned int>(viennacl::traits::start(vec1)),
00434                                   static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00435                                   static_cast<unsigned int>(viennacl::traits::size(vec1)),
00436 
00437                                   detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00438                                   options_alpha,
00439                                   detail::cuda_arg<value_type>(vec2),
00440                                   static_cast<unsigned int>(viennacl::traits::start(vec2)),
00441                                   static_cast<unsigned int>(viennacl::traits::stride(vec2)),
00442 
00443                                   detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
00444                                   options_beta,
00445                                   detail::cuda_arg<value_type>(vec3),
00446                                   static_cast<unsigned int>(viennacl::traits::start(vec3)),
00447                                   static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
00448         VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
00449       }
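
      // avbv() implements v1 = alpha * v2 + beta * v3 (with the same sign/reciprocal options
      // as described above). The four avbv_kernel overloads cover the possible host/device
      // combinations of alpha and beta; which one is launched follows from the arguments
      // produced by detail::cuda_arg(detail::arg_reference(...)), i.e. a device pointer for
      // GPU scalars and a plain value for CPU scalars.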
00450 
00451 
00453 
00454 
00455       // alpha and beta on GPU
00456       template <typename T>
00457       __global__ void avbv_v_kernel(T * vec1,
00458                                     unsigned int start1,
00459                                     unsigned int inc1,
00460                                     unsigned int size1,
00461 
00462                                     const T * fac2,
00463                                     unsigned int options2,
00464                                     const T * vec2,
00465                                     unsigned int start2,
00466                                     unsigned int inc2,
00467 
00468                                     const T * fac3,
00469                                     unsigned int options3,
00470                                     const T * vec3,
00471                                     unsigned int start3,
00472                                     unsigned int inc3)
00473       {
00474         T alpha = *fac2;
00475         if (options2 & (1 << 0))
00476           alpha = -alpha;
00477 
00478         T beta = *fac3;
00479         if (options3 & (1 << 0))
00480           beta = -beta;
00481 
00482         if (options2 & (1 << 1))
00483         {
00484           if (options3 & (1 << 1))
00485           {
00486             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00487                               i < size1;
00488                               i += gridDim.x * blockDim.x)
00489               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00490           }
00491           else
00492           {
00493             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00494                               i < size1;
00495                               i += gridDim.x * blockDim.x)
00496               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00497           }
00498         }
00499         else
00500         {
00501           if (options3 & (1 << 1))
00502           {
00503             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00504                               i < size1;
00505                               i += gridDim.x * blockDim.x)
00506               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00507           }
00508           else
00509           {
00510             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00511                               i < size1;
00512                               i += gridDim.x * blockDim.x)
00513               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00514           }
00515         }
00516       }
00517 
00518       // alpha on CPU, beta on GPU
00519       template <typename T>
00520       __global__ void avbv_v_kernel(T * vec1,
00521                                     unsigned int start1,
00522                                     unsigned int inc1,
00523                                     unsigned int size1,
00524 
00525                                     T fac2,
00526                                     unsigned int options2,
00527                                     const T * vec2,
00528                                     unsigned int start2,
00529                                     unsigned int inc2,
00530 
00531                                     const T * fac3,
00532                                     unsigned int options3,
00533                                     const T * vec3,
00534                                     unsigned int start3,
00535                                     unsigned int inc3)
00536       {
00537         T alpha = fac2;
00538         if (options2 & (1 << 0))
00539           alpha = -alpha;
00540 
00541         T beta = *fac3;
00542         if (options3 & (1 << 0))
00543           beta = -beta;
00544 
00545         if (options2 & (1 << 1))
00546         {
00547           if (options3 & (1 << 1))
00548           {
00549             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00550                               i < size1;
00551                               i += gridDim.x * blockDim.x)
00552               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00553           }
00554           else
00555           {
00556             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00557                               i < size1;
00558                               i += gridDim.x * blockDim.x)
00559               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00560           }
00561         }
00562         else
00563         {
00564           if (options3 & (1 << 1))
00565           {
00566             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00567                               i < size1;
00568                               i += gridDim.x * blockDim.x)
00569               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00570           }
00571           else
00572           {
00573             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00574                               i < size1;
00575                               i += gridDim.x * blockDim.x)
00576               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00577           }
00578         }
00579       }
00580 
00581       // alpha on GPU, beta on CPU
00582       template <typename T>
00583       __global__ void avbv_v_kernel(T * vec1,
00584                                     unsigned int start1,
00585                                     unsigned int inc1,
00586                                     unsigned int size1,
00587 
00588                                     const T * fac2,
00589                                     unsigned int options2,
00590                                     const T * vec2,
00591                                     unsigned int start2,
00592                                     unsigned int inc2,
00593 
00594                                     T fac3,
00595                                     unsigned int options3,
00596                                     const T * vec3,
00597                                     unsigned int start3,
00598                                     unsigned int inc3)
00599       {
00600         T alpha = *fac2;
00601         if (options2 & (1 << 0))
00602           alpha = -alpha;
00603 
00604         T beta = fac3;
00605         if (options3 & (1 << 0))
00606           beta = -beta;
00607 
00608         if (options2 & (1 << 1))
00609         {
00610           if (options3 & (1 << 1))
00611           {
00612             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00613                               i < size1;
00614                               i += gridDim.x * blockDim.x)
00615               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00616           }
00617           else
00618           {
00619             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00620                               i < size1;
00621                               i += gridDim.x * blockDim.x)
00622               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00623           }
00624         }
00625         else
00626         {
00627           if (options3 & (1 << 1))
00628           {
00629             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00630                               i < size1;
00631                               i += gridDim.x * blockDim.x)
00632               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00633           }
00634           else
00635           {
00636             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00637                               i < size1;
00638                               i += gridDim.x * blockDim.x)
00639               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00640           }
00641         }
00642       }
00643 
00644       // alpha and beta on CPU
00645       template <typename T>
00646       __global__ void avbv_v_kernel(T * vec1,
00647                                     unsigned int start1,
00648                                     unsigned int inc1,
00649                                     unsigned int size1,
00650 
00651                                     T fac2,
00652                                     unsigned int options2,
00653                                     const T * vec2,
00654                                     unsigned int start2,
00655                                     unsigned int inc2,
00656 
00657                                     T fac3,
00658                                     unsigned int options3,
00659                                     const T * vec3,
00660                                     unsigned int start3,
00661                                     unsigned int inc3)
00662       {
00663         T alpha = fac2;
00664         if (options2 & (1 << 0))
00665           alpha = -alpha;
00666 
00667         T beta = fac3;
00668         if (options3 & (1 << 0))
00669           beta = -beta;
00670 
00671         if (options2 & (1 << 1))
00672         {
00673           if (options3 & (1 << 1))
00674           {
00675             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00676                               i < size1;
00677                               i += gridDim.x * blockDim.x)
00678               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
00679           }
00680           else
00681           {
00682             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00683                               i < size1;
00684                               i += gridDim.x * blockDim.x)
00685               vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
00686           }
00687         }
00688         else
00689         {
00690           if (options3 & (1 << 1))
00691           {
00692             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00693                               i < size1;
00694                               i += gridDim.x * blockDim.x)
00695               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
00696           }
00697           else
00698           {
00699             for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00700                               i < size1;
00701                               i += gridDim.x * blockDim.x)
00702               vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
00703           }
00704         }
00705       }
00706 
00707 
00708       template <typename T, typename ScalarType1, typename ScalarType2>
00709       void avbv_v(vector_base<T> & vec1,
00710                   vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00711                   vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00712       {
00713         typedef T        value_type;
00714 
00715         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00716 
00723         value_type temporary_alpha = 0;
00724         if (viennacl::is_cpu_scalar<ScalarType1>::value)
00725           temporary_alpha = alpha;
00726 
00727         unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00728 
00729         value_type temporary_beta = 0;
00730         if (viennacl::is_cpu_scalar<ScalarType2>::value)
00731           temporary_beta = beta;
00732 
00733 
00734         avbv_v_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00735                                     static_cast<unsigned int>(viennacl::traits::start(vec1)),
00736                                     static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00737                                     static_cast<unsigned int>(viennacl::traits::size(vec1)),
00738 
00739                                     detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00740                                     options_alpha,
00741                                     detail::cuda_arg<value_type>(vec2),
00742                                     static_cast<unsigned int>(viennacl::traits::start(vec2)),
00743                                     static_cast<unsigned int>(viennacl::traits::stride(vec2)),
00744 
00745                                     detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
00746                                     options_beta,
00747                                     detail::cuda_arg<value_type>(vec3),
00748                                     static_cast<unsigned int>(viennacl::traits::start(vec3)),
00749                                     static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
        VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_v_kernel");
00750       }
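
      // avbv_v() is the accumulating variant: v1 += alpha * v2 + beta * v3. Apart from the
      // '+=' in the kernels it mirrors avbv() above.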
00751 
00752 
00754 
00755       template <typename T>
00756       __global__ void vector_assign_kernel(T * vec1,
00757                                            unsigned int start1,
00758                                            unsigned int inc1,
00759                                            unsigned int size1,
00760                                            unsigned int internal_size1,
00761 
00762                                            T alpha)
00763       {
00764         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00765                           i < internal_size1;
00766                           i += gridDim.x * blockDim.x)
00767           vec1[i*inc1+start1] =  (i < size1) ? alpha : 0;
00768       }
00769 
00776       template <typename T, typename S1>
00777       void vector_assign(vector_base<T> & vec1, const S1 & alpha, bool up_to_internal_size = false)
00778       {
00779         typedef T        value_type;
00780 
00781         value_type temporary_alpha = 0;
00782         if (viennacl::is_cpu_scalar<S1>::value)
00783           temporary_alpha = alpha;
00784 
00785         unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
00786 
00787         vector_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00788                                            static_cast<unsigned int>(viennacl::traits::start(vec1)),
00789                                            static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00790                                            size,
00791                                            static_cast<unsigned int>(vec1.internal_size()),  //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
00792 
00793                                            detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
00794         VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
00795       }
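
      // vector_assign() sets the entries of vec1 to alpha. The kernel iterates over the full
      // internal (padded) buffer length and writes alpha to entries below size1 and zero to
      // the padding beyond it; with up_to_internal_size == true the padding counts as part of
      // the vector and receives alpha as well. For vector proxies internal_size() equals the
      // logical size (see the note on the kernel argument above), so no padding is written there.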
00796 
00798 
00799       template <typename T>
00800       __global__ void vector_swap_kernel(T * vec1,
00801                                          unsigned int start1,
00802                                          unsigned int inc1,
00803                                          unsigned int size1,
00804 
00805                                          T * vec2,
00806                                          unsigned int start2,
00807                                          unsigned int inc2)
00808       {
00809         T tmp;
00810         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00811                           i < size1;
00812                           i += gridDim.x * blockDim.x)
00813         {
00814           tmp = vec2[i*inc2+start2];
00815           vec2[i*inc2+start2] = vec1[i*inc1+start1];
00816           vec1[i*inc1+start1] = tmp;
00817         }
00818       }
00819 
00820 
00826       template <typename T>
00827       void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
00828       {
00829         typedef T      value_type;
00830 
00831         vector_swap_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00832                                          static_cast<unsigned int>(viennacl::traits::start(vec1)),
00833                                          static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00834                                          static_cast<unsigned int>(viennacl::traits::size(vec1)),
00835 
00836                                          detail::cuda_arg<value_type>(vec2),
00837                                          static_cast<unsigned int>(viennacl::traits::start(vec2)),
00838                                          static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
00839         VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
00840       }
00841 
00843 
00844       template <typename T>
00845       __global__ void element_op_kernel(T * vec1,
00846                                          unsigned int start1,
00847                                          unsigned int inc1,
00848                                          unsigned int size1,
00849 
00850                                          T const * vec2,
00851                                          unsigned int start2,
00852                                          unsigned int inc2,
00853 
00854                                          T const * vec3,
00855                                          unsigned int start3,
00856                                          unsigned int inc3,
00857 
00858                                          unsigned int op_type
00859                                        )
00860       {
00861         if (op_type == 2)
00862         {
00863           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00864                             i < size1;
00865                             i += gridDim.x * blockDim.x)
00866           {
00867             vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
00868           }
00869         }
00870         else if (op_type == 1)
00871         {
00872           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00873                             i < size1;
00874                             i += gridDim.x * blockDim.x)
00875           {
00876             vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
00877           }
00878         }
00879         else if (op_type == 0)
00880         {
00881           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00882                             i < size1;
00883                             i += gridDim.x * blockDim.x)
00884           {
00885             vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
00886           }
00887         }
00888       }
00889 
00890       template <typename T>
00891       __global__ void element_op_int_kernel(T * vec1,
00892                                          unsigned int start1,
00893                                          unsigned int inc1,
00894                                          unsigned int size1,
00895 
00896                                          T const * vec2,
00897                                          unsigned int start2,
00898                                          unsigned int inc2,
00899 
00900                                          T const * vec3,
00901                                          unsigned int start3,
00902                                          unsigned int inc3,
00903 
00904                                          unsigned int op_type
00905                                        )
00906       {
00907         if (op_type == 1)
00908         {
00909           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00910                             i < size1;
00911                             i += gridDim.x * blockDim.x)
00912           {
00913             vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
00914           }
00915         }
00916         else if (op_type == 0)
00917         {
00918           for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
00919                             i < size1;
00920                             i += gridDim.x * blockDim.x)
00921           {
00922             vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
00923           }
00924         }
00925       }
00926 
00932       template <typename T, typename OP>
00933       void element_op(vector_base<T> & vec1,
00934                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
00935       {
00936         typedef T        value_type;
00937 
00938         unsigned int op_type = 2; //0: product, 1: division, 2: power
00939         if (viennacl::is_division<OP>::value)
00940           op_type = 1;
00941         else if (viennacl::is_product<OP>::value)
00942           op_type = 0;
00943 
00944         element_op_int_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00945                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
00946                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00947                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
00948 
00949                                         detail::cuda_arg<value_type>(proxy.lhs()),
00950                                         static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
00951                                         static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
00952 
00953                                         detail::cuda_arg<value_type>(proxy.rhs()),
00954                                         static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
00955                                         static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
00956 
00957                                         op_type
00958                                        );
00959         VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_int_kernel");
00960       }
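
      // op_type encodes the binary operation (0: product, 1: division, 2: power). This generic
      // overload, presumably intended for the remaining (e.g. integer) value types as the kernel
      // name suggests, launches element_op_int_kernel, which handles product and division only;
      // the float and double overloads below launch element_op_kernel, which additionally
      // supports the power case via pow().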
00961 
00962       template <typename OP>
00963       void element_op(vector_base<float> & vec1,
00964                       vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OP> > const & proxy)
00965       {
00966         typedef float        value_type;
00967 
00968         unsigned int op_type = 2; //0: product, 1: division, 2: power
00969         if (viennacl::is_division<OP>::value)
00970           op_type = 1;
00971         else if (viennacl::is_product<OP>::value)
00972           op_type = 0;
00973 
00974         element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
00975                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
00976                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
00977                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
00978 
00979                                         detail::cuda_arg<value_type>(proxy.lhs()),
00980                                         static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
00981                                         static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
00982 
00983                                         detail::cuda_arg<value_type>(proxy.rhs()),
00984                                         static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
00985                                         static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
00986 
00987                                         op_type
00988                                        );
00989         VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
00990       }
00991 
00992       template <typename OP>
00993       void element_op(vector_base<double> & vec1,
00994                       vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OP> > const & proxy)
00995       {
00996         typedef double        value_type;
00997 
00998         unsigned int op_type = 2; //0: product, 1: division, 2: power
00999         if (viennacl::is_division<OP>::value)
01000           op_type = 1;
01001         else if (viennacl::is_product<OP>::value)
01002           op_type = 0;
01003 
01004         element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01005                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
01006                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01007                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
01008 
01009                                         detail::cuda_arg<value_type>(proxy.lhs()),
01010                                         static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01011                                         static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
01012 
01013                                         detail::cuda_arg<value_type>(proxy.rhs()),
01014                                         static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
01015                                         static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
01016 
01017                                         op_type
01018                                        );
01019         VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
01020       }
01021 
01023 
01024 // Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
01025 // Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
01026 
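      // Each unary math operation below follows the same hand-rolled pattern: a grid-stride
      // kernel vec_element_<op>_kernel applying the corresponding CUDA math function
      // element-wise, plus an element_op() overload for op_element_unary<op_<op>> that
      // launches it with start/stride/size taken from the vector traits and then checks
      // for launch errors.
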
01027       // acos
01028       template <typename T> __global__ void vec_element_acos_kernel(
01029           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01030           T const * vec2, unsigned int start2, unsigned int inc2)
01031       {
01032         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01033           vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
01034       }
01035 
01036       template <typename T>
01037       void element_op(vector_base<T> & vec1,
01038                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_acos> > const & proxy)
01039       {
01040         typedef T        value_type;
01041 
01042         vec_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01043                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01044                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01045                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01046                                               detail::cuda_arg<value_type>(proxy.lhs()),
01047                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01048                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01049                                              );
01050         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
01051       }
01052 
01053       // asin
01054       template <typename T> __global__ void vec_element_asin_kernel(
01055           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01056           T const * vec2, unsigned int start2, unsigned int inc2)
01057       {
01058         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01059           vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
01060       }
01061 
01062       template <typename T>
01063       void element_op(vector_base<T> & vec1,
01064                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_asin> > const & proxy)
01065       {
01066         typedef T        value_type;
01067 
01068         vec_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01069                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01070                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01071                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01072                                               detail::cuda_arg<value_type>(proxy.lhs()),
01073                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01074                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01075                                              );
01076         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
01077       }
01078 
01079 
01080       // atan
01081       template <typename T> __global__ void vec_element_atan_kernel(
01082           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01083           T const * vec2, unsigned int start2, unsigned int inc2)
01084       {
01085         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01086           vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
01087       }
01088 
01089       template <typename T>
01090       void element_op(vector_base<T> & vec1,
01091                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_atan> > const & proxy)
01092       {
01093         typedef T        value_type;
01094 
01095         vec_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01096                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01097                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01098                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01099                                               detail::cuda_arg<value_type>(proxy.lhs()),
01100                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01101                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01102                                              );
01103         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
01104       }
01105 
01106 
01107       // ceil
01108       template <typename T> __global__ void vec_element_ceil_kernel(
01109           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01110           T const * vec2, unsigned int start2, unsigned int inc2)
01111       {
01112         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01113           vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
01114       }
01115 
01116       template <typename T>
01117       void element_op(vector_base<T> & vec1,
01118                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_ceil> > const & proxy)
01119       {
01120         typedef T        value_type;
01121 
01122         vec_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01123                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01124                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01125                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01126                                               detail::cuda_arg<value_type>(proxy.lhs()),
01127                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01128                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01129                                              );
01130         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
01131       }
01132 
01133 
01134       // cos
01135       template <typename T> __global__ void vec_element_cos_kernel(
01136           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01137           T const * vec2, unsigned int start2, unsigned int inc2)
01138       {
01139         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01140           vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
01141       }
01142 
01143       template <typename T>
01144       void element_op(vector_base<T> & vec1,
01145                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cos> > const & proxy)
01146       {
01147         typedef T        value_type;
01148 
01149         vec_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01150                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01151                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01152                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01153                                               detail::cuda_arg<value_type>(proxy.lhs()),
01154                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01155                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01156                                              );
01157         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
01158       }
01159 
01160 
01161       // cosh
01162       template <typename T> __global__ void vec_element_cosh_kernel(
01163           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01164           T const * vec2, unsigned int start2, unsigned int inc2)
01165       {
01166         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01167           vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
01168       }
01169 
01170       template <typename T>
01171       void element_op(vector_base<T> & vec1,
01172                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_cosh> > const & proxy)
01173       {
01174         typedef T        value_type;
01175 
01176         vec_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01177                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01178                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01179                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01180                                               detail::cuda_arg<value_type>(proxy.lhs()),
01181                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01182                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01183                                              );
01184         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
01185       }
01186 
01187 
01188       // exp
01189       template <typename T> __global__ void vec_element_exp_kernel(
01190           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01191           T const * vec2, unsigned int start2, unsigned int inc2)
01192       {
01193         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01194           vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
01195       }
01196 
01197       template <typename T>
01198       void element_op(vector_base<T> & vec1,
01199                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_exp> > const & proxy)
01200       {
01201         typedef T        value_type;
01202 
01203         vec_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01204                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01205                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01206                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01207                                               detail::cuda_arg<value_type>(proxy.lhs()),
01208                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01209                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01210                                              );
01211         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
01212       }
01213 
01214 
01215       // fabs
01216       template <typename T> __global__ void vec_element_fabs_kernel(
01217           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01218           T const * vec2, unsigned int start2, unsigned int inc2)
01219       {
01220         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01221           vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
01222       }
01223 
01224       template <typename T>
01225       void element_op(vector_base<T> & vec1,
01226                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_fabs> > const & proxy)
01227       {
01228         typedef T        value_type;
01229 
01230         vec_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01231                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01232                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01233                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01234                                               detail::cuda_arg<value_type>(proxy.lhs()),
01235                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01236                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01237                                              );
01238         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
01239       }
01240 
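      // The op_abs variant below is the integer counterpart of op_fabs: it calls abs() instead of
      // the floating-point fabs(), but kernel and wrapper are otherwise identical.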
01241       // abs
01242       template <typename T> __global__ void vec_element_abs_kernel(
01243           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01244           T const * vec2, unsigned int start2, unsigned int inc2)
01245       {
01246         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01247           vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
01248       }
01249 
01250       template <typename T>
01251       void element_op(vector_base<T> & vec1,
01252                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_abs> > const & proxy)
01253       {
01254         typedef T        value_type;
01255 
01256         vec_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01257                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
01258                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01259                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
01260                                              detail::cuda_arg<value_type>(proxy.lhs()),
01261                                              static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01262                                              static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01263                                             );
01264         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
01265       }
01266 
01267 
01268 
01269       // floor
01270       template <typename T> __global__ void vec_element_floor_kernel(
01271           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01272           T const * vec2, unsigned int start2, unsigned int inc2)
01273       {
01274         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01275           vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
01276       }
01277 
01278       template <typename T>
01279       void element_op(vector_base<T> & vec1,
01280                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_floor> > const & proxy)
01281       {
01282         typedef T        value_type;
01283 
01284         vec_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01285                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01286                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01287                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01288                                               detail::cuda_arg<value_type>(proxy.lhs()),
01289                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01290                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01291                                              );
01292         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
01293       }
01294 
01295 
01296       // log
01297       template <typename T> __global__ void vec_element_log_kernel(
01298           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01299           T const * vec2, unsigned int start2, unsigned int inc2)
01300       {
01301         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01302           vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
01303       }
01304 
01305       template <typename T>
01306       void element_op(vector_base<T> & vec1,
01307                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log> > const & proxy)
01308       {
01309         typedef T        value_type;
01310 
01311         vec_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01312                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01313                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01314                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01315                                               detail::cuda_arg<value_type>(proxy.lhs()),
01316                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01317                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01318                                              );
01319         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
01320       }
01321 
01322 
01323       // log10
01324       template <typename T> __global__ void vec_element_log10_kernel(
01325           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01326           T const * vec2, unsigned int start2, unsigned int inc2)
01327       {
01328         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01329           vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
01330       }
01331 
01332       template <typename T>
01333       void element_op(vector_base<T> & vec1,
01334                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_log10> > const & proxy)
01335       {
01336         typedef T        value_type;
01337 
01338         vec_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01339                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01340                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01341                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01342                                               detail::cuda_arg<value_type>(proxy.lhs()),
01343                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01344                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01345                                              );
01346         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
01347       }
01348 
01349 
01350       // sin
01351       template <typename T> __global__ void vec_element_sin_kernel(
01352           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01353           T const * vec2, unsigned int start2, unsigned int inc2)
01354       {
01355         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01356           vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
01357       }
01358 
01359       template <typename T>
01360       void element_op(vector_base<T> & vec1,
01361                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sin> > const & proxy)
01362       {
01363         typedef T        value_type;
01364 
01365         vec_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01366                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01367                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01368                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01369                                               detail::cuda_arg<value_type>(proxy.lhs()),
01370                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01371                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01372                                              );
01373         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
01374       }
01375 
01376 
01377       // sinh
01378       template <typename T> __global__ void vec_element_sinh_kernel(
01379           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01380           T const * vec2, unsigned int start2, unsigned int inc2)
01381       {
01382         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01383           vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
01384       }
01385 
01386       template <typename T>
01387       void element_op(vector_base<T> & vec1,
01388                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sinh> > const & proxy)
01389       {
01390         typedef T        value_type;
01391 
01392         vec_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01393                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01394                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01395                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01396                                               detail::cuda_arg<value_type>(proxy.lhs()),
01397                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01398                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01399                                              );
01400         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
01401       }
01402 
01403 
01404       // sqrt
01405       template <typename T> __global__ void vec_element_sqrt_kernel(
01406           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01407           T const * vec2, unsigned int start2, unsigned int inc2)
01408       {
01409         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01410           vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
01411       }
01412 
01413       template <typename T>
01414       void element_op(vector_base<T> & vec1,
01415                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_sqrt> > const & proxy)
01416       {
01417         typedef T        value_type;
01418 
01419         vec_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01420                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01421                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01422                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01423                                               detail::cuda_arg<value_type>(proxy.lhs()),
01424                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01425                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01426                                              );
01427         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
01428       }
01429 
01430 
01431       // tan
01432       template <typename T> __global__ void vec_element_tan_kernel(
01433           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01434           T const * vec2, unsigned int start2, unsigned int inc2)
01435       {
01436         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01437           vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
01438       }
01439 
01440       template <typename T>
01441       void element_op(vector_base<T> & vec1,
01442                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tan> > const & proxy)
01443       {
01444         typedef T        value_type;
01445 
01446         vec_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01447                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01448                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01449                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01450                                               detail::cuda_arg<value_type>(proxy.lhs()),
01451                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01452                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01453                                              );
01454         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
01455       }
01456 
01457 
01458       // tanh
01459       template <typename T> __global__ void vec_element_tanh_kernel(
01460           T       * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
01461           T const * vec2, unsigned int start2, unsigned int inc2)
01462       {
01463         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
01464           vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
01465       }
01466 
01467       template <typename T>
01468       void element_op(vector_base<T> & vec1,
01469                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<op_tanh> > const & proxy)
01470       {
01471         typedef T        value_type;
01472 
01473         vec_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01474                                               static_cast<unsigned int>(viennacl::traits::start(vec1)),
01475                                               static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01476                                               static_cast<unsigned int>(viennacl::traits::size(vec1)),
01477                                               detail::cuda_arg<value_type>(proxy.lhs()),
01478                                               static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
01479                                               static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
01480                                              );
01481         VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
01482       }
01483 
01484 
01485 
01487 
01488 
01489       template <typename T>
01490       __global__ void inner_prod_kernel(const T * vec1,
01491                                         unsigned int start1,
01492                                         unsigned int inc1,
01493                                         unsigned int size1,
01494                                         const T * vec2,
01495                                         unsigned int start2,
01496                                         unsigned int inc2,
01497                                         unsigned int size2,
01498                                         T * group_buffer)
01499       {
01500         __shared__ T tmp_buffer[128];
01501         unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
01502         unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
01503 
01504         unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
01505                                      - (  blockIdx.x * size1) / (gridDim.x);
01506 
01507 
01508         T tmp = 0;
01509         for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
01510           tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
01511         tmp_buffer[threadIdx.x] = tmp;
01512 
01513         // parallel reduction
01514         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
01515         {
01516           __syncthreads();
01517           if (threadIdx.x < stride)
01518             tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
01519         }
01520 
01521         if (threadIdx.x == 0)
01522           group_buffer[blockIdx.x] = tmp_buffer[0];
01523 
01524       }
01525 
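      // inner_prod_kernel produces one partial dot product per block: block b handles the index
      // range [b*size1/gridDim.x, (b+1)*size1/gridDim.x), accumulates vec1[i]*vec2[i] per thread,
      // reduces the per-thread sums in shared memory, and stores its result in group_buffer[b].
      // A single-block kernel (vector_sum_kernel_* below) then folds the partial results into the
      // final scalar. Illustrative data flow for size1 = 1000 and 128 blocks:
      //
      //   block 0 sums entries 0..6   -> group_buffer[0]
      //   block 1 sums entries 7..14  -> group_buffer[1]   (chunk bounds come from the integer divisions above)
      //   ...
      //   final pass: result = group_buffer[0] + ... + group_buffer[127]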
01526 
01527 
01528       // sums the entries of 'vec1' (or, depending on 'option', takes the maximum of absolute values or the square root of the sum) and writes the resulting scalar to 'result'. Makes use of a single work-group only.
01529       template <typename T>
01530       __global__ void vector_sum_kernel_floats(
01531                 const T * vec1,
01532                 unsigned int start1,
01533                 unsigned int inc1,
01534                 unsigned int size1,
01535                 unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
01536                 T * result)
01537       {
01538         __shared__ T tmp_buffer[128];
01539         T thread_sum = 0;
01540         for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
01541         {
01542           if (option > 0)
01543             thread_sum += vec1[i*inc1+start1];
01544           else
01545             thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
01546         }
01547 
01548         tmp_buffer[threadIdx.x] = thread_sum;
01549 
01550         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
01551         {
01552           __syncthreads();
01553           if (threadIdx.x < stride)
01554           {
01555             if (option > 0)
01556               tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
01557             else
01558               tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
01559           }
01560         }
01561 
01562         if (threadIdx.x == 0)
01563         {
01564           if (option == 2)
01565             *result = sqrt(tmp_buffer[0]);
01566           else
01567             *result = tmp_buffer[0];
01568         }
01569       }
01570 
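      // The 'option' flag selects how partial results are combined: 0 takes fmax of absolute
      // values, 1 performs a plain sum, and 2 sums and returns the square root of the total.
      // inner_prod_impl() further below, for instance, invokes the launcher with option = 1.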
01571       template <typename T>
01572       __global__ void vector_sum_kernel_integers(
01573                 const T * vec1,
01574                 unsigned int start1,
01575                 unsigned int inc1,
01576                 unsigned int size1,
01577                 unsigned int option, //0: use max, 1: just sum
01578                 T * result)
01579       {
01580         __shared__ T tmp_buffer[128];
01581         T thread_sum = 0;
01582         for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
01583         {
01584           if (option > 0)
01585             thread_sum += vec1[i*inc1+start1];
01586           else
01587             thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
01588         }
01589 
01590         tmp_buffer[threadIdx.x] = thread_sum;
01591 
01592         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
01593         {
01594           __syncthreads();
01595           if (threadIdx.x < stride)
01596           {
01597             if (option > 0)
01598               tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
01599             else
01600               tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
01601           }
01602         }
01603 
01604         if (threadIdx.x == 0)
01605           *result = tmp_buffer[0];
01606       }
01607 
01608       template <typename T>
01609       __global__ void vector_sum_kernel_unsigned_integers(
01610                 const T * vec1,
01611                 unsigned int start1,
01612                 unsigned int inc1,
01613                 unsigned int size1,
01614                 unsigned int option, //0: use max, 1: just sum
01615                 T * result)
01616       {
01617         __shared__ T tmp_buffer[128];
01618         T thread_sum = 0;
01619         for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
01620         {
01621           if (option > 0)
01622             thread_sum += vec1[i*inc1+start1];
01623           else
01624             thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
01625         }
01626 
01627         tmp_buffer[threadIdx.x] = thread_sum;
01628 
01629         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
01630         {
01631           __syncthreads();
01632           if (threadIdx.x < stride)
01633           {
01634             if (option > 0)
01635               tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
01636             else
01637               tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
01638           }
01639         }
01640 
01641         if (threadIdx.x == 0)
01642           *result = tmp_buffer[0];
01643       }
01644 
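      // Three reduction variants exist because the element type dictates the available intrinsics:
      // the float/double version uses fabs/fmax/sqrt, the signed-integer version falls back to
      // abs() and a ternary maximum, and the unsigned version drops the absolute value entirely
      // since its values cannot be negative. The launcher traits below select the matching kernel
      // at compile time.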
01645       namespace detail
01646       {
01648         struct vector_sum_kernel_launcher_integers
01649         {
01650           template <typename T, typename S3>
01651           static void apply(vector_base<T> const & temp,
01652                             unsigned int option,
01653                             S3 & result)
01654           {
01655             typedef T        value_type;
01656             vector_sum_kernel_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
01657                                                   static_cast<unsigned int>(viennacl::traits::start(temp)),
01658                                                   static_cast<unsigned int>(viennacl::traits::stride(temp)),
01659                                                   static_cast<unsigned int>(viennacl::traits::size(temp)),
01660                                                   static_cast<unsigned int>(option),
01661                                                   detail::cuda_arg<value_type>(result) );
01662             VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
01663           }
01664         };
01665 
01666         struct vector_sum_kernel_launcher_unsigned_integers
01667         {
01668           template <typename T, typename S3>
01669           static void apply(vector_base<T> const & temp,
01670                             unsigned int option,
01671                             S3 & result)
01672           {
01673             typedef T        value_type;
01674             vector_sum_kernel_unsigned_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
01675                                                             static_cast<unsigned int>(viennacl::traits::start(temp)),
01676                                                             static_cast<unsigned int>(viennacl::traits::stride(temp)),
01677                                                             static_cast<unsigned int>(viennacl::traits::size(temp)),
01678                                                             static_cast<unsigned int>(option),
01679                                                             detail::cuda_arg<value_type>(result) );
01680             VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
01681           }
01682         };
01683 
01684         struct vector_sum_kernel_launcher_floats
01685         {
01686           template <typename T, typename S3>
01687           static void apply(vector_base<T> const & temp,
01688                             unsigned int option,
01689                             S3 & result)
01690           {
01691             typedef T        value_type;
01692             vector_sum_kernel_floats<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
01693                                                   static_cast<unsigned int>(viennacl::traits::start(temp)),
01694                                                   static_cast<unsigned int>(viennacl::traits::stride(temp)),
01695                                                   static_cast<unsigned int>(viennacl::traits::size(temp)),
01696                                                   static_cast<unsigned int>(option),
01697                                                   detail::cuda_arg<value_type>(result) );
01698             VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
01699           }
01700         };
01701 
01702         template <typename T>
01703         struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
01704 
01705         template <>
01706         struct vector_sum_kernel_launcher<unsigned char>  : public vector_sum_kernel_launcher_unsigned_integers {};
01707 
01708         template <>
01709         struct vector_sum_kernel_launcher<unsigned short>  : public vector_sum_kernel_launcher_unsigned_integers {};
01710 
01711         template <>
01712         struct vector_sum_kernel_launcher<unsigned int>  : public vector_sum_kernel_launcher_unsigned_integers {};
01713 
01714         template <>
01715         struct vector_sum_kernel_launcher<unsigned long>  : public vector_sum_kernel_launcher_unsigned_integers {};
01716 
01717         template <>
01718         struct vector_sum_kernel_launcher<float>  : public vector_sum_kernel_launcher_floats {};
01719 
01720         template <>
01721         struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
01722 
01724       }
01725 
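      // Dispatch is resolved at compile time: vector_sum_kernel_launcher<T> defaults to the
      // signed-integer launcher and is specialized for the unsigned integer types and for
      // float/double. A hedged sketch of the call made by inner_prod_impl() below:
      //
      //   detail::vector_sum_kernel_launcher<float>::apply(temp, 1, result);
      //   // -> launches vector_sum_kernel_floats<<<1, 128>>>(..., option = 1, ...)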
01726 
01727       //implementation of inner product:
01728       //namespace {
01735       template <typename T, typename S3>
01736       void inner_prod_impl(vector_base<T> const & vec1,
01737                            vector_base<T> const & vec2,
01738                            S3 & result)
01739       {
01740         typedef T        value_type;
01741 
01742         static const unsigned int work_groups = 128;
01743         static viennacl::vector<value_type> temp(work_groups);
01744 
01745         inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01746                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
01747                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01748                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
01749                                         detail::cuda_arg<value_type>(vec2),
01750                                         static_cast<unsigned int>(viennacl::traits::start(vec2)),
01751                                         static_cast<unsigned int>(viennacl::traits::stride(vec2)),
01752                                         static_cast<unsigned int>(viennacl::traits::size(vec2)),
01753                                         detail::cuda_arg<value_type>(temp)
01754                                        );
01755         VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
01756 
01757         detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
01758       }
01759 
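      // A minimal usage sketch for inner_prod_impl(), assuming the vectors live in a CUDA context
      // and 'result' is a device-side viennacl::scalar<float> (any type accepted by
      // detail::cuda_arg can serve as S3):
      //
      //   viennacl::vector<float> x(1000), y(1000);
      //   viennacl::scalar<float> s = 0.0f;
      //   viennacl::linalg::cuda::inner_prod_impl(x, y, s);  // s now holds <x, y>, still on the GPU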
01760 
01767       template <typename T>
01768       void inner_prod_cpu(vector_base<T> const & vec1,
01769                           vector_base<T> const & vec2,
01770                           T & result)
01771       {
01772         typedef T        value_type;
01773 
01774         const unsigned int work_groups = 128;
01775         viennacl::vector<value_type> temp(work_groups);
01776 
01777         inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
01778                                         static_cast<unsigned int>(viennacl::traits::start(vec1)),
01779                                         static_cast<unsigned int>(viennacl::traits::stride(vec1)),
01780                                         static_cast<unsigned int>(viennacl::traits::size(vec1)),
01781                                         detail::cuda_arg<value_type>(vec2),
01782                                         static_cast<unsigned int>(viennacl::traits::start(vec2)),
01783                                         static_cast<unsigned int>(viennacl::traits::stride(vec2)),
01784                                         static_cast<unsigned int>(viennacl::traits::size(vec2)),
01785                                         detail::cuda_arg<value_type>(temp)
01786                                        );
01787         VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
01788 
01789         // Now copy partial results from GPU back to CPU and run reduction there:
01790         std::vector<value_type> temp_cpu(work_groups);
01791         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
01792 
01793         result = 0;
01794         for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
01795           result += *it;
01796       }
01797 
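      // inner_prod_cpu() differs from inner_prod_impl() only in the final reduction: the 128 block
      // results are copied back via viennacl::fast_copy() and summed on the host, so 'result' is a
      // plain CPU scalar of type T rather than a viennacl::scalar<T>.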
01799 
01800 #define VIENNACL_MDOT_WORKGROUP_SIZE  128
01801 #define VIENNACL_MDOT_WORKGROUP_NUM   128
01802       // M = 2:
01803       template <typename NumericT>
01804       __global__ void inner_prod_2_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
01805                                           const NumericT *y0, unsigned int start0, unsigned int stride0,
01806                                           const NumericT *y1, unsigned int start1, unsigned int stride1,
01807                                           NumericT *group_results)
01808       {
01809         __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
01810         unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
01811         unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
01812         unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
01813 
01814         NumericT entry_x    = 0;
01815         NumericT group_sum0 = 0;
01816         NumericT group_sum1 = 0;
01817         for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
01818           entry_x     = x[i * stridex + startx];   // load only once from global memory!
01819           group_sum0 += entry_x * y0[i * stride0 + start0];
01820           group_sum1 += entry_x * y1[i * stride1 + start1];
01821         }
01822         tmp_buffer[threadIdx.x]              = group_sum0;
01823         tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
01824 
01825         // parallel reduction
01826         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
01827           __syncthreads();
01828           if (threadIdx.x < stride) {
01829             tmp_buffer[threadIdx.x             ] += tmp_buffer[threadIdx.x+stride             ];
01830             tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
01831           }
01832         }
01833 
01834         // write result of group to group_results
01835         if (threadIdx.x == 0) {
01836           group_results[blockIdx.x]             = tmp_buffer[0];
01837           group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
01838         }
01839       }
01840 
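      // Layout of group_results for the inner_prod_{2,3,4,8}_kernel family: the partial sum of
      // block b for the k-th right-hand-side vector is stored at group_results[b + k * gridDim.x].
      // With VIENNACL_MDOT_WORKGROUP_NUM = 128 blocks, entries [0,128) belong to y0, [128,256) to
      // y1, and so on; these are exactly the chunks consumed by vector_multi_sum_kernel further
      // below (one block per chunk).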
01841       // M = 3:
01842       template <typename NumericT>
01843       __global__ void inner_prod_3_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
01844                                           const NumericT *y0, unsigned int start0, unsigned int stride0,
01845                                           const NumericT *y1, unsigned int start1, unsigned int stride1,
01846                                           const NumericT *y2, unsigned int start2, unsigned int stride2,
01847                                           NumericT *group_results)
01848       {
01849         __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
01850         unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
01851         unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
01852         unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
01853 
01854         NumericT entry_x    = 0;
01855         NumericT group_sum0 = 0;
01856         NumericT group_sum1 = 0;
01857         NumericT group_sum2 = 0;
01858         for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
01859           entry_x     = x[i * stridex + startx];   // load only once from global memory!
01860           group_sum0 += entry_x * y0[i * stride0 + start0];
01861           group_sum1 += entry_x * y1[i * stride1 + start1];
01862           group_sum2 += entry_x * y2[i * stride2 + start2];
01863         }
01864         tmp_buffer[threadIdx.x]                  = group_sum0;
01865         tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
01866         tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
01867 
01868         // parallel reduction
01869         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
01870           __syncthreads();
01871           if (threadIdx.x < stride) {
01872             tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
01873             tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
01874             tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
01875           }
01876         }
01877 
01878         // write result of group to group_results
01879         if (threadIdx.x == 0) {
01880           group_results[blockIdx.x                ] = tmp_buffer[0];
01881           group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
01882           group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
01883         }
01884       }
01885 
01886       // M = 4:
01887       template <typename NumericT>
01888       __global__ void inner_prod_4_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
01889                                           const NumericT *y0, unsigned int start0, unsigned int stride0,
01890                                           const NumericT *y1, unsigned int start1, unsigned int stride1,
01891                                           const NumericT *y2, unsigned int start2, unsigned int stride2,
01892                                           const NumericT *y3, unsigned int start3, unsigned int stride3,
01893                                           NumericT *group_results)
01894       {
01895         __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
01896         unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
01897         unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
01898         unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
01899 
01900         NumericT entry_x    = 0;
01901         NumericT group_sum0 = 0;
01902         NumericT group_sum1 = 0;
01903         NumericT group_sum2 = 0;
01904         NumericT group_sum3 = 0;
01905         for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
01906           entry_x     = x[i * stridex + startx];   // load only once from global memory!
01907           group_sum0 += entry_x * y0[i * stride0 + start0];
01908           group_sum1 += entry_x * y1[i * stride1 + start1];
01909           group_sum2 += entry_x * y2[i * stride2 + start2];
01910           group_sum3 += entry_x * y3[i * stride3 + start3];
01911         }
01912         tmp_buffer[threadIdx.x]                  = group_sum0;
01913         tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
01914         tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
01915         tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
01916 
01917         // parallel reduction
01918         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
01919           __syncthreads();
01920           if (threadIdx.x < stride) {
01921             tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
01922             tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
01923             tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
01924             tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
01925           }
01926         }
01927 
01928         // write result of group to group_results
01929         if (threadIdx.x == 0) {
01930           group_results[blockIdx.x                ] = tmp_buffer[0];
01931           group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
01932           group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
01933           group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
01934         }
01935       }
01936 
01937       // M = 8:
01938       template <typename NumericT>
01939       __global__ void inner_prod_8_kernel(const NumericT *x,  unsigned int startx, unsigned int stridex, unsigned int sizex,
01940                                           const NumericT *y0, unsigned int start0, unsigned int stride0,
01941                                           const NumericT *y1, unsigned int start1, unsigned int stride1,
01942                                           const NumericT *y2, unsigned int start2, unsigned int stride2,
01943                                           const NumericT *y3, unsigned int start3, unsigned int stride3,
01944                                           const NumericT *y4, unsigned int start4, unsigned int stride4,
01945                                           const NumericT *y5, unsigned int start5, unsigned int stride5,
01946                                           const NumericT *y6, unsigned int start6, unsigned int stride6,
01947                                           const NumericT *y7, unsigned int start7, unsigned int stride7,
01948                                           NumericT *group_results)
01949       {
01950         __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
01951         unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
01952         unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
01953         unsigned int vec_stop_index  = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
01954 
01955         NumericT entry_x    = 0;
01956         NumericT group_sum0 = 0;
01957         NumericT group_sum1 = 0;
01958         NumericT group_sum2 = 0;
01959         NumericT group_sum3 = 0;
01960         NumericT group_sum4 = 0;
01961         NumericT group_sum5 = 0;
01962         NumericT group_sum6 = 0;
01963         NumericT group_sum7 = 0;
01964         for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
01965           entry_x     = x[i * stridex + startx];   // load only once from global memory!
01966           group_sum0 += entry_x * y0[i * stride0 + start0];
01967           group_sum1 += entry_x * y1[i * stride1 + start1];
01968           group_sum2 += entry_x * y2[i * stride2 + start2];
01969           group_sum3 += entry_x * y3[i * stride3 + start3];
01970           group_sum4 += entry_x * y4[i * stride4 + start4];
01971           group_sum5 += entry_x * y5[i * stride5 + start5];
01972           group_sum6 += entry_x * y6[i * stride6 + start6];
01973           group_sum7 += entry_x * y7[i * stride7 + start7];
01974         }
01975         tmp_buffer[threadIdx.x]                  = group_sum0;
01976         tmp_buffer[threadIdx.x +     blockDim.x] = group_sum1;
01977         tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
01978         tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
01979         tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
01980         tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
01981         tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
01982         tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
01983 
01984         // parallel reduction
01985         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
01986           __syncthreads();
01987           if (threadIdx.x < stride) {
01988             tmp_buffer[threadIdx.x                 ] += tmp_buffer[threadIdx.x+stride                 ];
01989             tmp_buffer[threadIdx.x +     blockDim.x] += tmp_buffer[threadIdx.x+stride +     blockDim.x];
01990             tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
01991             tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
01992             tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
01993             tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
01994             tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
01995             tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
01996           }
01997         }
01998 
01999         // write result of group to group_results
02000         if (threadIdx.x == 0) {
02001           group_results[blockIdx.x                ] = tmp_buffer[0];
02002           group_results[blockIdx.x +     gridDim.x] = tmp_buffer[    blockDim.x];
02003           group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
02004           group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
02005           group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
02006           group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
02007           group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
02008           group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
02009         }
02010       }
02011 
02012       // reduces each chunk of VIENNACL_MDOT_WORKGROUP_SIZE entries of 'vec1' within one block and writes one entry of 'result' per block; used to finish the batched inner products above.
02013       template <typename T>
02014       __global__ void vector_multi_sum_kernel(
02015                 T const * vec1,
02016                 T * result,
02017                 unsigned int start_result,
02018                 unsigned int inc_result)
02019       {
02020         __shared__ T tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
02021 
02022         tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
02023 
02024         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02025         {
02026           __syncthreads();
02027           if (threadIdx.x < stride)
02028             tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
02029         }
02030 
02031         if (threadIdx.x == 0)
02032           result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
02033       }
02034 
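      // vector_multi_sum_kernel is launched with one block per inner product of the current batch
      // (e.g. <<<4, VIENNACL_MDOT_WORKGROUP_NUM>>> after inner_prod_4_kernel): block k loads the
      // 128 partial sums of the k-th product, reduces them in shared memory, and writes the final
      // value to result[start_result + k * inc_result]. The indexing relies on
      // VIENNACL_MDOT_WORKGROUP_SIZE == VIENNACL_MDOT_WORKGROUP_NUM (both 128), because the chunk
      // stride uses the former while the producing kernels lay out their results with gridDim.x.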
02035       template <typename T>
02036       void inner_prod_impl(vector_base<T> const & x,
02037                            vector_tuple<T> const & vec_tuple,
02038                            vector_base<T> & result)
02039       {
02040         typedef T        value_type;
02041 
02042         static viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
02043 
02044         vcl_size_t current_index = 0;
02045         while (vec_tuple.const_size() > current_index)
02046         {
02047           switch (vec_tuple.const_size() - current_index)
02048           {
02049             case 7:
02050             case 6:
02051             case 5:
02052             case 4:
02053             {
02054               vector_base<T> const & y0 = vec_tuple.const_at(current_index);
02055               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
02056               vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
02057               vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
02058 
02059               inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
02060                                     VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
02061                                                                      static_cast<unsigned int>(viennacl::traits::start(x)),
02062                                                                      static_cast<unsigned int>(viennacl::traits::stride(x)),
02063                                                                      static_cast<unsigned int>(viennacl::traits::size(x)),
02064                                                                      detail::cuda_arg<value_type>(y0),
02065                                                                      static_cast<unsigned int>(viennacl::traits::start(y0)),
02066                                                                      static_cast<unsigned int>(viennacl::traits::stride(y0)),
02067                                                                      detail::cuda_arg<value_type>(y1),
02068                                                                      static_cast<unsigned int>(viennacl::traits::start(y1)),
02069                                                                      static_cast<unsigned int>(viennacl::traits::stride(y1)),
02070                                                                      detail::cuda_arg<value_type>(y2),
02071                                                                      static_cast<unsigned int>(viennacl::traits::start(y2)),
02072                                                                      static_cast<unsigned int>(viennacl::traits::stride(y2)),
02073                                                                      detail::cuda_arg<value_type>(y3),
02074                                                                      static_cast<unsigned int>(viennacl::traits::start(y3)),
02075                                                                      static_cast<unsigned int>(viennacl::traits::stride(y3)),
02076                                                                      detail::cuda_arg<value_type>(temp)
02077                                                                     );
02078               VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
02079               vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
02080                                                                           detail::cuda_arg<value_type>(result),
02081                                                                           static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
02082                                                                           static_cast<unsigned int>(viennacl::traits::stride(result))
02083                                                                          );
02084               VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
02085             }
02086               current_index += 4;
02087               break;
02088             case 3:
02089             {
02090               vector_base<T> const & y0 = vec_tuple.const_at(current_index);
02091               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
02092               vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
02093 
02094               inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
02095                                     VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
02096                                                                      static_cast<unsigned int>(viennacl::traits::start(x)),
02097                                                                      static_cast<unsigned int>(viennacl::traits::stride(x)),
02098                                                                      static_cast<unsigned int>(viennacl::traits::size(x)),
02099                                                                      detail::cuda_arg<value_type>(y0),
02100                                                                      static_cast<unsigned int>(viennacl::traits::start(y0)),
02101                                                                      static_cast<unsigned int>(viennacl::traits::stride(y0)),
02102                                                                      detail::cuda_arg<value_type>(y1),
02103                                                                      static_cast<unsigned int>(viennacl::traits::start(y1)),
02104                                                                      static_cast<unsigned int>(viennacl::traits::stride(y1)),
02105                                                                      detail::cuda_arg<value_type>(y2),
02106                                                                      static_cast<unsigned int>(viennacl::traits::start(y2)),
02107                                                                      static_cast<unsigned int>(viennacl::traits::stride(y2)),
02108                                                                      detail::cuda_arg<value_type>(temp)
02109                                                                     );
02110               VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
02111               vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
02112                                                                           detail::cuda_arg<value_type>(result),
02113                                                                           static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
02114                                                                           static_cast<unsigned int>(viennacl::traits::stride(result))
02115                                                                          );
02116               VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
02117             }
02118               current_index += 3;
02119               break;
02120             case 2:
02121             {
02122               vector_base<T> const & y0 = vec_tuple.const_at(current_index);
02123               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
02124 
02125               inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
02126                                     VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
02127                                                                      static_cast<unsigned int>(viennacl::traits::start(x)),
02128                                                                      static_cast<unsigned int>(viennacl::traits::stride(x)),
02129                                                                      static_cast<unsigned int>(viennacl::traits::size(x)),
02130                                                                      detail::cuda_arg<value_type>(y0),
02131                                                                      static_cast<unsigned int>(viennacl::traits::start(y0)),
02132                                                                      static_cast<unsigned int>(viennacl::traits::stride(y0)),
02133                                                                      detail::cuda_arg<value_type>(y1),
02134                                                                      static_cast<unsigned int>(viennacl::traits::start(y1)),
02135                                                                      static_cast<unsigned int>(viennacl::traits::stride(y1)),
02136                                                                      detail::cuda_arg<value_type>(temp)
02137                                                                     );
02138               VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
02139               vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
02140                                                                           detail::cuda_arg<value_type>(result),
02141                                                                           static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
02142                                                                           static_cast<unsigned int>(viennacl::traits::stride(result))
02143                                                                          );
02144               VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
02145             }
02146               current_index += 2;
02147               break;
02148             case 1:
02149             {
02150               vector_base<T> const & y0 = vec_tuple.const_at(current_index);
02151               inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(x),
02152                                               static_cast<unsigned int>(viennacl::traits::start(x)),
02153                                               static_cast<unsigned int>(viennacl::traits::stride(x)),
02154                                               static_cast<unsigned int>(viennacl::traits::size(x)),
02155                                               detail::cuda_arg<value_type>(y0),
02156                                               static_cast<unsigned int>(viennacl::traits::start(y0)),
02157                                               static_cast<unsigned int>(viennacl::traits::stride(y0)),
02158                                               static_cast<unsigned int>(viennacl::traits::size(y0)),
02159                                               detail::cuda_arg<value_type>(temp)
02160                                              );
02161               VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
02162 
02163               vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
02164                                                   detail::cuda_arg<value_type>(result),
02165                                                   static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
02166                                                   static_cast<unsigned int>(viennacl::traits::stride(result))
02167                                                  );
02168               VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
02169             }
02170               current_index += 1;
02171               break;
02172 
02173             default:
02174             {
02175               vector_base<T> const & y0 = vec_tuple.const_at(current_index);
02176               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
02177               vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
02178               vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
02179               vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
02180               vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
02181               vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
02182               vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
02183 
02184               inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
02185                                     VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
02186                                                                      static_cast<unsigned int>(viennacl::traits::start(x)),
02187                                                                      static_cast<unsigned int>(viennacl::traits::stride(x)),
02188                                                                      static_cast<unsigned int>(viennacl::traits::size(x)),
02189                                                                      detail::cuda_arg<value_type>(y0),
02190                                                                      static_cast<unsigned int>(viennacl::traits::start(y0)),
02191                                                                      static_cast<unsigned int>(viennacl::traits::stride(y0)),
02192                                                                      detail::cuda_arg<value_type>(y1),
02193                                                                      static_cast<unsigned int>(viennacl::traits::start(y1)),
02194                                                                      static_cast<unsigned int>(viennacl::traits::stride(y1)),
02195                                                                      detail::cuda_arg<value_type>(y2),
02196                                                                      static_cast<unsigned int>(viennacl::traits::start(y2)),
02197                                                                      static_cast<unsigned int>(viennacl::traits::stride(y2)),
02198                                                                      detail::cuda_arg<value_type>(y3),
02199                                                                      static_cast<unsigned int>(viennacl::traits::start(y3)),
02200                                                                      static_cast<unsigned int>(viennacl::traits::stride(y3)),
02201                                                                      detail::cuda_arg<value_type>(y4),
02202                                                                      static_cast<unsigned int>(viennacl::traits::start(y4)),
02203                                                                      static_cast<unsigned int>(viennacl::traits::stride(y4)),
02204                                                                      detail::cuda_arg<value_type>(y5),
02205                                                                      static_cast<unsigned int>(viennacl::traits::start(y5)),
02206                                                                      static_cast<unsigned int>(viennacl::traits::stride(y5)),
02207                                                                      detail::cuda_arg<value_type>(y6),
02208                                                                      static_cast<unsigned int>(viennacl::traits::start(y6)),
02209                                                                      static_cast<unsigned int>(viennacl::traits::stride(y6)),
02210                                                                      detail::cuda_arg<value_type>(y7),
02211                                                                      static_cast<unsigned int>(viennacl::traits::start(y7)),
02212                                                                      static_cast<unsigned int>(viennacl::traits::stride(y7)),
02213                                                                      detail::cuda_arg<value_type>(temp)
02214                                                                     );
02215               VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
02216               vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
02217                                                                           detail::cuda_arg<value_type>(result),
02218                                                                           static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
02219                                                                           static_cast<unsigned int>(viennacl::traits::stride(result))
02220                                                                          );
02221               VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
02222             }
02223               current_index += 8;
02224               break;
02225           }
02226         }
02227       }
02228 
02229 #undef VIENNACL_MDOT_WORKGROUP_NUM
02230 #undef VIENNACL_MDOT_WORKGROUP_SIZE
02231 
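      // The tuple dispatcher above computes several inner products <x, y_j> in one pass:
      // each inner_prod_N_kernel leaves per-block partial sums in 'temp', and
      // vector_multi_sum_kernel then collapses those partials into consecutive entries
      // of 'result'. A minimal CPU reference sketch of the quantity being computed
      // (illustrative only; 'host_multi_inner_prod' is a hypothetical helper, not part
      // of ViennaCL, and requires <vector> and <cstddef>):
      //
      //   template <typename NumericT>
      //   void host_multi_inner_prod(std::vector<NumericT> const & x,
      //                              std::vector<std::vector<NumericT> > const & ys,
      //                              std::vector<NumericT> & result)
      //   {
      //     for (std::size_t j = 0; j < ys.size(); ++j)
      //     {
      //       NumericT sum = 0;
      //       for (std::size_t i = 0; i < x.size(); ++i)
      //         sum += x[i] * ys[j][i];        // dot product of x with the j-th vector
      //       result[j] = sum;                 // same value the CUDA path writes to result
      //     }
      //   }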
02233 
02234       template <typename T>
02235       __global__ void norm_kernel_floats(
02236                 const T * vec,
02237                 unsigned int start1,
02238                 unsigned int inc1,
02239                 unsigned int size1,
02240                 unsigned int norm_selector,
02241                 T * group_buffer)
02242       {
02243         __shared__ T tmp_buffer[128];
02244 
02245         T tmp = 0;
02246         unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
02247         unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
02248         unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
02249         group_stop = (group_stop > size1) ? size1 : group_stop;
02250 
02251         if (norm_selector == 1) //norm_1
02252         {
02253           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02254             tmp += fabs(vec[i*inc1 + start1]);
02255         }
02256         else if (norm_selector == 2) //norm_2
02257         {
02258           T vec_entry = 0;
02259           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02260           {
02261             vec_entry = vec[i*inc1 + start1];
02262             tmp += vec_entry * vec_entry;
02263           }
02264         }
02265         else if (norm_selector == 0) //norm_inf
02266         {
02267           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02268             tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
02269         }
02270 
02271         tmp_buffer[threadIdx.x] = tmp;
02272 
02273         if (norm_selector > 0) //parallel reduction for norm_1 or norm_2:
02274         {
02275           for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02276           {
02277             __syncthreads();
02278             if (threadIdx.x < stride)
02279               tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
02280           }
02281         }
02282         else
02283         {
02284           //norm_inf:
02285           for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02286           {
02287             __syncthreads();
02288             if (threadIdx.x < stride)
02289               tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
02290           }
02291         }
02292 
02293         if (threadIdx.x == 0)
02294           group_buffer[blockIdx.x] = tmp_buffer[0];
02295       }
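      // Stage-2 reduction sketch (illustrative only): each block of the kernel above
      // leaves one partial value in group_buffer; the final norm is then obtained either
      // by a second kernel (the vector_sum_kernel_launcher used in the *_impl functions
      // below) or by a host-side loop (the *_cpu functions below):
      //
      //   norm_1:   result = sum_b group_buffer[b]
      //   norm_2:   result = sqrt( sum_b group_buffer[b] )   // partials are sums of squares
      //   norm_inf: result = max_b group_buffer[b]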
02296 
02297       template <typename T>
02298       __global__ void norm_kernel_integers(
02299                 const T * vec,
02300                 unsigned int start1,
02301                 unsigned int inc1,
02302                 unsigned int size1,
02303                 unsigned int norm_selector,
02304                 T * group_buffer)
02305       {
02306         __shared__ T tmp_buffer[128];
02307 
02308         T tmp = 0;
02309         unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
02310         unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
02311         unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
02312         group_stop = (group_stop > size1) ? size1 : group_stop;
02313 
02314         if (norm_selector == 1) //norm_1
02315         {
02316           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02317             tmp += abs(vec[i*inc1 + start1]);
02318         }
02319         else if (norm_selector == 0) //norm_inf
02320         {
02321           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02322             tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
02323         }
02324 
02325         tmp_buffer[threadIdx.x] = tmp;
02326 
02327         if (norm_selector > 0) //parallel reduction (summation) for norm_1:
02328         {
02329           for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02330           {
02331             __syncthreads();
02332             if (threadIdx.x < stride)
02333               tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
02334           }
02335         }
02336         else
02337         {
02338           //norm_inf:
02339           for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02340           {
02341             __syncthreads();
02342             if (threadIdx.x < stride)
02343               tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
02344           }
02345         }
02346 
02347         if (threadIdx.x == 0)
02348           group_buffer[blockIdx.x] = tmp_buffer[0];
02349       }
02350 
02351       template <typename T>
02352       __global__ void norm_kernel_unsigned_integers(
02353                 const T * vec,
02354                 unsigned int start1,
02355                 unsigned int inc1,
02356                 unsigned int size1,
02357                 unsigned int norm_selector,
02358                 T * group_buffer)
02359       {
02360         __shared__ T tmp_buffer[128];
02361 
02362         T tmp = 0;
02363         unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
02364         unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
02365         unsigned int group_stop  = (blockIdx.x + 1) * work_per_thread * blockDim.x;
02366         group_stop = (group_stop > size1) ? size1 : group_stop;
02367 
02368         if (norm_selector == 1) //norm_1
02369         {
02370           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02371             tmp += vec[i*inc1 + start1];
02372         }
02373         else if (norm_selector == 0) //norm_inf
02374         {
02375           for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
02376             tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
02377         }
02378 
02379         tmp_buffer[threadIdx.x] = tmp;
02380 
02381         if (norm_selector > 0) //parallel reduction (summation) for norm_1:
02382         {
02383           for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02384           {
02385             __syncthreads();
02386             if (threadIdx.x < stride)
02387               tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
02388           }
02389         }
02390         else
02391         {
02392           //norm_inf:
02393           for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02394           {
02395             __syncthreads();
02396             if (threadIdx.x < stride)
02397               tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
02398           }
02399         }
02400 
02401         if (threadIdx.x == 0)
02402           group_buffer[blockIdx.x] = tmp_buffer[0];
02403       }
02404 
02406       namespace detail
02407       {
02408         struct norm_kernel_launcher_integers
02409         {
02410           template <typename T>
02411           static void apply(vector_base<T> const & vec1,
02412                             vector_base<T> & temp,
02413                             unsigned int option)
02414           {
02415             typedef T        value_type;
02416             norm_kernel_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
02417                                                static_cast<unsigned int>(viennacl::traits::start(vec1)),
02418                                                static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02419                                                static_cast<unsigned int>(viennacl::traits::size(vec1)),
02420                                                static_cast<unsigned int>(option),
02421                                                detail::cuda_arg<value_type>(temp)
02422                                               );
02423             VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
02424           }
02425         };
02426 
02427         struct norm_kernel_launcher_unsigned_integers
02428         {
02429           template <typename T>
02430           static void apply(vector_base<T> const & vec1,
02431                             vector_base<T> & temp,
02432                             unsigned int option)
02433           {
02434             typedef T        value_type;
02435             norm_kernel_unsigned_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
02436                                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
02437                                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02438                                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
02439                                                        static_cast<unsigned int>(option),
02440                                                        detail::cuda_arg<value_type>(temp)
02441                                                       );
02442             VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
02443           }
02444         };
02445 
02446 
02447         struct norm_kernel_launcher_floats
02448         {
02449           template <typename T>
02450           static void apply(vector_base<T> const & vec1,
02451                             vector_base<T> & temp,
02452                             unsigned int option)
02453           {
02454             typedef T        value_type;
02455             norm_kernel_floats<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
02456                                              static_cast<unsigned int>(viennacl::traits::start(vec1)),
02457                                              static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02458                                              static_cast<unsigned int>(viennacl::traits::size(vec1)),
02459                                              static_cast<unsigned int>(option),
02460                                              detail::cuda_arg<value_type>(temp)
02461                                             );
02462             VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
02463           }
02464         };
02465 
02466         template <typename T>
02467         struct norm_kernel_launcher : public norm_kernel_launcher_integers {};
02468 
02469         template <>
02470         struct norm_kernel_launcher<unsigned char>  : public norm_kernel_launcher_unsigned_integers {};
02471 
02472         template <>
02473         struct norm_kernel_launcher<unsigned short>  : public norm_kernel_launcher_unsigned_integers {};
02474 
02475         template <>
02476         struct norm_kernel_launcher<unsigned int>  : public norm_kernel_launcher_unsigned_integers {};
02477 
02478         template <>
02479         struct norm_kernel_launcher<unsigned long>  : public norm_kernel_launcher_unsigned_integers {};
02480 
02481         template <>
02482         struct norm_kernel_launcher<float>  : public norm_kernel_launcher_floats {};
02483 
02484         template <>
02485         struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
02486 
02487       }
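      // Dispatch sketch (illustrative only): norm_kernel_launcher<T> defaults to the
      // signed-integer kernel and is specialized for unsigned integer and floating-point
      // types, so e.g.
      //
      //   detail::norm_kernel_launcher<int>::apply(v, temp, 1);            // -> norm_kernel_integers
      //   detail::norm_kernel_launcher<unsigned int>::apply(v, temp, 1);   // -> norm_kernel_unsigned_integers
      //   detail::norm_kernel_launcher<float>::apply(v, temp, 2);          // -> norm_kernel_floats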
02496       template <typename T>
02497       void norm_1_impl(vector_base<T> const & vec1,
02498                        scalar<T> & result)
02499       {
02500         typedef T        value_type;
02501 
02502         vcl_size_t work_groups = 128;
02503         viennacl::vector<value_type> temp(work_groups);
02504 
02505         detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
02506         detail::vector_sum_kernel_launcher<T>::apply(temp, 1, result);
02507       }
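      // Usage sketch (illustrative only; this backend function is normally reached via
      // the viennacl::linalg frontend rather than called directly):
      //
      //   viennacl::vector<float> v(1000);             // filled elsewhere
      //   viennacl::scalar<float> s = 0.0f;
      //   viennacl::linalg::cuda::norm_1_impl(v, s);   // s now holds sum_i |v[i]|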
02508 
02514       template <typename T>
02515       void norm_1_cpu(vector_base<T> const & vec1,
02516                       T & result)
02517       {
02518         typedef T        value_type;
02519 
02520         vcl_size_t work_groups = 128;
02521         viennacl::vector<value_type> temp(work_groups);
02522 
02523         detail::norm_kernel_launcher<T>::apply(vec1, temp, 1);
02524 
02525         // Now copy partial results from GPU back to CPU and run reduction there:
02526         std::vector<value_type> temp_cpu(work_groups);
02527         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
02528 
02529         result = 0;
02530         for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
02531           result += *it;
02532       }
02533 
02535 
02541       template <typename T>
02542       void norm_2_impl(vector_base<T> const & vec1,
02543                        scalar<T> & result)
02544       {
02545         typedef T       value_type;
02546 
02547         vcl_size_t work_groups = 128;
02548         viennacl::vector<value_type> temp(work_groups);
02549 
02550         detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
02551 
02552         detail::vector_sum_kernel_launcher<T>::apply(temp, 2, result);
02553       }
02554 
02560       template <typename T>
02561       void norm_2_cpu(vector_base<T> const & vec1,
02562                       T & result)
02563       {
02564         typedef T        value_type;
02565 
02566         vcl_size_t work_groups = 128;
02567         viennacl::vector<value_type> temp(work_groups);
02568 
02569         detail::norm_kernel_launcher<T>::apply(vec1, temp, 2);
02570 
02571         std::vector<value_type> temp_cpu(work_groups);
02572         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
02573 
02574         result = 0;
02575         for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
02576           result += *it;
02577         result = std::sqrt(result);
02578       }
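      // Worked example for this norm_2 path (illustrative only): for v = (3, -4) the
      // device partials accumulate 3^2 + 4^2 = 25, and the host-side std::sqrt above
      // yields result = 5.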
02579 
02580 
02582 
02588       template <typename T>
02589       void norm_inf_impl(vector_base<T> const & vec1,
02590                          scalar<T> & result)
02591       {
02592         typedef T      value_type;
02593 
02594         vcl_size_t work_groups = 128;
02595         viennacl::vector<value_type> temp(work_groups);
02596 
02597         detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
02598         detail::vector_sum_kernel_launcher<T>::apply(temp, 0, result);
02599       }
02600 
02601 
02602 
02608       template <typename T>
02609       void norm_inf_cpu(vector_base<T> const & vec1,
02610                         T & result)
02611       {
02612         typedef T        value_type;
02613 
02614         vcl_size_t work_groups = 128;
02615         viennacl::vector<value_type> temp(work_groups);
02616 
02617         detail::norm_kernel_launcher<T>::apply(vec1, temp, 0);
02618 
02619         std::vector<value_type> temp_cpu(work_groups);
02620         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
02621 
02622         result = 0;
02623         for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
02624           result = std::max(result, *it);
02625       }
02626 
02627 
02629 
02630 
02631 
02632       //index_norm_inf:
02633 
02634       // fixes the problem of not having (f)abs available in a consistent manner
02635       template <typename T>
02636       __device__ T              cuda_abs(T val) { return (val < 0) ? -val : val; }
02637       __device__ inline unsigned long  cuda_abs(unsigned long  val) { return val; }
02638       __device__ inline unsigned int   cuda_abs(unsigned int   val) { return val; }
02639       __device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
02640       __device__ inline unsigned char  cuda_abs(unsigned char  val) { return val; }
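      // For example, cuda_abs(-3) == 3 via the generic template, while the unsigned
      // overloads are the identity (e.g. cuda_abs(7u) == 7u), which also avoids the
      // meaningless 'val < 0' comparison on unsigned types.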
02641 
02642       template <typename T>
02643       __global__ void index_norm_inf_kernel(const T * vec,
02644                                             unsigned int start1,
02645                                             unsigned int inc1,
02646                                             unsigned int size1,
02647                                             unsigned int * result)
02648       {
02649         __shared__ T float_buffer[128];
02650         __shared__ unsigned int index_buffer[128];
02651 
02652         float_buffer[threadIdx.x] = 0;
02653         index_buffer[threadIdx.x] = 0;
02654 
02655         //step 1: fill buffer:
02656         T cur_max = (T)0;
02657         T tmp;
02658         for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
02659         {
02660           tmp = vec[i*inc1+start1];
02661           tmp = cuda_abs(tmp);
02662           if (cur_max < tmp)
02663           {
02664             float_buffer[threadIdx.x] = tmp;
02665             index_buffer[threadIdx.x] = i;
02666             cur_max = tmp;
02667           }
02668         }
02669 
02670         //step 2: parallel reduction:
02671         for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
02672         {
02673           __syncthreads();
02674           if (threadIdx.x < stride)
02675           {
02677             //keep the candidate with the larger absolute value (and its index)
02677             if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
02678             {
02679               index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
02680               float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
02681             }
02682           }
02683         }
02684 
02685         if (threadIdx.x == 0)
02686           *result = index_buffer[0];
02687       }
02688 
02689       // This function has to return a CPU scalar; otherwise, statements like
02690       //   vcl_rhs[index_norm_inf(vcl_rhs)]
02691       // would be ambiguous
02697       template <typename T>
02698       vcl_size_t index_norm_inf(vector_base<T> const & vec1)
02699       {
02700         typedef T       value_type;
02701 
02702         viennacl::backend::mem_handle h;
02703         viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));
02704 
02705         index_norm_inf_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(vec1),
02706                                           static_cast<unsigned int>(viennacl::traits::start(vec1)),
02707                                           static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02708                                           static_cast<unsigned int>(viennacl::traits::size(vec1)),
02709                                           //detail::cuda_arg<unsigned int>(h.cuda_handle())
02710                                           reinterpret_cast<unsigned int *>(h.cuda_handle().get())
02711                                         );
02712         VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");
02713 
02714         unsigned int ret = 0;
02715         viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
02716         return static_cast<vcl_size_t>(ret);
02717       }
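      // Usage sketch (illustrative only): returns the index of the entry with the
      // largest absolute value, e.g. for v = (1, -9, 4) the call yields 1:
      //
      //   viennacl::vector<double> v(3);   // filled elsewhere
      //   vcl_size_t idx = viennacl::linalg::cuda::index_norm_inf(v);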
02718 
02720 
02721       template <typename T>
02722       __global__ void plane_rotation_kernel(
02723                 T * vec1,
02724                 unsigned int start1,
02725                 unsigned int inc1,
02726                 unsigned int size1,
02727                 T * vec2,
02728                 unsigned int start2,
02729                 unsigned int inc2,
02730                 unsigned int size2,
02731                 T alpha,
02732                 T beta)
02733       {
02734         T tmp1 = 0;
02735         T tmp2 = 0;
02736 
02737         for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
02738         {
02739           tmp1 = vec1[i*inc1+start1];
02740           tmp2 = vec2[i*inc2+start2];
02741 
02742           vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
02743           vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
02744         }
02745 
02746       }
02747 
02757       template <typename T>
02758       void plane_rotation(vector_base<T> & vec1,
02759                           vector_base<T> & vec2,
02760                           T alpha, T beta)
02761       {
02762         typedef T     value_type;
02763 
02764         value_type temporary_alpha = 0;
02765         if (viennacl::is_cpu_scalar<value_type>::value)
02766           temporary_alpha = alpha;
02767 
02768         value_type temporary_beta = 0;
02769         if (viennacl::is_cpu_scalar<value_type>::value)
02770           temporary_beta = beta;
02771 
02772         plane_rotation_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
02773                                             static_cast<unsigned int>(viennacl::traits::start(vec1)),
02774                                             static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02775                                             static_cast<unsigned int>(viennacl::traits::size(vec1)),
02776                                             detail::cuda_arg<value_type>(vec2),
02777                                             static_cast<unsigned int>(viennacl::traits::start(vec2)),
02778                                             static_cast<unsigned int>(viennacl::traits::stride(vec2)),
02779                                             static_cast<unsigned int>(viennacl::traits::size(vec2)),
02780                                             detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
02781                                             detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
02782         VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
02783       }
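      // Usage sketch (illustrative only): with alpha = cos(theta) and beta = sin(theta)
      // this applies the Givens rotation
      //   (x_i, y_i) <- (alpha*x_i + beta*y_i,  alpha*y_i - beta*x_i)
      // elementwise to both vectors:
      //
      //   viennacl::vector<float> x(64), y(64);   // filled elsewhere
      //   viennacl::linalg::cuda::plane_rotation(x, y, std::cos(0.3f), std::sin(0.3f));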
02784 
02785     } //namespace cuda
02786   } //namespace linalg
02787 } //namespace viennacl
02788 
02789 
02790 #endif