ViennaCL - The Vienna Computing Library  1.5.2
viennacl/linalg/cuda/matrix_operations.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
00002 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include "viennacl/forwards.h"
00026 #include "viennacl/scalar.hpp"
00027 #include "viennacl/vector.hpp"
00028 #include "viennacl/vector_proxy.hpp"
00029 #include "viennacl/tools/tools.hpp"
00030 #include "viennacl/meta/enable_if.hpp"
00031 #include "viennacl/meta/predicate.hpp"
00032 #include "viennacl/meta/result_of.hpp"
00033 #include "viennacl/traits/size.hpp"
00034 #include "viennacl/traits/start.hpp"
00035 #include "viennacl/traits/handle.hpp"
00036 #include "viennacl/traits/stride.hpp"
00037 
00038 #include "viennacl/linalg/cuda/common.hpp"
00039 
00040 #include "viennacl/linalg/cuda/vector_operations.hpp"
00041 #include "viennacl/linalg/cuda/matrix_operations_row.hpp"
00042 #include "viennacl/linalg/cuda/matrix_operations_col.hpp"
00043 #include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
00044 #include "viennacl/linalg/cuda/matrix_operations_prod.hpp"
00045 
00046 namespace viennacl
00047 {
00048   namespace linalg
00049   {
00050     namespace cuda
00051     {
00052       //
00053       // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
00054       //
00055 
00056       template <typename NumericT, typename F,
00057                 typename ScalarType1>
00058       void am(matrix_base<NumericT, F> & mat1,
00059               matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
00060       {
              // Host-side launcher for am_row_kernel / am_col_kernel.
              // NOTE(review): judging by the name this is mat1 = alpha * mat2; the kernels are defined in another header -- confirm there.
              //   mat1   matrix whose start, stride, size and internal_size (both dimensions) are forwarded first
              //   mat2   second matrix; only start, stride and internal_size are forwarded (no sizes)
              //   alpha  scalar operand of type ScalarType1
              //   len_alpha, reciprocal_alpha, flip_sign_alpha
              //          combined into one unsigned option word by detail::make_options()
              // The launch configuration is fixed at <<<128, 128>>> for both layouts.
00061         typedef NumericT        value_type;
00062 
00063         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00064 
              // When ScalarType1 is a CPU scalar, alpha is converted to value_type and kept in a local.
              // NOTE(review): detail::arg_reference() presumably selects this local in that case -- helper not visible here.
00065         value_type temporary_alpha = 0;
00066         if (viennacl::is_cpu_scalar<ScalarType1>::value)
00067           temporary_alpha = alpha;
00068 
              // The layout tag F selects the kernel; both branches pass the same argument list.
00069         if (viennacl::is_row_major<F>::value)
00070         {
00071           am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
00072                                       static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
00073                                       static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
00074                                       static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
00075                                       static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
00076 
00077                                       detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00078                                       options_alpha,
00079                                       detail::cuda_arg<value_type>(mat2),
00080                                       static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
00081                                       static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
00082                                       static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
00083                                     );
00084           VIENNACL_CUDA_LAST_ERROR_CHECK("am_row_kernel");
00085         }
00086         else
00087         {
00088           am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
00089                                       static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
00090                                       static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
00091                                       static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
00092                                       static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
00093 
00094                                       detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00095                                       options_alpha,
00096                                       detail::cuda_arg<value_type>(mat2),
00097                                       static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
00098                                       static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
00099                                       static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2))
00100                                     );
00101           VIENNACL_CUDA_LAST_ERROR_CHECK("am_col_kernel");
00102         }
00103       }
00104 
00105 
00106       template <typename NumericT, typename F,
00107                 typename ScalarType1, typename ScalarType2>
00108       void ambm(matrix_base<NumericT, F> & mat1,
00109                 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00110                 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00111       {
              // Host-side launcher for ambm_row_kernel / ambm_col_kernel.
              // NOTE(review): judging by the name this is mat1 = alpha * mat2 + beta * mat3; the kernels are defined elsewhere -- confirm there.
              //   mat1          matrix whose start, stride, size and internal_size are forwarded first
              //   mat2, alpha   first matrix/scalar pair; for mat2 only start, stride and internal_size are forwarded
              //   mat3, beta    second matrix/scalar pair, forwarded the same way as mat2/alpha
              //   len_*, reciprocal_*, flip_sign_*
              //                 combined per scalar into an unsigned option word by detail::make_options()
              // The launch configuration is fixed at <<<128, 128>>> for both layouts.
00112         typedef NumericT        value_type;
00113 
00114         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00115 
              // CPU scalars are converted to value_type and kept in a local that is handed to detail::arg_reference() below.
00116         value_type temporary_alpha = 0;
00117         if (viennacl::is_cpu_scalar<ScalarType1>::value)
00118           temporary_alpha = alpha;
00119 
00120 
00121         unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00122 
              // Same treatment for beta as for alpha above.
00123         value_type temporary_beta = 0;
00124         if (viennacl::is_cpu_scalar<ScalarType2>::value)
00125           temporary_beta = beta;
00126 
00127 
              // The layout tag F selects the kernel; both branches pass the same argument list.
00128         if (viennacl::is_row_major<F>::value)
00129         {
00130           ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
00131                                         static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
00132                                         static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
00133                                         static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
00134                                         static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
00135 
00136                                         detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00137                                         options_alpha,
00138                                         detail::cuda_arg<value_type>(mat2),
00139                                         static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
00140                                         static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
00141                                         static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
00142 
00143                                         detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
00144                                         options_beta,
00145                                         detail::cuda_arg<value_type>(mat3),
00146                                         static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
00147                                         static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
00148                                         static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
00149                                       );
00150           VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_row_kernel");
00151         }
00152         else
00153         {
00154           ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
00155                                         static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
00156                                         static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
00157                                         static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
00158                                         static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
00159 
00160                                         detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00161                                         options_alpha,
00162                                         detail::cuda_arg<value_type>(mat2),
00163                                         static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
00164                                         static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
00165                                         static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
00166 
00167                                         detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
00168                                         options_beta,
00169                                         detail::cuda_arg<value_type>(mat3),
00170                                         static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
00171                                         static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
00172                                         static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
00173                                       );
00174           VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_col_kernel");
00175         }
00176 
00177       }
00178 
00179 
00180       template <typename NumericT, typename F,
00181                 typename ScalarType1, typename ScalarType2>
00182       void ambm_m(matrix_base<NumericT, F> & mat1,
00183                   matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00184                   matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00185       {
              // Host-side launcher for ambm_m_row_kernel / ambm_m_col_kernel.
              // Takes exactly the same arguments as ambm() and forwards them in the same order; only the kernels differ.
              // NOTE(review): judging by the name this is mat1 += alpha * mat2 + beta * mat3; the kernels are defined elsewhere -- confirm there.
              //   mat1          matrix whose start, stride, size and internal_size are forwarded first
              //   mat2, alpha   first matrix/scalar pair; for mat2 only start, stride and internal_size are forwarded
              //   mat3, beta    second matrix/scalar pair, forwarded the same way as mat2/alpha
              //   len_*, reciprocal_*, flip_sign_*
              //                 combined per scalar into an unsigned option word by detail::make_options()
              // The launch configuration is fixed at <<<128, 128>>> for both layouts.
00186         typedef NumericT        value_type;
00187 
00188         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00189 
              // CPU scalars are converted to value_type and kept in a local that is handed to detail::arg_reference() below.
00190         value_type temporary_alpha = 0;
00191         if (viennacl::is_cpu_scalar<ScalarType1>::value)
00192           temporary_alpha = alpha;
00193 
00194 
00195         unsigned int options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00196 
              // Same treatment for beta as for alpha above.
00197         value_type temporary_beta = 0;
00198         if (viennacl::is_cpu_scalar<ScalarType2>::value)
00199           temporary_beta = beta;
00200 
00201 
              // The layout tag F selects the kernel; both branches pass the same argument list.
00202         if (viennacl::is_row_major<F>::value)
00203         {
00204           ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
00205                                           static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
00206                                           static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
00207                                           static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
00208                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
00209 
00210                                           detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00211                                           options_alpha,
00212                                           detail::cuda_arg<value_type>(mat2),
00213                                           static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
00214                                           static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
00215                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
00216 
00217                                           detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
00218                                           options_beta,
00219                                           detail::cuda_arg<value_type>(mat3),
00220                                           static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
00221                                           static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
00222                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
00223                                         );
00224           VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_row_kernel");
00225         }
00226         else
00227         {
00228           ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
00229                                           static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
00230                                           static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
00231                                           static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
00232                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
00233 
00234                                           detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
00235                                           options_alpha,
00236                                           detail::cuda_arg<value_type>(mat2),
00237                                           static_cast<unsigned int>(viennacl::traits::start1(mat2)),           static_cast<unsigned int>(viennacl::traits::start2(mat2)),
00238                                           static_cast<unsigned int>(viennacl::traits::stride1(mat2)),          static_cast<unsigned int>(viennacl::traits::stride2(mat2)),
00239                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat2)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat2)),
00240 
00241                                           detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
00242                                           options_beta,
00243                                           detail::cuda_arg<value_type>(mat3),
00244                                           static_cast<unsigned int>(viennacl::traits::start1(mat3)),           static_cast<unsigned int>(viennacl::traits::start2(mat3)),
00245                                           static_cast<unsigned int>(viennacl::traits::stride1(mat3)),          static_cast<unsigned int>(viennacl::traits::stride2(mat3)),
00246                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat3)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat3))
00247                                         );
00248           VIENNACL_CUDA_LAST_ERROR_CHECK("ambm_m_col_kernel");
00249         }
00250 
00251       }
00252 
00253 
00254 
00255 
00256       template <typename NumericT, typename F>
00257       void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
00258       {
              // Host-side launcher for matrix_row_assign_kernel / matrix_col_assign_kernel, passing the value s.
              // NOTE(review): the kernels presumably write s to every addressed entry -- kernel bodies are not visible here.
              //   mat    matrix whose start, stride and internal_size are forwarded
              //   s      value handed to the kernel as its last argument
              //   clear  selects the extents given to the kernel: internal_size1/2 when true, size1/2 when false (default)
              // The launch configuration is fixed at <<<128, 128>>> for both layouts.
00259         typedef NumericT        value_type;
00260         value_type alpha = s;
00261 
              // Unlike the other launch arguments, these two are narrowed to unsigned int implicitly (no static_cast).
00262         unsigned int s1  = clear ? viennacl::traits::internal_size1(mat) : viennacl::traits::size1(mat);
00263         unsigned int s2  = clear ? viennacl::traits::internal_size2(mat) : viennacl::traits::size2(mat);
00264 
              // The layout tag F selects the kernel; both branches pass the same argument list.
00265         if (viennacl::is_row_major<F>::value)
00266         {
00267 
00268           matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
00269                                                  static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
00270                                                  static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
00271                                                  s1,                                                                 s2,
00272                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
00273                                                  alpha);
00274           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_assign_kernel");
00275         }
00276         else
00277         {
00278           matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
00279                                                   static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
00280                                                   static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
00281                                                   s1,                                                                 s2,
00282                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
00283                                                   alpha);
00284           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_assign_kernel");
00285         }
00286       }
00287 
00288       template <typename NumericT, typename F>
00289       void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s)
00290       {
              // Host-side launcher for matrix_row_diagonal_assign_kernel / matrix_col_diagonal_assign_kernel, passing the value s.
              // NOTE(review): by name the kernels set the diagonal entries of mat to s -- kernel bodies are not visible here.
              //   mat  matrix whose start, stride, size and internal_size are forwarded
              //   s    value handed to the kernel as its last argument
              // The launch configuration is fixed at <<<128, 128>>> for both layouts.
00291         typedef NumericT        value_type;
00292         value_type alpha = s;
00293 
              // The layout tag F selects the kernel; both branches pass the same argument list.
00294         if (viennacl::is_row_major<F>::value)
00295         {
00296           matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
00297                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
00298                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
00299                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
00300                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
00301                                                           alpha);
00302           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_diagonal_assign_kernel");
00303         }
00304         else
00305         {
00306           matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
00307                                                           static_cast<unsigned int>(viennacl::traits::start1(mat)),           static_cast<unsigned int>(viennacl::traits::start2(mat)),
00308                                                           static_cast<unsigned int>(viennacl::traits::stride1(mat)),          static_cast<unsigned int>(viennacl::traits::stride2(mat)),
00309                                                           static_cast<unsigned int>(viennacl::traits::size1(mat)),            static_cast<unsigned int>(viennacl::traits::size2(mat)),
00310                                                           static_cast<unsigned int>(viennacl::traits::internal_size1(mat)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
00311                                                           alpha);
00312           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_diagonal_assign_kernel");
00313         }
00314       }
00315 
00316 
00317       template <typename NumericT, typename F>
00318       void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat)
00319       {
              // Zeroes mat via matrix_assign() and then launches av_kernel with the k-th diagonal of mat
              // described as a strided 1-D range (start offset, stride, length) and vec as the second operand.
              // NOTE(review): av_kernel is defined elsewhere; with factor 1 and options 0 it presumably copies vec onto the diagonal -- confirm.
              //   vec  vector operand; its size determines how many diagonal entries are addressed
              //   k    diagonal index: k < 0 starts at row -k, column 0; otherwise at row 0, column k
              //   mat  matrix operand
00320         typedef NumericT        value_type;
00321 
00322         // Step 1: assign zero matrix:
00323         matrix_assign(mat, NumericT(0));
00324 
00325         // Step 2: Assign diagonal:
00326         unsigned int options_alpha = 0;
00327 
00328         vcl_size_t mat_start = 0;
00329         vcl_size_t mat_stride = 0;
00330         vcl_size_t mat_size = viennacl::traits::size(vec);
00331         if (viennacl::is_row_major<F>::value)
00332         {
00333           vcl_size_t first_row_index = 0;
00334           vcl_size_t first_col_index = 0;
00335           if (k < 0)
00336             first_row_index = vcl_size_t(-k);
00337           else
00338             first_col_index = vcl_size_t(k);
                // Row-major addressing: the row term is scaled by internal_size2; one diagonal step advances one row and one column.
00339           mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
00340                        + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
00341           mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
00342         }
00343         else
00344         {
00345           vcl_size_t first_row_index = 0;
00346           vcl_size_t first_col_index = 0;
00347           if (k < 0)
00348             first_row_index = vcl_size_t(-k);
00349           else
00350             first_col_index = vcl_size_t(k);
                // Column-major addressing: the column term is scaled by internal_size1; one diagonal step advances one row and one column.
00351           mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
00352                        + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
00353           mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
00354         }
00355 
              // First operand: mat with the diagonal's start/stride/length; then factor 1, options 0; then vec with its start/stride.
00356         av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
00357                                 static_cast<unsigned int>(mat_start),
00358                                 static_cast<unsigned int>(mat_stride),
00359                                 static_cast<unsigned int>(mat_size),
00360 
00361                                 detail::cuda_arg<value_type>(NumericT(1)),
00362                                 options_alpha,
00363                                 detail::cuda_arg<value_type>(vec),
00364                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
00365                                 static_cast<unsigned int>(viennacl::traits::stride(vec)) );
00366         VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
00367       }
00368 
00369       template <typename NumericT, typename F>
00370       void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec)
00371       {
              // Launches av_kernel with vec as the first operand and the k-th diagonal of mat,
              // described as a strided 1-D range (start offset, stride), as the second operand.
              // NOTE(review): av_kernel is defined elsewhere; with factor 1 and options 0 it presumably copies the diagonal into vec -- confirm.
              //   mat  matrix operand
              //   k    diagonal index: k < 0 starts at row -k, column 0; otherwise at row 0, column k
              //   vec  vector operand; its size is the element count passed to the kernel
00372         typedef NumericT        value_type;
00373 
00374         unsigned int options_alpha = 0;
00375 
00376         vcl_size_t mat_start = 0;
00377         vcl_size_t mat_stride = 0;
00378         if (viennacl::is_row_major<F>::value)
00379         {
00380           vcl_size_t first_row_index = 0;
00381           vcl_size_t first_col_index = 0;
00382           if (k < 0)
00383             first_row_index = vcl_size_t(-k);
00384           else
00385             first_col_index = vcl_size_t(k);
                // Row-major addressing: the row term is scaled by internal_size2; one diagonal step advances one row and one column.
00386           mat_start  =  (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
00387                        + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat);
00388           mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat);
00389         }
00390         else
00391         {
00392           vcl_size_t first_row_index = 0;
00393           vcl_size_t first_col_index = 0;
00394           if (k < 0)
00395             first_row_index = vcl_size_t(-k);
00396           else
00397             first_col_index = vcl_size_t(k);
                // Column-major addressing: the column term is scaled by internal_size1; one diagonal step advances one row and one column.
00398           mat_start  =    viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
00399                        + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
00400           mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat);
00401         }
00402 
              // First operand: vec with its start/stride/size; then factor 1, options 0; then mat with the diagonal's start/stride.
00403         av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
00404                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
00405                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
00406                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
00407 
00408                                 detail::cuda_arg<value_type>(NumericT(1)),
00409                                 options_alpha,
00410                                 detail::cuda_arg<value_type>(mat),
00411                                 static_cast<unsigned int>(mat_start),
00412                                 static_cast<unsigned int>(mat_stride));
00413         VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
00414       }
00415 
00416       template <typename NumericT, typename F>
00417       void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec)
00418       {
              // Launches av_kernel with vec as the first operand and row i of mat,
              // described as a strided 1-D range (start offset, stride), as the second operand.
              // NOTE(review): av_kernel is defined elsewhere; with factor 1 and options 0 it presumably copies the row into vec -- confirm.
              //   mat  matrix operand
              //   i    row index, in units of the matrix' own stride1
              //   vec  vector operand; its size is the element count passed to the kernel
00419         typedef NumericT        value_type;
00420 
00421         unsigned int options_alpha = 0;
00422 
00423         vcl_size_t mat_start = 0;
00424         vcl_size_t mat_stride = 0;
00425         if (viennacl::is_row_major<F>::value)
00426         {
                // Row-major: the row offset is scaled by internal_size2; consecutive row entries are stride2 apart.
00427           mat_start  = (viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat);
00428           mat_stride = viennacl::traits::stride2(mat);
00429         }
00430         else
00431         {
                // Column-major: the column offset is scaled by internal_size1; consecutive row entries are stride2 * internal_size1 apart.
00432           mat_start  = viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat);
00433           mat_stride = viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat);
00434         }
00435 
              // First operand: vec with its start/stride/size; then factor 1, options 0; then mat with the row's start/stride.
00436         av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
00437                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
00438                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
00439                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
00440 
00441                                 detail::cuda_arg<value_type>(NumericT(1)),
00442                                 options_alpha,
00443                                 detail::cuda_arg<value_type>(mat),
00444                                 static_cast<unsigned int>(mat_start),
00445                                 static_cast<unsigned int>(mat_stride));
00446         VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
00447       }
00448 
00449       template <typename NumericT, typename F>
00450       void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec)
00451       {
              // Launches av_kernel with vec as the first operand and column j of mat,
              // described as a strided 1-D range (start offset, stride), as the second operand.
              // NOTE(review): av_kernel is defined elsewhere; with factor 1 and options 0 it presumably copies the column into vec -- confirm.
              //   mat  matrix operand
              //   j    column index, in units of the matrix' own stride2
              //   vec  vector operand; its size is the element count passed to the kernel
              //
              // Fix: walking down a column advances the ROW index, so the step between consecutive
              // entries must be built from stride1 (as in matrix_row's start offset and the diagonal
              // strides), not from stride2. The previous code was only correct when stride1 == stride2.
00452         typedef NumericT        value_type;
00453 
00454         unsigned int options_alpha = 0;
00455 
00456         vcl_size_t mat_start = 0;
00457         vcl_size_t mat_stride = 0;
00458         if (viennacl::is_row_major<F>::value)
00459         {
                // Row-major: one row step is stride1 rows of internal_size2 entries each.
00460           mat_start  = viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat);
00461           mat_stride = viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat);
00462         }
00463         else
00464         {
                // Column-major: entries of one column are adjacent in memory, stride1 apart.
00465           mat_start  = viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat);
00466           mat_stride = viennacl::traits::stride1(mat);
00467         }
00468 
              // First operand: vec with its start/stride/size; then factor 1, options 0; then mat with the column's start/stride.
00469         av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
00470                                 static_cast<unsigned int>(viennacl::traits::start(vec)),
00471                                 static_cast<unsigned int>(viennacl::traits::stride(vec)),
00472                                 static_cast<unsigned int>(viennacl::traits::size(vec)),
00473 
00474                                 detail::cuda_arg<value_type>(NumericT(1)),
00475                                 options_alpha,
00476                                 detail::cuda_arg<value_type>(mat),
00477                                 static_cast<unsigned int>(mat_start),
00478                                 static_cast<unsigned int>(mat_stride));
00479         VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
00480       }
00481 
00482 
00483       //
00485       //
00486 
00487 
00488       template <typename T, typename F, typename OP>
00489       void element_op(matrix_base<T, F> & A,
00490                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
00491       {
00492         typedef T        value_type;
00493 
00494         unsigned int op_type = 2; //0: product, 1: division, 2: power
00495         if (viennacl::is_division<OP>::value)
00496           op_type = 1;
00497         else if (viennacl::is_product<OP>::value)
00498           op_type = 0;
00499 
00500         if (viennacl::is_row_major<F>::value)
00501         {
00502           element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00503                                               static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00504                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00505                                               static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00506                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00507 
00508                                               detail::cuda_arg<value_type>(proxy.lhs()),
00509                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00510                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00511                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
00512 
00513                                               detail::cuda_arg<value_type>(proxy.rhs()),
00514                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
00515                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
00516                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
00517 
00518                                               op_type
00519                                             );
00520           VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
00521         }
00522         else
00523         {
00524           element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00525                                               static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00526                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00527                                               static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00528                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00529 
00530                                               detail::cuda_arg<value_type>(proxy.lhs()),
00531                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00532                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00533                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
00534 
00535                                               detail::cuda_arg<value_type>(proxy.rhs()),
00536                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
00537                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
00538                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
00539 
00540                                               op_type
00541                                             );
00542           VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
00543         }
00544       }
00545 
00546       template <typename F, typename OP>
00547       void element_op(matrix_base<float, F> & A,
00548                       matrix_expression<const matrix_base<float, F>, const matrix_base<float, F>, op_element_binary<OP> > const & proxy)
00549       {
00550         typedef float        value_type;
00551 
00552         unsigned int op_type = 2; //0: product, 1: division, 2: power
00553         if (viennacl::is_division<OP>::value)
00554           op_type = 1;
00555         else if (viennacl::is_product<OP>::value)
00556           op_type = 0;
00557 
00558         if (viennacl::is_row_major<F>::value)
00559         {
00560           element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00561                                               static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00562                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00563                                               static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00564                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00565 
00566                                               detail::cuda_arg<value_type>(proxy.lhs()),
00567                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00568                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00569                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
00570 
00571                                               detail::cuda_arg<value_type>(proxy.rhs()),
00572                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
00573                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
00574                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
00575 
00576                                               op_type
00577                                             );
00578           VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
00579         }
00580         else
00581         {
00582           element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00583                                               static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00584                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00585                                               static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00586                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00587 
00588                                               detail::cuda_arg<value_type>(proxy.lhs()),
00589                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00590                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00591                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
00592 
00593                                               detail::cuda_arg<value_type>(proxy.rhs()),
00594                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
00595                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
00596                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
00597 
00598                                               op_type
00599                                             );
00600           VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
00601         }
00602       }
00603 
00604       template <typename F, typename OP>
00605       void element_op(matrix_base<double, F> & A,
00606                       matrix_expression<const matrix_base<double, F>, const matrix_base<double, F>, op_element_binary<OP> > const & proxy)
00607       {
00608         typedef double        value_type;
00609 
00610         unsigned int op_type = 2; //0: product, 1: division, 2: power
00611         if (viennacl::is_division<OP>::value)
00612           op_type = 1;
00613         else if (viennacl::is_product<OP>::value)
00614           op_type = 0;
00615 
00616         if (viennacl::is_row_major<F>::value)
00617         {
00618           element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00619                                               static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00620                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00621                                               static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00622                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00623 
00624                                               detail::cuda_arg<value_type>(proxy.lhs()),
00625                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00626                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00627                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
00628 
00629                                               detail::cuda_arg<value_type>(proxy.rhs()),
00630                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
00631                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
00632                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
00633 
00634                                               op_type
00635                                             );
00636           VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_row_kernel");
00637         }
00638         else
00639         {
00640           element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00641                                               static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00642                                               static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00643                                               static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00644                                               static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00645 
00646                                               detail::cuda_arg<value_type>(proxy.lhs()),
00647                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00648                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00649                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs())),
00650 
00651                                               detail::cuda_arg<value_type>(proxy.rhs()),
00652                                               static_cast<unsigned int>(viennacl::traits::start1(proxy.rhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.rhs())),
00653                                               static_cast<unsigned int>(viennacl::traits::stride1(proxy.rhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.rhs())),
00654                                               static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.rhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.rhs())),
00655 
00656                                               op_type
00657                                             );
00658           VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_col_kernel");
00659         }
00660       }
00661 
00662       //
00664       //
00665 
00666       // Note: Due to CUDA vs C-preprocessor interference (concatenation seems to be broken in at least CUDA 4.2),
00667       //       we could not find a more 'automatic' way of generating the overloads below...
00668 
00669       // abs
00670       template <typename T, typename F>
00671       void element_op(matrix_base<T, F> & A,
00672                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_abs> > const & proxy)
00673       {
00674         typedef T        value_type;
00675 
00676         if (viennacl::is_row_major<F>::value)
00677         {
00678           matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00679             static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00680             static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00681             static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00682             static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00683 
00684             detail::cuda_arg<value_type>(proxy.lhs()),
00685             static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00686             static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00687             static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00688           );
00689           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_abs_kernel");
00690         }
00691         else
00692         {
00693           matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00694             static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00695             static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00696             static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00697             static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00698 
00699             detail::cuda_arg<value_type>(proxy.lhs()),
00700             static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00701             static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00702             static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00703           );
00704           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_abs_kernel");
00705         }
00706       }
00707 
00708 
00709       // acos
00710       template <typename T, typename F>
00711       void element_op(matrix_base<T, F> & A,
00712                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_acos> > const & proxy)
00713       {
00714         typedef T        value_type;
00715 
00716         if (viennacl::is_row_major<F>::value)
00717         {
00718           matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00719            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00720            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00721            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00722            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00723 
00724            detail::cuda_arg<value_type>(proxy.lhs()),
00725            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00726            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00727            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00728           );
00729           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_acos_kernel");
00730         }
00731         else
00732         {
00733           matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00734            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00735            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00736            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00737            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00738 
00739            detail::cuda_arg<value_type>(proxy.lhs()),
00740            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00741            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00742            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00743           );
00744           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_acos_kernel");
00745         }
00746       }
00747 
00748 
00749       // asin
00750       template <typename T, typename F>
00751       void element_op(matrix_base<T, F> & A,
00752                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_asin> > const & proxy)
00753       {
00754         typedef T        value_type;
00755 
00756         if (viennacl::is_row_major<F>::value)
00757         {
00758           matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00759            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00760            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00761            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00762            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00763 
00764            detail::cuda_arg<value_type>(proxy.lhs()),
00765            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00766            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00767            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00768           );
00769           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_asin_kernel");
00770         }
00771         else
00772         {
00773           matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00774            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00775            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00776            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00777            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00778 
00779            detail::cuda_arg<value_type>(proxy.lhs()),
00780            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00781            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00782            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00783           );
00784           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
00785         }
00786       }
00787 
00788 
00789       // atan
00790       template <typename T, typename F>
00791       void element_op(matrix_base<T, F> & A,
00792                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_atan> > const & proxy)
00793       {
00794         typedef T        value_type;
00795 
00796         if (viennacl::is_row_major<F>::value)
00797         {
00798           matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00799            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00800            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00801            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00802            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00803 
00804            detail::cuda_arg<value_type>(proxy.lhs()),
00805            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00806            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00807            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00808           );
00809           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_atan_kernel");
00810         }
00811         else
00812         {
00813           matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
00814            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
00815            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
00816            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
00817            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
00818 
00819            detail::cuda_arg<value_type>(proxy.lhs()),
00820            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
00821            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
00822            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
00823           );
00824           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_atan_kernel");
00825         }
00826       }
00827 
00828 
/** @brief Launches the CUDA kernel for the unary element-wise 'ceil' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_ceil_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_ceil_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_ceil> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_ceil_kernel");
  }
  else
  {
    matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_ceil_kernel");
  }
}
00867 
00868 
/** @brief Launches the CUDA kernel for the unary element-wise 'cos' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_cos_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_cos_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_cos> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cos_kernel");
  }
  else
  {
    matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cos_kernel");
  }
}
00907 
00908 
/** @brief Launches the CUDA kernel for the unary element-wise 'cosh' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_cosh_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_cosh_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_cosh> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_cosh_kernel");
  }
  else
  {
    matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_cosh_kernel");
  }
}
00947 
00948 
/** @brief Launches the CUDA kernel for the unary element-wise 'exp' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_exp_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_exp_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_exp> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_exp_kernel");
  }
  else
  {
    matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_exp_kernel");
  }
}
00987 
00988 
/** @brief Launches the CUDA kernel for the unary element-wise 'fabs' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_fabs_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_fabs_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_fabs> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_fabs_kernel");
  }
  else
  {
    matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_fabs_kernel");
  }
}
01027 
01028 
/** @brief Launches the CUDA kernel for the unary element-wise 'floor' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_floor_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_floor_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_floor> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                  dst_start1,  dst_start2,
                                                  dst_stride1, dst_stride2,
                                                  dst_rows,    dst_cols,
                                                  dst_int1,    dst_int2,
                                                  detail::cuda_arg<T>(src),
                                                  src_start1,  src_start2,
                                                  src_stride1, src_stride2,
                                                  src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_floor_kernel");
  }
  else
  {
    matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                  dst_start1,  dst_start2,
                                                  dst_stride1, dst_stride2,
                                                  dst_rows,    dst_cols,
                                                  dst_int1,    dst_int2,
                                                  detail::cuda_arg<T>(src),
                                                  src_start1,  src_start2,
                                                  src_stride1, src_stride2,
                                                  src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_floor_kernel");
  }
}
01067 
01068 
/** @brief Launches the CUDA kernel for the unary element-wise 'log' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_log_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_log_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_log> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log_kernel");
  }
  else
  {
    matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log_kernel");
  }
}
01107 
01108 
/** @brief Launches the CUDA kernel for the unary element-wise 'log10' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_log10_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_log10_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_log10> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                  dst_start1,  dst_start2,
                                                  dst_stride1, dst_stride2,
                                                  dst_rows,    dst_cols,
                                                  dst_int1,    dst_int2,
                                                  detail::cuda_arg<T>(src),
                                                  src_start1,  src_start2,
                                                  src_stride1, src_stride2,
                                                  src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_log10_kernel");
  }
  else
  {
    matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                  dst_start1,  dst_start2,
                                                  dst_stride1, dst_stride2,
                                                  dst_rows,    dst_cols,
                                                  dst_int1,    dst_int2,
                                                  detail::cuda_arg<T>(src),
                                                  src_start1,  src_start2,
                                                  src_stride1, src_stride2,
                                                  src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_log10_kernel");
  }
}
01147 
01148 
/** @brief Launches the CUDA kernel for the unary element-wise 'sin' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_sin_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_sin_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sin> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sin_kernel");
  }
  else
  {
    matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                dst_start1,  dst_start2,
                                                dst_stride1, dst_stride2,
                                                dst_rows,    dst_cols,
                                                dst_int1,    dst_int2,
                                                detail::cuda_arg<T>(src),
                                                src_start1,  src_start2,
                                                src_stride1, src_stride2,
                                                src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sin_kernel");
  }
}
01187 
01188 
/** @brief Launches the CUDA kernel for the unary element-wise 'sinh' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_sinh_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_sinh_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sinh> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sinh_kernel");
  }
  else
  {
    matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sinh_kernel");
  }
}
01227 
01228 
/** @brief Launches the CUDA kernel for the unary element-wise 'sqrt' operation on dense matrices.
*
* The kernel variant is picked from the layout tag F: matrix_row_element_sqrt_kernel when
* viennacl::is_row_major<F>::value is true, matrix_col_element_sqrt_kernel otherwise.
* Either variant is started with a fixed <<<128, 128>>> configuration and is followed by
* VIENNACL_CUDA_LAST_ERROR_CHECK.
*
* @param A      Matrix passed first to the kernel (buffer, start, stride, size, internal size); presumably the result matrix - the kernel body is not visible here.
* @param proxy  Expression wrapper; only proxy.lhs() is read (buffer, start, stride, internal size). Its size is not forwarded, only the size of A is.
*/
template <typename T, typename F>
void element_op(matrix_base<T, F> & A,
                matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_sqrt> > const & proxy)
{
  matrix_base<T, F> const & src = proxy.lhs();

  // Layout of A, narrowed to unsigned int before being handed to the kernel.
  unsigned int const dst_start1  = static_cast<unsigned int>(viennacl::traits::start1(A));
  unsigned int const dst_start2  = static_cast<unsigned int>(viennacl::traits::start2(A));
  unsigned int const dst_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(A));
  unsigned int const dst_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(A));
  unsigned int const dst_rows    = static_cast<unsigned int>(viennacl::traits::size1(A));
  unsigned int const dst_cols    = static_cast<unsigned int>(viennacl::traits::size2(A));
  unsigned int const dst_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(A));
  unsigned int const dst_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(A));

  // Layout of the operand; no size1/size2 here, the kernel only receives the size of A.
  unsigned int const src_start1  = static_cast<unsigned int>(viennacl::traits::start1(src));
  unsigned int const src_start2  = static_cast<unsigned int>(viennacl::traits::start2(src));
  unsigned int const src_stride1 = static_cast<unsigned int>(viennacl::traits::stride1(src));
  unsigned int const src_stride2 = static_cast<unsigned int>(viennacl::traits::stride2(src));
  unsigned int const src_int1    = static_cast<unsigned int>(viennacl::traits::internal_size1(src));
  unsigned int const src_int2    = static_cast<unsigned int>(viennacl::traits::internal_size2(src));

  bool const row_major = viennacl::is_row_major<F>::value;

  if (row_major)
  {
    matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_sqrt_kernel");
  }
  else
  {
    matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<T>(A),
                                                 dst_start1,  dst_start2,
                                                 dst_stride1, dst_stride2,
                                                 dst_rows,    dst_cols,
                                                 dst_int1,    dst_int2,
                                                 detail::cuda_arg<T>(src),
                                                 src_start1,  src_start2,
                                                 src_stride1, src_stride2,
                                                 src_int1,    src_int2);
    VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_sqrt_kernel");
  }
}
01267 
01268 
01269       // tan
01270       template <typename T, typename F>
01271       void element_op(matrix_base<T, F> & A,
01272                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_tan> > const & proxy)
01273       {
01274         typedef T        value_type;
01275 
01276         if (viennacl::is_row_major<F>::value)
01277         {
01278           matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
01279             static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
01280             static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
01281             static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
01282             static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01283 
01284             detail::cuda_arg<value_type>(proxy.lhs()),
01285             static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
01286             static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
01287             static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
01288           );
01289           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tan_kernel");
01290         }
01291         else
01292         {
01293           matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
01294             static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
01295             static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
01296             static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
01297             static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01298 
01299             detail::cuda_arg<value_type>(proxy.lhs()),
01300             static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
01301             static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
01302             static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
01303           );
01304           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tan_kernel");
01305         }
01306       }
01307 
01308 
01309       // tanh
01310       template <typename T, typename F>
01311       void element_op(matrix_base<T, F> & A,
01312                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<op_tanh> > const & proxy)
01313       {
01314         typedef T        value_type;
01315 
01316         if (viennacl::is_row_major<F>::value)
01317         {
01318           matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
01319            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
01320            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
01321            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
01322            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01323 
01324            detail::cuda_arg<value_type>(proxy.lhs()),
01325            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
01326            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
01327            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
01328           );
01329           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_row_element_tanh_kernel");
01330         }
01331         else
01332         {
01333           matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
01334            static_cast<unsigned int>(viennacl::traits::start1(A)),           static_cast<unsigned int>(viennacl::traits::start2(A)),
01335            static_cast<unsigned int>(viennacl::traits::stride1(A)),          static_cast<unsigned int>(viennacl::traits::stride2(A)),
01336            static_cast<unsigned int>(viennacl::traits::size1(A)),            static_cast<unsigned int>(viennacl::traits::size2(A)),
01337            static_cast<unsigned int>(viennacl::traits::internal_size1(A)),   static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01338 
01339            detail::cuda_arg<value_type>(proxy.lhs()),
01340            static_cast<unsigned int>(viennacl::traits::start1(proxy.lhs())),           static_cast<unsigned int>(viennacl::traits::start2(proxy.lhs())),
01341            static_cast<unsigned int>(viennacl::traits::stride1(proxy.lhs())),          static_cast<unsigned int>(viennacl::traits::stride2(proxy.lhs())),
01342            static_cast<unsigned int>(viennacl::traits::internal_size1(proxy.lhs())),   static_cast<unsigned int>(viennacl::traits::internal_size2(proxy.lhs()))
01343           );
01344           VIENNACL_CUDA_LAST_ERROR_CHECK("matrix_col_element_tanh_kernel");
01345         }
01346       }
01347 
01348 
01349       //
01351       //
01352 
01353       // A * x
01354 
01363       template <typename NumericT, typename F>
01364       void prod_impl(const matrix_base<NumericT, F> & mat,
01365                      const vector_base<NumericT> & vec,
01366                            vector_base<NumericT> & result)
01367       {
01368         typedef NumericT        value_type;
01369 
01370         assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));
01371 
01372         if (viennacl::is_row_major<F>::value)
01373         {
01374           vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
01375                                            static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
01376                                            static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
01377                                            static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
01378                                            static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
01379 
01380                                            detail::cuda_arg<value_type>(vec),
01381                                            static_cast<unsigned int>(viennacl::traits::start(vec)),
01382                                            static_cast<unsigned int>(viennacl::traits::stride(vec)),
01383                                            static_cast<unsigned int>(viennacl::traits::size(vec)),
01384 
01385                                            detail::cuda_arg<value_type>(result),
01386                                            static_cast<unsigned int>(viennacl::traits::start(result)),
01387                                            static_cast<unsigned int>(viennacl::traits::stride(result)),
01388                                            static_cast<unsigned int>(viennacl::traits::size(result))
01389                                           );
01390           VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_row_kernel");
01391         }
01392         else
01393         {
01394           vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
01395                                            static_cast<unsigned int>(viennacl::traits::start1(mat)),         static_cast<unsigned int>(viennacl::traits::start2(mat)),
01396                                            static_cast<unsigned int>(viennacl::traits::stride1(mat)),        static_cast<unsigned int>(viennacl::traits::stride2(mat)),
01397                                            static_cast<unsigned int>(viennacl::traits::size1(mat)),          static_cast<unsigned int>(viennacl::traits::size2(mat)),
01398                                            static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
01399 
01400                                            detail::cuda_arg<value_type>(vec),
01401                                            static_cast<unsigned int>(viennacl::traits::start(vec)),
01402                                            static_cast<unsigned int>(viennacl::traits::stride(vec)),
01403                                            static_cast<unsigned int>(viennacl::traits::size(vec)),
01404 
01405                                            detail::cuda_arg<value_type>(result),
01406                                            static_cast<unsigned int>(viennacl::traits::start(result)),
01407                                            static_cast<unsigned int>(viennacl::traits::stride(result)),
01408                                            static_cast<unsigned int>(viennacl::traits::size(result))
01409                                           );
01410           VIENNACL_CUDA_LAST_ERROR_CHECK("vec_mul_col_kernel");
01411         }
01412       }
01413 
01414 
01415       // trans(A) * x
01416 
01425       template <typename NumericT, typename F>
01426       void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
01427                      const vector_base<NumericT> & vec,
01428                            vector_base<NumericT> & result)
01429       {
01430         assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)"));
01431         assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)"));  //remember: mat is transposed!
01432 
01433         typedef NumericT    value_type;
01434 
01435 
01436         // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
01437         assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
01438 
01439         if (viennacl::is_row_major<F>::value)
01440         {
01441           trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
01442                                                  static_cast<unsigned int>(viennacl::traits::start1(mat_trans.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(mat_trans.lhs())),
01443                                                  static_cast<unsigned int>(viennacl::traits::stride1(mat_trans.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(mat_trans.lhs())),
01444                                                  static_cast<unsigned int>(viennacl::traits::size1(mat_trans.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(mat_trans.lhs())),
01445                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(mat_trans.lhs())),
01446 
01447                                                  detail::cuda_arg<value_type>(vec),
01448                                                  static_cast<unsigned int>(viennacl::traits::start(vec)),
01449                                                  static_cast<unsigned int>(viennacl::traits::stride(vec)),
01450                                                  static_cast<unsigned int>(viennacl::traits::size(vec)),
01451 
01452                                                  detail::cuda_arg<value_type>(result),
01453                                                  static_cast<unsigned int>(viennacl::traits::start(result)),
01454                                                  static_cast<unsigned int>(viennacl::traits::stride(result)),
01455                                                  static_cast<unsigned int>(viennacl::traits::size(result))
01456                                                 );
01457           VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_row_kernel");
01458         }
01459         else
01460         {
01461           trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat_trans.lhs()),
01462                                                  static_cast<unsigned int>(viennacl::traits::start1(mat_trans.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(mat_trans.lhs())),
01463                                                  static_cast<unsigned int>(viennacl::traits::stride1(mat_trans.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(mat_trans.lhs())),
01464                                                  static_cast<unsigned int>(viennacl::traits::size1(mat_trans.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(mat_trans.lhs())),
01465                                                  static_cast<unsigned int>(viennacl::traits::internal_size1(mat_trans.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(mat_trans.lhs())),
01466 
01467                                                  detail::cuda_arg<value_type>(vec),
01468                                                  static_cast<unsigned int>(viennacl::traits::start(vec)),
01469                                                  static_cast<unsigned int>(viennacl::traits::stride(vec)),
01470                                                  static_cast<unsigned int>(viennacl::traits::size(vec)),
01471 
01472                                                  detail::cuda_arg<value_type>(result),
01473                                                  static_cast<unsigned int>(viennacl::traits::start(result)),
01474                                                  static_cast<unsigned int>(viennacl::traits::stride(result)),
01475                                                  static_cast<unsigned int>(viennacl::traits::size(result))
01476                                                 );
01477           VIENNACL_CUDA_LAST_ERROR_CHECK("trans_vec_mul_col_kernel");
01478         }
01479       }
01480 
01481 
01482       //
01484       //
01485 
01486       namespace detail
01487       {
01488         // C = A * B and possibly transposed variants
01489         template <typename T1, typename T2, typename T3, typename ScalarType >
01490         void prod_slow_kernel(const T1 & A, bool transposed_A,
01491                               const T2 & B, bool transposed_B,
01492                               T3 & C,
01493                               ScalarType alpha,
01494                               ScalarType beta)
01495         {
01496           typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
01497 
01498           cpu_value_type converted_alpha = static_cast<cpu_value_type>(alpha);
01499           cpu_value_type converted_beta  = static_cast<cpu_value_type>(beta);
01500 
01501           dim3 threads(16, 16);
01502           dim3 grid( (viennacl::traits::size1(C) - 1) / 16 + 1,
01503                      (viennacl::traits::size2(C) - 1) / 16 + 1);
01504 
01505           bool row_major_A = viennacl::is_row_major<T1>::value;
01506           bool row_major_B = viennacl::is_row_major<T2>::value;
01507           bool row_major_C = viennacl::is_row_major<T3>::value;
01508 
01509 
01510           if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
01511           {
01512             matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
01513               (converted_alpha,
01514                 detail::cuda_arg<cpu_value_type>(A),
01515                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01516                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01517                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01518                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01519 
01520                 detail::cuda_arg<cpu_value_type>(B),
01521                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01522                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01523                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01524                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01525 
01526                 converted_beta,
01527                 detail::cuda_arg<cpu_value_type>(C),
01528                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01529                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01530                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01531                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01532           }
01533           else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
01534           {
01535             matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
01536               (converted_alpha,
01537                 detail::cuda_arg<cpu_value_type>(A),
01538                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01539                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01540                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01541                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01542 
01543                 detail::cuda_arg<cpu_value_type>(B),
01544                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01545                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01546                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01547                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01548 
01549                 converted_beta,
01550                 detail::cuda_arg<cpu_value_type>(C),
01551                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01552                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01553                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01554                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01555           }
01556           else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
01557           {
01558             matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
01559               (converted_alpha,
01560                 detail::cuda_arg<cpu_value_type>(A),
01561                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01562                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01563                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01564                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01565 
01566                 detail::cuda_arg<cpu_value_type>(B),
01567                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01568                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01569                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01570                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01571 
01572                 converted_beta,
01573                 detail::cuda_arg<cpu_value_type>(C),
01574                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01575                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01576                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01577                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01578           }
01579           else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
01580           {
01581             matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
01582               (converted_alpha,
01583                 detail::cuda_arg<cpu_value_type>(A),
01584                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01585                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01586                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01587                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01588 
01589                 detail::cuda_arg<cpu_value_type>(B),
01590                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01591                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01592                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01593                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01594 
01595                 converted_beta,
01596                 detail::cuda_arg<cpu_value_type>(C),
01597                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01598                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01599                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01600                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01601           }
01603 
01604           else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
01605           {
01606             matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
01607               (converted_alpha,
01608                 detail::cuda_arg<cpu_value_type>(A),
01609                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01610                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01611                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01612                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01613 
01614                 detail::cuda_arg<cpu_value_type>(B),
01615                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01616                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01617                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01618                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01619 
01620                 converted_beta,
01621                 detail::cuda_arg<cpu_value_type>(C),
01622                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01623                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01624                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01625                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01626           }
01627           else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
01628           {
01629             matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
01630               (converted_alpha,
01631                 detail::cuda_arg<cpu_value_type>(A),
01632                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01633                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01634                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01635                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01636 
01637                 detail::cuda_arg<cpu_value_type>(B),
01638                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01639                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01640                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01641                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01642 
01643                 converted_beta,
01644                 detail::cuda_arg<cpu_value_type>(C),
01645                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01646                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01647                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01648                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01649           }
01650           else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
01651           {
01652             matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
01653               (converted_alpha,
01654                 detail::cuda_arg<cpu_value_type>(A),
01655                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01656                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01657                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01658                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01659 
01660                 detail::cuda_arg<cpu_value_type>(B),
01661                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01662                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01663                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01664                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01665 
01666                 converted_beta,
01667                 detail::cuda_arg<cpu_value_type>(C),
01668                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01669                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01670                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01671                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01672           }
01673           else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
01674           {
01675             matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
01676               (converted_alpha,
01677                 detail::cuda_arg<cpu_value_type>(A),
01678                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01679                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01680                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01681                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01682 
01683                 detail::cuda_arg<cpu_value_type>(B),
01684                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01685                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01686                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01687                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01688 
01689                 converted_beta,
01690                 detail::cuda_arg<cpu_value_type>(C),
01691                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01692                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01693                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01694                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01695           }
01697 
01698           else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
01699           {
01700             matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
01701               (converted_alpha,
01702                 detail::cuda_arg<cpu_value_type>(A),
01703                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01704                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01705                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01706                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01707 
01708                 detail::cuda_arg<cpu_value_type>(B),
01709                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01710                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01711                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01712                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01713 
01714                 converted_beta,
01715                 detail::cuda_arg<cpu_value_type>(C),
01716                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01717                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01718                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01719                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01720           }
01721           else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
01722           {
01723             matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
01724               (converted_alpha,
01725                 detail::cuda_arg<cpu_value_type>(A),
01726                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01727                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01728                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01729                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01730 
01731                 detail::cuda_arg<cpu_value_type>(B),
01732                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01733                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01734                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01735                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01736 
01737                 converted_beta,
01738                 detail::cuda_arg<cpu_value_type>(C),
01739                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01740                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01741                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01742                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01743           }
01744           else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
01745           {
01746             matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
01747               (converted_alpha,
01748                 detail::cuda_arg<cpu_value_type>(A),
01749                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01750                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01751                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01752                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01753 
01754                 detail::cuda_arg<cpu_value_type>(B),
01755                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01756                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01757                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01758                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01759 
01760                 converted_beta,
01761                 detail::cuda_arg<cpu_value_type>(C),
01762                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01763                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01764                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01765                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01766           }
01767           else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
01768           {
01769             matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
01770               (converted_alpha,
01771                 detail::cuda_arg<cpu_value_type>(A),
01772                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01773                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01774                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01775                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01776 
01777                 detail::cuda_arg<cpu_value_type>(B),
01778                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01779                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01780                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01781                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01782 
01783                 converted_beta,
01784                 detail::cuda_arg<cpu_value_type>(C),
01785                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01786                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01787                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01788                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01789           }
01791 
01792           else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
01793           {
01794             matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
01795               (converted_alpha,
01796                 detail::cuda_arg<cpu_value_type>(A),
01797                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01798                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01799                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01800                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01801 
01802                 detail::cuda_arg<cpu_value_type>(B),
01803                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01804                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01805                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01806                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01807 
01808                 converted_beta,
01809                 detail::cuda_arg<cpu_value_type>(C),
01810                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01811                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01812                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01813                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01814           }
01815           else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
01816           {
01817             matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
01818               (converted_alpha,
01819                 detail::cuda_arg<cpu_value_type>(A),
01820                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01821                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01822                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01823                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01824 
01825                 detail::cuda_arg<cpu_value_type>(B),
01826                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01827                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01828                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01829                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01830 
01831                 converted_beta,
01832                 detail::cuda_arg<cpu_value_type>(C),
01833                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01834                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01835                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01836                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01837           }
01838           else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
01839           {
01840             matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
01841               (converted_alpha,
01842                 detail::cuda_arg<cpu_value_type>(A),
01843                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01844                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01845                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01846                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01847 
01848                 detail::cuda_arg<cpu_value_type>(B),
01849                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01850                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01851                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01852                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01853 
01854                 converted_beta,
01855                 detail::cuda_arg<cpu_value_type>(C),
01856                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01857                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01858                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01859                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01860           }
01861           else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
01862           {
01863             matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
01864               (converted_alpha,
01865                 detail::cuda_arg<cpu_value_type>(A),
01866                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01867                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01868                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01869                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01870 
01871                 detail::cuda_arg<cpu_value_type>(B),
01872                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01873                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01874                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01875                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01876 
01877                 converted_beta,
01878                 detail::cuda_arg<cpu_value_type>(C),
01879                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01880                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01881                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01882                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01883           }
01885 
01886           else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
01887           {
01888             matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
01889               (converted_alpha,
01890                 detail::cuda_arg<cpu_value_type>(A),
01891                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01892                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01893                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01894                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01895 
01896                 detail::cuda_arg<cpu_value_type>(B),
01897                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01898                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01899                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01900                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01901 
01902                 converted_beta,
01903                 detail::cuda_arg<cpu_value_type>(C),
01904                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01905                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01906                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01907                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01908           }
01909           else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
01910           {
01911             matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
01912               (converted_alpha,
01913                 detail::cuda_arg<cpu_value_type>(A),
01914                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01915                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01916                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01917                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01918 
01919                 detail::cuda_arg<cpu_value_type>(B),
01920                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01921                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01922                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01923                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01924 
01925                 converted_beta,
01926                 detail::cuda_arg<cpu_value_type>(C),
01927                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01928                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01929                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01930                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01931           }
01932           else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
01933           {
01934             matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
01935               (converted_alpha,
01936                 detail::cuda_arg<cpu_value_type>(A),
01937                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01938                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01939                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01940                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01941 
01942                 detail::cuda_arg<cpu_value_type>(B),
01943                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01944                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01945                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01946                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01947 
01948                 converted_beta,
01949                 detail::cuda_arg<cpu_value_type>(C),
01950                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01951                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01952                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01953                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01954           }
01955           else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
01956           {
01957             matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
01958               (converted_alpha,
01959                 detail::cuda_arg<cpu_value_type>(A),
01960                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01961                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01962                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01963                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01964 
01965                 detail::cuda_arg<cpu_value_type>(B),
01966                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01967                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01968                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01969                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01970 
01971                 converted_beta,
01972                 detail::cuda_arg<cpu_value_type>(C),
01973                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01974                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
01975                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
01976                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
01977           }
01979 
01980           else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
01981           {
01982             matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
01983               (converted_alpha,
01984                 detail::cuda_arg<cpu_value_type>(A),
01985                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
01986                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
01987                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
01988                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
01989 
01990                 detail::cuda_arg<cpu_value_type>(B),
01991                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
01992                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
01993                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
01994                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
01995 
01996                 converted_beta,
01997                 detail::cuda_arg<cpu_value_type>(C),
01998                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
01999                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02000                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02001                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02002           }
02003           else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
02004           {
02005             matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
02006               (converted_alpha,
02007                 detail::cuda_arg<cpu_value_type>(A),
02008                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02009                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02010                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02011                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02012 
02013                 detail::cuda_arg<cpu_value_type>(B),
02014                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02015                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02016                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02017                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02018 
02019                 converted_beta,
02020                 detail::cuda_arg<cpu_value_type>(C),
02021                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02022                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02023                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02024                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02025           }
02026           else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
02027           {
02028             matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
02029               (converted_alpha,
02030                 detail::cuda_arg<cpu_value_type>(A),
02031                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02032                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02033                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02034                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02035 
02036                 detail::cuda_arg<cpu_value_type>(B),
02037                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02038                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02039                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02040                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02041 
02042                 converted_beta,
02043                 detail::cuda_arg<cpu_value_type>(C),
02044                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02045                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02046                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02047                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02048           }
02049           else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
02050           {
02051             matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
02052               (converted_alpha,
02053                 detail::cuda_arg<cpu_value_type>(A),
02054                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02055                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02056                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02057                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02058 
02059                 detail::cuda_arg<cpu_value_type>(B),
02060                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02061                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02062                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02063                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02064 
02065                 converted_beta,
02066                 detail::cuda_arg<cpu_value_type>(C),
02067                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02068                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02069                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02070                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02071           }
02073 
02074           else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
02075           {
02076             matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
02077               (converted_alpha,
02078                 detail::cuda_arg<cpu_value_type>(A),
02079                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02080                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02081                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02082                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02083 
02084                 detail::cuda_arg<cpu_value_type>(B),
02085                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02086                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02087                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02088                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02089 
02090                 converted_beta,
02091                 detail::cuda_arg<cpu_value_type>(C),
02092                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02093                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02094                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02095                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02096           }
02097           else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
02098           {
02099             matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
02100               (converted_alpha,
02101                 detail::cuda_arg<cpu_value_type>(A),
02102                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02103                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02104                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02105                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02106 
02107                 detail::cuda_arg<cpu_value_type>(B),
02108                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02109                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02110                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02111                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02112 
02113                 converted_beta,
02114                 detail::cuda_arg<cpu_value_type>(C),
02115                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02116                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02117                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02118                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02119           }
02120           else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
02121           {
02122             matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
02123               (converted_alpha,
02124                 detail::cuda_arg<cpu_value_type>(A),
02125                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02126                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02127                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02128                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02129 
02130                 detail::cuda_arg<cpu_value_type>(B),
02131                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02132                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02133                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02134                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02135 
02136                 converted_beta,
02137                 detail::cuda_arg<cpu_value_type>(C),
02138                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02139                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02140                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02141                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02142           }
02143           else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
02144           {
02145             matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
02146               (converted_alpha,
02147                 detail::cuda_arg<cpu_value_type>(A),
02148                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02149                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02150                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02151                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02152 
02153                 detail::cuda_arg<cpu_value_type>(B),
02154                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02155                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02156                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02157                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02158 
02159                 converted_beta,
02160                 detail::cuda_arg<cpu_value_type>(C),
02161                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02162                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02163                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02164                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02165           }
02166 
02167 
02169 
02170           else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
02171           {
02172             matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
02173               (converted_alpha,
02174                 detail::cuda_arg<cpu_value_type>(A),
02175                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02176                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02177                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02178                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02179 
02180                 detail::cuda_arg<cpu_value_type>(B),
02181                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02182                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02183                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02184                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02185 
02186                 converted_beta,
02187                 detail::cuda_arg<cpu_value_type>(C),
02188                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02189                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02190                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02191                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02192           }
02193           else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
02194           {
02195             matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
02196               (converted_alpha,
02197                 detail::cuda_arg<cpu_value_type>(A),
02198                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02199                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02200                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02201                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02202 
02203                 detail::cuda_arg<cpu_value_type>(B),
02204                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02205                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02206                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02207                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02208 
02209                 converted_beta,
02210                 detail::cuda_arg<cpu_value_type>(C),
02211                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02212                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02213                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02214                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02215           }
02216           else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
02217           {
02218             matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
02219               (converted_alpha,
02220                 detail::cuda_arg<cpu_value_type>(A),
02221                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02222                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02223                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02224                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02225 
02226                 detail::cuda_arg<cpu_value_type>(B),
02227                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02228                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02229                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02230                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02231 
02232                 converted_beta,
02233                 detail::cuda_arg<cpu_value_type>(C),
02234                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02235                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02236                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02237                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02238           }
02239           else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
02240           {
02241             matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
02242               (converted_alpha,
02243                 detail::cuda_arg<cpu_value_type>(A),
02244                 static_cast<unsigned int>(viennacl::traits::start1(A)),         static_cast<unsigned int>(viennacl::traits::start2(A)),
02245                 static_cast<unsigned int>(viennacl::traits::stride1(A)),        static_cast<unsigned int>(viennacl::traits::stride2(A)),
02246                 static_cast<unsigned int>(viennacl::traits::size1(A)),          static_cast<unsigned int>(viennacl::traits::size2(A)),
02247                 static_cast<unsigned int>(viennacl::traits::internal_size1(A)), static_cast<unsigned int>(viennacl::traits::internal_size2(A)),
02248 
02249                 detail::cuda_arg<cpu_value_type>(B),
02250                 static_cast<unsigned int>(viennacl::traits::start1(B)),         static_cast<unsigned int>(viennacl::traits::start2(B)),
02251                 static_cast<unsigned int>(viennacl::traits::stride1(B)),        static_cast<unsigned int>(viennacl::traits::stride2(B)),
02252                 static_cast<unsigned int>(viennacl::traits::size1(B)),          static_cast<unsigned int>(viennacl::traits::size2(B)),
02253                 static_cast<unsigned int>(viennacl::traits::internal_size1(B)), static_cast<unsigned int>(viennacl::traits::internal_size2(B)),
02254 
02255                 converted_beta,
02256                 detail::cuda_arg<cpu_value_type>(C),
02257                 static_cast<unsigned int>(viennacl::traits::start1(C)),         static_cast<unsigned int>(viennacl::traits::start2(C)),
02258                 static_cast<unsigned int>(viennacl::traits::stride1(C)),        static_cast<unsigned int>(viennacl::traits::stride2(C)),
02259                 static_cast<unsigned int>(viennacl::traits::size1(C)),          static_cast<unsigned int>(viennacl::traits::size2(C)),
02260                 static_cast<unsigned int>(viennacl::traits::internal_size1(C)), static_cast<unsigned int>(viennacl::traits::internal_size2(C)) );
02261           }
02262 
02263         }
02264 
02265         // C = A * B, using fast kernel
              //
              // Placeholder: the only launch code in this function is the commented-out
              // OpenCL enqueue below, so no product is computed. Every call ends at the
              // throw statement at the bottom.
              //
              // A, B, C      operands; referenced only inside the commented-out call.
              // alpha, beta  scalars, converted to the CPU value type associated with T1::value_type.
              // kernel_name  not used anywhere in the body.
              //
              // Throws: the string literal "not implemented yet" (not a std::exception
              // subclass), unconditionally.
02266         template <typename T1, typename T2, typename T3, typename ScalarType >
02267         void prod_fast_kernel(const T1 & A,
02268                               const T2 & B,
02269                               T3 & C,
02270                               ScalarType alpha,
02271                               ScalarType beta,
02272                               std::string kernel_name)
02273         {
02274           typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
02275 
                // Only the commented-out call below consumes these two converted scalars.
02276           cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
02277           cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);
02278 
02279           /*viennacl::ocl::enqueue(k(cl_alpha,
02280                                   viennacl::traits::opencl_handle(A),
02281                                   cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
02282                                   cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
02283                                   cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
02284                                   cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
02285 
02286                                   viennacl::traits::opencl_handle(B),
02287                                   cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
02288                                   cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
02289                                   cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
02290                                   cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
02291 
02292                                   cl_beta,
02293                                   viennacl::traits::opencl_handle(C),
02294                                   cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
02295                                   cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
02296                                   cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
02297                                   cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
02298                                   )
02299                                 );*/
02300 
                // Always reached, because the launch above is disabled.
02301           throw "not implemented yet";
02302         }
02303 
              // Common entry point used by the prod_impl() overloads: forwards the operands,
              // their transposition flags and the two scalars to prod_slow_kernel().
              //
              // A, B          input operands.
              // transposed_A  forwarded unchanged to prod_slow_kernel(); the prod_impl() overloads pass true
              // transposed_B  when the corresponding operand was wrapped in a trans() expression.
              // C             output operand.
              // alpha, beta   scalars forwarded unchanged; presumably the GEMM-style factors in
              //               C = alpha * op(A) * op(B) + beta * C -- TODO confirm against the kernels.
              //
              // The size test below currently has no effect on the result: both live branches
              // make the identical prod_slow_kernel() call. The branch structure is kept as a
              // hook for a future fast path (see the commented-out block and the TODO).
02304         template <typename T1, typename T2, typename T3, typename ScalarType >
02305         void prod(const T1 & A, bool transposed_A,
02306                   const T2 & B, bool transposed_B,
02307                   T3 & C,
02308                   ScalarType alpha,
02309                   ScalarType beta)
02310         {
                // The test reads size1(A), size2(A) and size1(B) as stored; the transposition flags are not consulted.
02311           if (   (viennacl::traits::size1(A) < 64)
02312               || (viennacl::traits::size2(A) < 64)
02313               || (viennacl::traits::size1(B) < 64) )   //there is most likely not enough to compute, rendering kernel launch overhead considerable
02314           {
02315             prod_slow_kernel(A, transposed_A,
02316                              B, transposed_B,
02317                              C, alpha, beta);
02318           }
02319           /*else if (   (viennacl::traits::size1(A) % 64 == 0)
02320                   && (viennacl::traits::size2(A) % 64 == 0)
02321                   && (viennacl::traits::size1(B) % 64 == 0) )   // allows the use of the fast kernel only
02322           {
02323             prod_fast_kernel(A, B, C, alpha, beta);
02324             //prod_slow_kernel(A, B, C, slow_kernel_name);
02325           }*/
                // NOTE(review): the disabled branch above calls prod_fast_kernel() with five arguments,
                // while its definition in this file takes a sixth (kernel_name); it cannot be re-enabled as written.
02326           else //TODO: use four kernels
02327           {
02328             prod_slow_kernel(A, transposed_A,
02329                              B, transposed_B,
02330                              C, alpha, beta);
02331           }
02332 
02333         }
02334       } // namespace detail
02335 
02336 
            // Matrix-matrix product C = prod(A, B) for two plain (non-transposed) operands.
            //
            // A, B         input matrices.
            // C            result matrix.
            // alpha, beta  scalars forwarded unchanged to detail::prod(); presumably GEMM-style
            //              scaling factors -- TODO confirm against the kernels.
            //
            // Dimension compatibility is verified with assert() only, i.e. not at all when
            // NDEBUG is defined. Delegates to detail::prod() with both transposition flags false.
02342       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
02343       void prod_impl(const matrix_base<NumericT, F1> & A,
02344                      const matrix_base<NumericT, F2> & B,
02345                            matrix_base<NumericT, F3> & C,
02346                      ScalarType alpha,
02347                      ScalarType beta)
02348       {
02349         assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)"));
02350         assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(A, B): size2(A) != size1(B)"));
02351         assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)"));
02352 
02353         // In-place matrix-matrix products like B = prod(A, B) are currently illegal: introduce a temporary (C = prod(A, B); B = C;) instead.
              // NOTE: the aliasing check below is commented out, so this overload does not detect such calls.
02354         /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
02355               && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
02356               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
02357 
02358 
02359         detail::prod(A, false,
02360                      B, false,
02361                      C, alpha, beta);
02362       }
02363 
02364 
02365 
            // Matrix-matrix product C = prod(trans(A), B).
            //
            // A            transposition expression; the underlying matrix is obtained via A.lhs().
            // B            second input matrix (not transposed).
            // C            result matrix.
            // alpha, beta  scalars forwarded unchanged to detail::prod(); presumably GEMM-style
            //              scaling factors -- TODO confirm against the kernels.
            //
            // All checks use assert() and therefore vanish when NDEBUG is defined.
            // Delegates to detail::prod() with transposed_A = true and transposed_B = false.
02371       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
02372       void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
02373                                                         const matrix_base<NumericT, F1>,
02374                                                         op_trans> & A,
02375                      const matrix_base<NumericT, F2> & B,
02376                            matrix_base<NumericT, F3> & C,
02377                      ScalarType alpha,
02378                      ScalarType beta)
02379       {
02380         //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
02381         //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
              // The dimensions are read from the untransposed matrix A.lhs(), hence size2 is compared with size1(C).
02382         assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)"));
02383         assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)"));
02384         assert( (viennacl::traits::size2(B)       == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)"));
02385 
02386         // In-place matrix-matrix products like B = prod(A, B) are currently illegal: introduce a temporary (C = prod(A, B); B = C;) instead.
              // NOTE(review): this handle comparison is active here, whereas the plain prod_impl(A, B) overload
              // has the equivalent check commented out -- confirm which behavior is intended.
02387         assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
02388               && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
02389               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));
02390 
02391         detail::prod(A.lhs(), true,
02392                      B, false,
02393                      C, alpha, beta);
02394       }
02395 
02396 
02397 
02398 
            // Matrix-matrix product C = prod(A, trans(B)).
            //
            // A            first input matrix (not transposed).
            // B            transposition expression; the underlying matrix is obtained via B.lhs().
            // C            result matrix.
            // alpha, beta  scalars forwarded unchanged to detail::prod(); presumably GEMM-style
            //              scaling factors -- TODO confirm against the kernels.
            //
            // Dimension compatibility is verified with assert() only, i.e. not at all when
            // NDEBUG is defined. Delegates to detail::prod() with transposed_A = false and
            // transposed_B = true.
02404       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
02405       void prod_impl(const matrix_base<NumericT, F1> & A,
02406                      const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
02407                            matrix_base<NumericT, F3> & C,
02408                      ScalarType alpha,
02409                      ScalarType beta)
02410       {
              // The dimensions of B are read from the untransposed matrix B.lhs().
02411         assert( (viennacl::traits::size1(A)       == viennacl::traits::size1(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)"));
02412         assert( (viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)"));
02413         assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)"));
02414 
02415         // In-place matrix-matrix products like B = prod(A, B) are currently illegal: introduce a temporary (C = prod(A, B); B = C;) instead.
              // NOTE: no aliasing check is performed in this overload; the call below is made unconditionally.
02416         detail::prod(A, false,
02417                      B.lhs(), true,
02418                      C, alpha, beta);
02419       }
02420 
02421 
02422 
02428       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
02429       void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
02430                      const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
02431                      matrix_base<NumericT, F3> & C,
02432                      ScalarType alpha,
02433                      ScalarType beta)
02434       {
02435         assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
02436         assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
02437         assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
02438 
02439         // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
02440         assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
02441               && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
02442               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));
02443 
02444         detail::prod(A.lhs(), true,
02445                      B.lhs(), true,
02446                      C, alpha, beta);
02447       }
02448 
02449 
02450 
02451 
02452       //
02454       //
02455 
02456 
02469       template <typename NumericT, typename F, typename S1>
02470       void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
02471                                 S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
02472                                 const vector_base<NumericT> & vec1,
02473                                 const vector_base<NumericT> & vec2)
02474       {
02475         assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
02476         assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
02477 
02478         typedef NumericT        value_type;
02479 
02480         unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
02481 
02482         value_type temporary_alpha = 0;
02483         if (viennacl::is_cpu_scalar<S1>::value)
02484           temporary_alpha = alpha;
02485 
02486         if (viennacl::is_row_major<F>::value)
02487         {
02488           scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
02489                                                        static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
02490                                                        static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
02491                                                        static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
02492                                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
02493 
02494                                                        detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
02495                                                        options_alpha,
02496 
02497                                                        detail::cuda_arg<value_type>(vec1),
02498                                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
02499                                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02500                                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
02501 
02502                                                        detail::cuda_arg<value_type>(vec2),
02503                                                        static_cast<unsigned int>(viennacl::traits::start(vec2)),
02504                                                        static_cast<unsigned int>(viennacl::traits::stride(vec2)),
02505                                                        static_cast<unsigned int>(viennacl::traits::size(vec2))
02506                                                      );
02507           VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_row_kernel");
02508         }
02509         else
02510         {
02511           scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
02512                                                        static_cast<unsigned int>(viennacl::traits::start1(mat1)),           static_cast<unsigned int>(viennacl::traits::start2(mat1)),
02513                                                        static_cast<unsigned int>(viennacl::traits::stride1(mat1)),          static_cast<unsigned int>(viennacl::traits::stride2(mat1)),
02514                                                        static_cast<unsigned int>(viennacl::traits::size1(mat1)),            static_cast<unsigned int>(viennacl::traits::size2(mat1)),
02515                                                        static_cast<unsigned int>(viennacl::traits::internal_size1(mat1)),   static_cast<unsigned int>(viennacl::traits::internal_size2(mat1)),
02516 
02517                                                        detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
02518                                                        options_alpha,
02519 
02520                                                        detail::cuda_arg<value_type>(vec1),
02521                                                        static_cast<unsigned int>(viennacl::traits::start(vec1)),
02522                                                        static_cast<unsigned int>(viennacl::traits::stride(vec1)),
02523                                                        static_cast<unsigned int>(viennacl::traits::size(vec1)),
02524 
02525                                                        detail::cuda_arg<value_type>(vec2),
02526                                                        static_cast<unsigned int>(viennacl::traits::start(vec2)),
02527                                                        static_cast<unsigned int>(viennacl::traits::stride(vec2)),
02528                                                        static_cast<unsigned int>(viennacl::traits::size(vec2))
02529                                                       );
02530           VIENNACL_CUDA_LAST_ERROR_CHECK("scaled_rank1_update_col_kernel");
02531         }
02532       }
02533 
02534     } // namespace cuda
02535   } //namespace linalg
02536 } //namespace viennacl
02537 
02538 
02539 #endif