ViennaCL - The Vienna Computing Library  1.5.2
viennacl/linalg/opencl/matrix_operations.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
00002 #define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include "viennacl/forwards.h"
00026 #include "viennacl/ocl/device.hpp"
00027 #include "viennacl/ocl/handle.hpp"
00028 #include "viennacl/ocl/kernel.hpp"
00029 #include "viennacl/scalar.hpp"
00030 #include "viennacl/vector.hpp"
00031 #include "viennacl/vector_proxy.hpp"
00032 #include "viennacl/tools/tools.hpp"
00033 #include "viennacl/meta/enable_if.hpp"
00034 #include "viennacl/meta/predicate.hpp"
00035 #include "viennacl/meta/result_of.hpp"
00036 
00037 #include "viennacl/scheduler/forwards.h"
00038 
00039 #include "viennacl/generator/generate.hpp"
00040 
00041 #include "viennacl/traits/size.hpp"
00042 #include "viennacl/traits/start.hpp"
00043 #include "viennacl/traits/handle.hpp"
00044 #include "viennacl/traits/stride.hpp"
00045 
00046 #include "viennacl/linalg/opencl/common.hpp"
00047 
00048 #include "viennacl/linalg/opencl/kernels/matrix.hpp"
00049 #include "viennacl/linalg/opencl/kernels/matrix_element.hpp"
00050 
00051 #include "viennacl/linalg/opencl/kernels/matrix_prod.hpp"
00052 
00053 
00054 namespace viennacl
00055 {
00056   namespace linalg
00057   {
00058     namespace opencl
00059     {
00060       //
00061       // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
00062       //
00063 
00064       template <typename NumericT, typename F,
00065                 typename ScalarType1>
00066       void am(matrix_base<NumericT, F> & mat1,
00067               matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
00068       {
00069         typedef NumericT        value_type;
00070 
00071         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
00072         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00073         KernelClass::init(ctx);
00074 
00075         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00076 
00077         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(),
00078                                                    (viennacl::is_cpu_scalar<ScalarType1>::value ? "am_cpu" : "am_gpu"));
00079         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
00080                                 cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
00081                                 cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
00082                                 cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
00083                                 cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
00084 
00085                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
00086                                 options_alpha,
00087                                 viennacl::traits::opencl_handle(mat2),
00088                                 cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
00089                                 cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
00090                                 cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2))
00091                                 )
00092                               );
00093       }
00094 
00095 
00096       template <typename NumericT, typename F,
00097                 typename ScalarType1, typename ScalarType2>
00098       void ambm(matrix_base<NumericT, F> & mat1,
00099                 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00100                 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00101       {
00102         typedef NumericT        value_type;
00103 
00104         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
00105         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00106         KernelClass::init(ctx);
00107 
00108         std::string kernel_name;
00109         if      ( viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
00110           kernel_name = "ambm_cpu_cpu";
00111         else if ( viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
00112           kernel_name = "ambm_cpu_gpu";
00113         else if (!viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
00114           kernel_name = "ambm_gpu_cpu";
00115         else
00116           kernel_name = "ambm_gpu_gpu";
00117 
00118         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00119         cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00120 
00121         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
00122         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
00123                                 cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
00124                                 cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
00125                                 cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
00126                                 cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
00127 
00128                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
00129                                 options_alpha,
00130                                 viennacl::traits::opencl_handle(mat2),
00131                                 cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
00132                                 cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
00133                                 cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
00134 
00135                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
00136                                 options_beta,
00137                                 viennacl::traits::opencl_handle(mat3),
00138                                 cl_uint(viennacl::traits::start1(mat3)),           cl_uint(viennacl::traits::start2(mat3)),
00139                                 cl_uint(viennacl::traits::stride1(mat3)),          cl_uint(viennacl::traits::stride2(mat3)),
00140                                 cl_uint(viennacl::traits::internal_size1(mat3)),   cl_uint(viennacl::traits::internal_size2(mat3))
00141                                 )
00142                               );
00143       }
00144 
00145 
00146       template <typename NumericT, typename F,
00147                 typename ScalarType1, typename ScalarType2>
00148       void ambm_m(matrix_base<NumericT, F> & mat1,
00149                   matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00150                   matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00151       {
00152         typedef NumericT        value_type;
00153 
00154         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
00155         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00156         KernelClass::init(ctx);
00157 
00158         std::string kernel_name;
00159         if      ( viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
00160           kernel_name = "ambm_m_cpu_cpu";
00161         else if ( viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
00162           kernel_name = "ambm_m_cpu_gpu";
00163         else if (!viennacl::is_cpu_scalar<ScalarType1>::value &&  viennacl::is_cpu_scalar<ScalarType2>::value)
00164           kernel_name = "ambm_m_gpu_cpu";
00165         else
00166           kernel_name = "ambm_m_gpu_gpu";
00167 
00168         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00169         cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00170 
00171         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
00172         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
00173                                 cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
00174                                 cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
00175                                 cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
00176                                 cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
00177 
00178                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)),
00179                                 options_alpha,
00180                                 viennacl::traits::opencl_handle(mat2),
00181                                 cl_uint(viennacl::traits::start1(mat2)),           cl_uint(viennacl::traits::start2(mat2)),
00182                                 cl_uint(viennacl::traits::stride1(mat2)),          cl_uint(viennacl::traits::stride2(mat2)),
00183                                 cl_uint(viennacl::traits::internal_size1(mat2)),   cl_uint(viennacl::traits::internal_size2(mat2)),
00184 
00185                                 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)),
00186                                 options_beta,
00187                                 viennacl::traits::opencl_handle(mat3),
00188                                 cl_uint(viennacl::traits::start1(mat3)),           cl_uint(viennacl::traits::start2(mat3)),
00189                                 cl_uint(viennacl::traits::stride1(mat3)),          cl_uint(viennacl::traits::stride2(mat3)),
00190                                 cl_uint(viennacl::traits::internal_size1(mat3)),   cl_uint(viennacl::traits::internal_size2(mat3))
00191                                 )
00192                               );
00193       }
00194 
00195 
00196 
00197       template <typename NumericT, typename F>
00198       void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false)
00199       {
00200         typedef NumericT        value_type;
00201 
00202         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00203         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00204         KernelClass::init(ctx);
00205 
00206         value_type alpha = static_cast<value_type>(s);
00207 
00208         cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : cl_uint(viennacl::traits::size1(mat));
00209         cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : cl_uint(viennacl::traits::size2(mat));
00210 
00211         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "assign_cpu");
00212         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
00213                                  cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
00214                                  cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
00215                                  s1,                                               s2,
00216                                  cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
00217                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha))
00218                                 )
00219                               );
00220       }
00221 
00222       template <typename NumericT, typename F>
00223       void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s)
00224       {
00225         typedef NumericT        value_type;
00226 
00227         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00228         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00229         KernelClass::init(ctx);
00230 
00231         value_type alpha = static_cast<value_type>(s);
00232 
00233         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "diagonal_assign_cpu");
00234         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
00235                                  cl_uint(viennacl::traits::start1(mat)),           cl_uint(viennacl::traits::start2(mat)),
00236                                  cl_uint(viennacl::traits::stride1(mat)),          cl_uint(viennacl::traits::stride2(mat)),
00237                                  cl_uint(viennacl::traits::size1(mat)),            cl_uint(viennacl::traits::size2(mat)),
00238                                  cl_uint(viennacl::traits::internal_size1(mat)),   cl_uint(viennacl::traits::internal_size2(mat)),
00239                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha))
00240                                 )
00241                               );
00242       }
00243 
00244       template <typename NumericT, typename F>
00245       void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat)
00246       {
00247         // Step 1: set everything to zero
00248         matrix_assign(mat, NumericT(0));
00249 
00250         // Step 2: set the diagonal:
00251 
00252         // reuse vector ambm kernel for assigning the elements:
00253         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00254         typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
00255         KernelClass::init(ctx);
00256 
00257         cl_uint options_alpha = 0;
00258         viennacl::ocl::packed_cl_uint size_mat;
00259         if (viennacl::is_row_major<F>::value)
00260         {
00261           vcl_size_t first_row_index = 0;
00262           vcl_size_t first_col_index = 0;
00263           if (k < 0)
00264             first_row_index = vcl_size_t(-k);
00265           else
00266             first_col_index = vcl_size_t(k);
00267           size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
00268                                     + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
00269           size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
00270           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00271           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00272         }
00273         else
00274         {
00275           vcl_size_t first_row_index = 0;
00276           vcl_size_t first_col_index = 0;
00277           if (k < 0)
00278             first_row_index = vcl_size_t(-k);
00279           else
00280             first_col_index = vcl_size_t(k);
00281           size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
00282                                     + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
00283           size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
00284           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00285           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00286         }
00287 
00288         viennacl::ocl::packed_cl_uint size_vec;
00289         size_vec.start  = cl_uint(viennacl::traits::start(vec));
00290         size_vec.stride = cl_uint(viennacl::traits::stride(vec));
00291         size_vec.size   = cl_uint(viennacl::traits::size(vec));
00292         size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00293 
00294         viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
00295         viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat),
00296                                     size_mat,
00297 
00298                                     viennacl::traits::opencl_handle(NumericT(1)),
00299                                     options_alpha,
00300                                     viennacl::traits::opencl_handle(vec),
00301                                     size_vec)
00302                               );
00303       }
00304 
00305       template <typename NumericT, typename F>
00306       void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec)
00307       {
00308         // reuse vector ambm kernel for assigning the elements:
00309         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00310         typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
00311         KernelClass::init(ctx);
00312 
00313         cl_uint options_alpha = 0;
00314         viennacl::ocl::packed_cl_uint size_mat;
00315         if (viennacl::is_row_major<F>::value)
00316         {
00317           vcl_size_t first_row_index = 0;
00318           vcl_size_t first_col_index = 0;
00319           if (k < 0)
00320             first_row_index = vcl_size_t(-k);
00321           else
00322             first_col_index = vcl_size_t(k);
00323           size_mat.start  = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat)
00324                                     + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat));
00325           size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat));
00326           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00327           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00328         }
00329         else
00330         {
00331           vcl_size_t first_row_index = 0;
00332           vcl_size_t first_col_index = 0;
00333           if (k < 0)
00334             first_row_index = vcl_size_t(-k);
00335           else
00336             first_col_index = vcl_size_t(k);
00337           size_mat.start  = cl_uint(   viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)
00338                                     + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
00339           size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat));
00340           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00341           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00342         }
00343 
00344         viennacl::ocl::packed_cl_uint size_vec;
00345         size_vec.start  = cl_uint(viennacl::traits::start(vec));
00346         size_vec.stride = cl_uint(viennacl::traits::stride(vec));
00347         size_vec.size   = cl_uint(viennacl::traits::size(vec));
00348         size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00349 
00350 
00351         viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
00352         viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
00353                                     size_vec,
00354 
00355                                     viennacl::traits::opencl_handle(NumericT(1)),
00356                                     options_alpha,
00357                                     viennacl::traits::opencl_handle(mat),
00358                                     size_mat)
00359                               );
00360       }
00361 
00362       template <typename NumericT, typename F>
00363       void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec)
00364       {
00365         // reuse vector ambm kernel for assigning the elements:
00366         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00367         typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
00368         KernelClass::init(ctx);
00369 
00370         cl_uint options_alpha = 0;
00371         viennacl::ocl::packed_cl_uint size_mat;
00372         if (viennacl::is_row_major<F>::value)
00373         {
00374           size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat));
00375           size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
00376           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00377           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00378         }
00379         else
00380         {
00381           size_mat.start  = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat));
00382           size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat));
00383           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00384           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00385         }
00386 
00387         viennacl::ocl::packed_cl_uint size_vec;
00388         size_vec.start  = cl_uint(viennacl::traits::start(vec));
00389         size_vec.stride = cl_uint(viennacl::traits::stride(vec));
00390         size_vec.size   = cl_uint(viennacl::traits::size(vec));
00391         size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00392 
00393 
00394         viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
00395         viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
00396                                     size_vec,
00397 
00398                                     viennacl::traits::opencl_handle(NumericT(1)),
00399                                     options_alpha,
00400                                     viennacl::traits::opencl_handle(mat),
00401                                     size_mat)
00402                               );
00403       }
00404 
00405       template <typename NumericT, typename F>
00406       void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec)
00407       {
00408         // reuse vector ambm kernel for assigning the elements:
00409         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00410         typedef viennacl::linalg::opencl::kernels::vector<NumericT>  KernelClass;
00411         KernelClass::init(ctx);
00412 
00413         cl_uint options_alpha = 0;
00414         viennacl::ocl::packed_cl_uint size_mat;
00415         if (viennacl::is_row_major<F>::value)
00416         {
00417           size_mat.start  = cl_uint(viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat));
00418           size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat));
00419           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00420           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00421         }
00422         else
00423         {
00424           size_mat.start  = cl_uint(viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat));
00425           size_mat.stride = cl_uint(viennacl::traits::stride2(mat));
00426           size_mat.size   = cl_uint(viennacl::traits::size(vec));
00427           size_mat.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00428         }
00429 
00430         viennacl::ocl::packed_cl_uint size_vec;
00431         size_vec.start  = cl_uint(viennacl::traits::start(vec));
00432         size_vec.stride = cl_uint(viennacl::traits::stride(vec));
00433         size_vec.size   = cl_uint(viennacl::traits::size(vec));
00434         size_vec.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00435 
00436 
00437         viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu");
00438         viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec),
00439                                     size_vec,
00440 
00441                                     viennacl::traits::opencl_handle(NumericT(1)),
00442                                     options_alpha,
00443                                     viennacl::traits::opencl_handle(mat),
00444                                     size_mat)
00445                               );
00446       }
00447 
00448 
00449       //
00451       //
00452 
00453       // Binary operations A = B .* C and A = B ./ C
00459       template <typename T, typename F, typename OP>
00460       void element_op(matrix_base<T, F> & A,
00461                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy)
00462       {
00463         assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00464         assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00465 
00466         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00467         typedef viennacl::linalg::opencl::kernels::matrix<T, F>  KernelClass;
00468         KernelClass::init(ctx);
00469 
00470         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "element_op");
00471 
00472         cl_uint op_type = 2; //0: product, 1: division, 2: power
00473         if (viennacl::is_division<OP>::value)
00474           op_type = 1;
00475         else if (viennacl::is_product<OP>::value)
00476           op_type = 0;
00477 
00478         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
00479                                 cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
00480                                 cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
00481                                 cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
00482                                 cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
00483 
00484                                 viennacl::traits::opencl_handle(proxy.lhs()),
00485                                 cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
00486                                 cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
00487                                 cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())),
00488 
00489                                 viennacl::traits::opencl_handle(proxy.rhs()),
00490                                 cl_uint(viennacl::traits::start1(proxy.rhs())),           cl_uint(viennacl::traits::start2(proxy.rhs())),
00491                                 cl_uint(viennacl::traits::stride1(proxy.rhs())),          cl_uint(viennacl::traits::stride2(proxy.rhs())),
00492                                 cl_uint(viennacl::traits::internal_size1(proxy.rhs())),   cl_uint(viennacl::traits::internal_size2(proxy.rhs())),
00493 
00494                                 op_type)
00495                               );
00496       }
00497 
00498 
00499       // Unary operations
00500 
/** @brief Launches the element-wise unary kernel "<op>_assign" with A as destination and proxy.lhs() as source operand.
*
* The kernel name is built from detail::op_to_string(OP()) + "_assign" and looked up in the matrix_element<T, F> program.
* Presumably the kernel evaluates A = OP(proxy.lhs()) entry by entry; the kernel source is not part of this file, so confirm there.
*
* @param A      Destination matrix. Its OpenCL context is used to initialize the program and to fetch the kernel.
* @param proxy  Unary expression object. lhs() and rhs() are both checked to share A's context, but only lhs() is passed to the kernel.
*/
00506       template <typename T, typename F, typename OP>
00507       void element_op(matrix_base<T, F> & A,
00508                       matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy)
00509       {
              // The operands are matrices, so the diagnostics name matrices (the messages previously said "Vectors").
00510         assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00511         assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Matrices do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00512
00513         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00514
00515         viennacl::linalg::opencl::kernels::matrix_element<T, F>::init(ctx);
00516         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_element<T, F>::program_name(), detail::op_to_string(OP()) + "_assign");
00517
              // Argument order: destination handle and layout (start, stride, size, internal size), then source handle and layout.
00518         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A),
00519                                  cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
00520                                  cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
00521                                  cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
00522                                  cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
00523
00524                                  viennacl::traits::opencl_handle(proxy.lhs()),
00525                                  cl_uint(viennacl::traits::start1(proxy.lhs())),           cl_uint(viennacl::traits::start2(proxy.lhs())),
00526                                  cl_uint(viennacl::traits::stride1(proxy.lhs())),          cl_uint(viennacl::traits::stride2(proxy.lhs())),
00527                                  cl_uint(viennacl::traits::internal_size1(proxy.lhs())),   cl_uint(viennacl::traits::internal_size2(proxy.lhs())))
00528                               );
00529       }
00530 
00531 
00532       //
00534       //
00535 
00536       // A * x
00537 
/** @brief Enqueues the "vec_mul" kernel of the matrix<NumericT, F> program for mat, vec and result.
*
* Presumably this computes the matrix-vector product result = mat * vec; the kernel source is not part of this file.
* The result vector is not resized here: it must already hold size1(mat) entries.
* In-place use (vec and result sharing the same memory handle) is rejected by an assertion.
*
* @param mat     The matrix. Its OpenCL context is used to initialize the program and to fetch the kernel.
* @param vec     The vector the matrix is applied to; must have size2(mat) entries.
* @param result  The vector receiving the product; must have size1(mat) entries and must not alias vec.
*/
00546       template <typename NumericT, typename F>
00547       void prod_impl(const matrix_base<NumericT, F> & mat,
00548                      const vector_base<NumericT> & vec,
00549                            vector_base<NumericT> & result)
00550       {
00551         typedef NumericT        value_type;
00552
00553         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context());
00554         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00555         KernelClass::init(ctx);
00556
00557         assert(mat.size2() == vec.size() && bool("Size check failed for matrix-vector product: size2(A) == size(x)"));
00558         // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
00559         assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!"));
00560         assert(mat.size1() == result.size() && bool("Size check failed for matrix-vector product: size1(A) == size(result)"));
00561
00562         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "vec_mul");
00563         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat),
00564                                 cl_uint(viennacl::traits::start1(mat)),         cl_uint(viennacl::traits::start2(mat)),
00565                                 cl_uint(viennacl::traits::stride1(mat)),        cl_uint(viennacl::traits::stride2(mat)),
00566                                 cl_uint(viennacl::traits::size1(mat)),          cl_uint(viennacl::traits::size2(mat)),
00567                                 cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)),
00568
00569                                 viennacl::traits::opencl_handle(vec),
00570                                 cl_uint(viennacl::traits::start(vec)),
00571                                 cl_uint(viennacl::traits::stride(vec)),
00572                                 cl_uint(viennacl::traits::size(vec)),
00573
00574                                 viennacl::traits::opencl_handle(result),
00575                                 cl_uint(viennacl::traits::start(result)),
00576                                 cl_uint(viennacl::traits::stride(result)),
00577                                 cl_uint(viennacl::traits::size(result)),
00578
                                      // Local memory argument: sizeof(value_type) bytes times the kernel's local work size.
00579                                 viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size())
00580                               ) );
00581       }
00582 
00583 
00584       // trans(A) * x
00585 
/** @brief Enqueues the "trans_vec_mul" kernel of the matrix<NumericT, F> program for a transposed-matrix expression, vec and result.
*
* Presumably this computes result = trans(A) * vec with A = mat_trans.lhs(); the kernel source is not part of this file.
* The kernel receives the layout of the untransposed matrix mat_trans.lhs().
* In-place use (vec and result sharing the same memory handle) is rejected by an assertion.
*
* @param mat_trans  Transposition expression; size1/size2 of the expression are checked against result and vec.
* @param vec        The vector the transposed matrix is applied to. Its OpenCL context is used to fetch the kernel.
* @param result     The vector receiving the product; must not alias vec.
*/
00594       template <typename NumericT, typename F>
00595       void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans,
00596                      const vector_base<NumericT> & vec,
00597                            vector_base<NumericT> & result)
00598       {
00599         assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)"));
00600         assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)"));  //remember: mat is transposed!
00601
00602
00603         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
00604         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00605         KernelClass::init(ctx);
00606
00607
00608         // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead
00609         assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!"));
00610
00611         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "trans_vec_mul");
00612
              // Argument order: untransposed matrix handle and layout, then vec, then result, then the local memory buffer.
00613         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat_trans.lhs()),
00614                                 cl_uint(viennacl::traits::start1(mat_trans.lhs())),         cl_uint(viennacl::traits::start2(mat_trans.lhs())),
00615                                 cl_uint(viennacl::traits::stride1(mat_trans.lhs())),        cl_uint(viennacl::traits::stride2(mat_trans.lhs())),
00616                                 cl_uint(viennacl::traits::size1(mat_trans.lhs())),          cl_uint(viennacl::traits::size2(mat_trans.lhs())),
00617                                 cl_uint(viennacl::traits::internal_size1(mat_trans.lhs())), cl_uint(viennacl::traits::internal_size2(mat_trans.lhs())),
00618
00619                                 viennacl::traits::opencl_handle(vec),
00620                                 cl_uint(viennacl::traits::start(vec)),
00621                                 cl_uint(viennacl::traits::stride(vec)),
00622                                 cl_uint(viennacl::traits::size(vec)),
00623
00624                                 viennacl::traits::opencl_handle(result),
00625                                 cl_uint(viennacl::traits::start(result)),
00626                                 cl_uint(viennacl::traits::stride(result)),
00627                                 cl_uint(viennacl::traits::size(result)),
00628
                                      // Local memory argument: sizeof(NumericT) bytes times the kernel's local work size.
00629                                 viennacl::ocl::local_mem(sizeof(NumericT) * k.local_work_size())
00630                               ) );
00631       }
00632 
00633 
00634       //
00636       //
00637 
00638       namespace detail
00639       {
00640         // C = A * B and possibly transposed variants
/** @brief Launches the named general-purpose matrix-matrix product kernel on a 2D range with 16x16 work groups.
*
* The kernel is taken from the matrix_prod program selected by the value type of T1 and the orientation functors of T1, T2, T3.
* The global work size is size1(C) x size2(C), each passed through align_to_multiple(..., 16).
* Presumably the kernel computes C = alpha * op(A) * op(B) + beta * C, with the transpositions encoded in kernel_name
* (kernel source not visible here).
*
* @param A            First factor; its OpenCL context is used to fetch and launch the kernel
* @param B            Second factor
* @param C            Result matrix
* @param alpha        Scalar passed as first kernel argument after conversion to cpu_value_type
* @param beta         Scalar passed ahead of C's handle after conversion to cpu_value_type
* @param kernel_name  Name of the kernel to look up; taken by const reference to avoid a string copy per launch
*/
00641         template <typename T1, typename T2, typename T3, typename ScalarType >
00642         void prod_slow_kernel(const T1 & A,
00643                               const T2 & B,
00644                               T3 & C,
00645                               ScalarType alpha,
00646                               ScalarType beta,
00647                               const std::string & kernel_name)
00648         {
00649           typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
00650           typedef typename viennacl::result_of::orientation_functor<T1>::type   orientation_A;
00651           typedef typename viennacl::result_of::orientation_functor<T2>::type   orientation_B;
00652           typedef typename viennacl::result_of::orientation_functor<T3>::type   orientation_C;
00653
00654           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00655
00656           typedef viennacl::linalg::opencl::kernels::matrix_prod<cpu_value_type, orientation_A, orientation_B, orientation_C>    KernelClass;
00657           KernelClass::init(ctx);
00658
00659           //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
00660           viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
00661
                // Global sizes are aligned to the 16x16 local size so that arbitrary dimensions of C can be handled.
00662           k.global_work_size(0, viennacl::tools::align_to_multiple<unsigned int>(static_cast<unsigned int>(viennacl::traits::size1(C)), 16));
00663           k.global_work_size(1, viennacl::tools::align_to_multiple<unsigned int>(static_cast<unsigned int>(viennacl::traits::size2(C)), 16));
00664           k.local_work_size(0, 16);
00665           k.local_work_size(1, 16);
00666
00667           cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
00668           cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);
00669
00670           viennacl::ocl::enqueue(k(cl_alpha,
00671                                   viennacl::traits::opencl_handle(A),
00672                                   cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
00673                                   cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
00674                                   cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
00675                                   cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
00676
00677                                   viennacl::traits::opencl_handle(B),
00678                                   cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
00679                                   cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
00680                                   cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
00681                                   cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
00682
00683                                   cl_beta,
00684                                   viennacl::traits::opencl_handle(C),
00685                                   cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
00686                                   cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
00687                                   cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
00688                                   cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
00689                                   )
00690                                 );
00691         }
00692 
00693         // C = A * B, using fast kernel for NVIDIA
/** @brief Launches the named blocked matrix-matrix product kernel (the "fast kernel for NVIDIA") with 16x4 work groups.
*
* The kernel is taken from the matrix_prod program selected by the value type of T1 and the orientation functors of T1, T2, T3.
* The global work size is size2(C)/4 (column blocks) by size1(C)/4 (row blocks), using integer division without rounding up.
* The caller must therefore make sure the dimensions divide evenly; detail::prod() only dispatches here when all
* dimensions of A and B are multiples of 64.
* Presumably the kernel computes C = alpha * op(A) * op(B) + beta * C, with the transpositions encoded in kernel_name
* (kernel source not visible here).
*
* @param A            First factor; its OpenCL context is used to fetch and launch the kernel
* @param B            Second factor
* @param C            Result matrix
* @param alpha        Scalar passed as first kernel argument after conversion to cpu_value_type
* @param beta         Scalar passed ahead of C's handle after conversion to cpu_value_type
* @param kernel_name  Name of the kernel to look up; taken by const reference to avoid a string copy per launch
*/
00694         template <typename T1, typename T2, typename T3, typename ScalarType >
00695         void prod_fast_kernel(const T1 & A,
00696                               const T2 & B,
00697                               T3 & C,
00698                               ScalarType alpha,
00699                               ScalarType beta,
00700                               const std::string & kernel_name)
00701         {
00702           typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
00703           typedef typename viennacl::result_of::orientation_functor<T1>::type   orientation_A;
00704           typedef typename viennacl::result_of::orientation_functor<T2>::type   orientation_B;
00705           typedef typename viennacl::result_of::orientation_functor<T3>::type   orientation_C;
00706
00707           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00708
00709           typedef viennacl::linalg::opencl::kernels::matrix_prod<cpu_value_type, orientation_A, orientation_B, orientation_C>    KernelClass;
00710           KernelClass::init(ctx);
00711
00712           //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
00713           viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);
00714
00715           k.global_work_size(0, viennacl::traits::size2(C) / 4); //column blocks
00716           k.global_work_size(1, viennacl::traits::size1(C) / 4); //row blocks
00717           k.local_work_size(0, 16);  //columns
00718           k.local_work_size(1, 4);   //rows
00719
00720           cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
00721           cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);
00722
00723           viennacl::ocl::enqueue(k(cl_alpha,
00724                                   viennacl::traits::opencl_handle(A),
00725                                   cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
00726                                   cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
00727                                   cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
00728                                   cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),
00729
00730                                   viennacl::traits::opencl_handle(B),
00731                                   cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
00732                                   cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
00733                                   cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
00734                                   cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),
00735
00736                                   cl_beta,
00737                                   viennacl::traits::opencl_handle(C),
00738                                   cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
00739                                   cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
00740                                   cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
00741                                   cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
00742                                   )
00743                                 );
00744         }
00745 
/** @brief Chooses between the fast and the slow matrix-matrix product kernel based on the dimensions of A and B.
*
* Dispatch rule:
*  - any dimension of A or B below 64: slow kernel
*  - otherwise, all four dimensions multiples of 64: fast kernel
*  - otherwise: slow kernel
*
* A and B are the matrices as stored: callers with transposed operands pass the underlying matrix (e.g. A.lhs())
* and select the transposition through the kernel names.
*
* @param A                 First factor
* @param B                 Second factor
* @param C                 Result matrix
* @param alpha             Scalar forwarded to the selected kernel launcher
* @param beta              Scalar forwarded to the selected kernel launcher
* @param fast_kernel_name  Kernel name handed to prod_fast_kernel(); taken by const reference to avoid a copy per call
* @param slow_kernel_name  Kernel name handed to prod_slow_kernel(); taken by const reference to avoid a copy per call
*/
00746         template <typename T1, typename T2, typename T3, typename ScalarType >
00747         void prod(const T1 & A,
00748                   const T2 & B,
00749                   T3 & C,
00750                   ScalarType alpha,
00751                   ScalarType beta,
00752                   const std::string & fast_kernel_name,
00753                   const std::string & slow_kernel_name)
00754         {
00755           if (   (viennacl::traits::size1(A) < 64)
00756               || (viennacl::traits::size2(A) < 64)
00757               || (viennacl::traits::size1(B) < 64)
00758               || (viennacl::traits::size2(B) < 64) )   //there is most likely not enough to compute, rendering kernel launch overhead considerable
00759           {
00760             prod_slow_kernel(A, B, C, alpha, beta, slow_kernel_name);
00761           }
00762           else if (   (viennacl::traits::size1(A) % 64 == 0)
00763                    && (viennacl::traits::size2(A) % 64 == 0)
00764                    && (viennacl::traits::size1(B) % 64 == 0)
00765                    && (viennacl::traits::size2(B) % 64 == 0) )   // allows the use of the fast NVIDIA kernel
00766           {
00767             prod_fast_kernel(A, B, C, alpha, beta, fast_kernel_name);
00768             //prod_slow_kernel(A, B, C, slow_kernel_name);
00769           }
00770           else //TODO: use four kernels
00771           {
00772             prod_slow_kernel(A, B, C, alpha, beta, slow_kernel_name);
00773           }
00774
00775         }
00776       } // namespace detail
00777 
00778 
/** @brief Matrix-matrix product with scaling: enqueues C = alpha * prod(A, B) + beta * C.
*
* Two execution paths:
*  - If any operand has an internal size that is not a multiple of matrix_base<...>::alignment, or has a nonzero start
*    offset or a stride larger than one, the hand-written kernels are used via detail::prod() ("prod16_AA" / "prod_AA").
*  - Otherwise the statement C = alpha * prod(A, B) + beta * C is handed to the kernel generator.
*
* @param A      First factor; size1(A) must equal size1(C)
* @param B      Second factor; size1(B) must equal size2(A), size2(B) must equal size2(C)
* @param C      Result matrix (also read when scaled by beta on the generator path)
* @param alpha  Scaling factor for the product
* @param beta   Scaling factor for the previous contents of C
*/
00784       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
00785       void prod_impl(const matrix_base<NumericT, F1> & A,
00786                      const matrix_base<NumericT, F2> & B,
00787                            matrix_base<NumericT, F3> & C,
00788                      ScalarType alpha,
00789                      ScalarType beta)
00790       {
00791         assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)"));
00792         assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(A, B): size2(A) != size1(B)"));
00793         assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)"));
00794
              // An operand counts as "not aligned" if either internal dimension is not a multiple of the matrix alignment.
00795         bool A_not_aligned = (A.internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.internal_size2()%matrix_base<NumericT, F1>::alignment>0);
00796         bool B_not_aligned = (B.internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.internal_size2()%matrix_base<NumericT, F2>::alignment>0);
00797         bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
00798         // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
00799         /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
00800               && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
00801               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
00802
              // Unaligned operands or sub-matrix views (offset or strided) take the hand-written kernels; everything else goes to the generator.
00803         if(A_not_aligned || A.start1() > 0 || A.start2() > 0 || A.stride1() > 1 || A.stride2() > 1
00804          ||B_not_aligned || B.start1() > 0 || B.start2() > 0 || B.stride1() > 1 || B.stride2() > 1
00805          ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
00806           detail::prod(A, B, C, alpha, beta, "prod16_AA", "prod_AA");
00807         else{
00808           typedef matrix_expression<const matrix_base<NumericT, F1>, const matrix_base<NumericT, F2>, op_mat_mat_prod> ProdType;
00809           viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
00810         }
00811       }
00812 
00813 
00814 
/** @brief Matrix-matrix product with transposed first factor: enqueues C = alpha * prod(trans(A), B) + beta * C.
*
* A is a transposition expression; A.lhs() is the matrix as stored.
* Two execution paths:
*  - If any operand (A.lhs(), B or C) has an internal size that is not a multiple of matrix_base<...>::alignment, or has a
*    nonzero start offset or a stride larger than one, the hand-written kernels are used via detail::prod() with
*    A.lhs() as first factor ("prod16_TA" / "prod_TA").
*  - Otherwise the statement C = alpha * prod(A, B) + beta * C is handed to the kernel generator.
*
* @param A      Transposed first factor; size2(A.lhs()) must equal size1(C)
* @param B      Second factor; size1(B) must equal size1(A.lhs()), size2(B) must equal size2(C)
* @param C      Result matrix (also read when scaled by beta on the generator path)
* @param alpha  Scaling factor for the product
* @param beta   Scaling factor for the previous contents of C
*/
00820       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
00821       void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
00822                                                         const matrix_base<NumericT, F1>,
00823                                                         op_trans> & A,
00824                      const matrix_base<NumericT, F2> & B,
00825                            matrix_base<NumericT, F3> & C,
00826                      ScalarType alpha,
00827                      ScalarType beta)
00828       {
00829         //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
00830         //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
00831         assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)"));
00832         assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)"));
00833         assert( (viennacl::traits::size2(B)       == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)"));
00834
00835         // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
00836         /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
00837               && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
00838               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
00839
              // An operand counts as "not aligned" if either internal dimension is not a multiple of the matrix alignment.
00840         bool A_not_aligned = (A.lhs().internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.lhs().internal_size2()%matrix_base<NumericT, F1>::alignment>0);
00841         bool B_not_aligned = (B.internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.internal_size2()%matrix_base<NumericT, F2>::alignment>0);
00842         bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
00843
00844
              // Unaligned operands or sub-matrix views (offset or strided) take the hand-written kernels; everything else goes to the generator.
00845         if(A_not_aligned || A.lhs().start1() > 0 || A.lhs().start2() > 0 || A.lhs().stride1() > 1 || A.lhs().stride2() > 1
00846          ||B_not_aligned || B.start1() > 0 || B.start2() > 0 || B.stride1() > 1 || B.stride2() > 1
00847          ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
00848           detail::prod(A.lhs(), B, C, alpha, beta, "prod16_TA", "prod_TA");
00849         else{
00850           typedef const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> LhsType;
00851           typedef matrix_expression<LhsType, const matrix_base<NumericT, F2>, op_mat_mat_prod> ProdType;
00852           viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
00853         }
00854       }
00855 
00856 
00857 
00858 
/** @brief Matrix-matrix product with transposed second factor: enqueues C = alpha * prod(A, trans(B)) + beta * C.
*
* B is a transposition expression; B.lhs() is the matrix as stored.
* Two execution paths:
*  - If any operand (A, B.lhs() or C) has an internal size that is not a multiple of matrix_base<...>::alignment, or has a
*    nonzero start offset or a stride larger than one, the hand-written kernels are used via detail::prod() with
*    B.lhs() as second factor ("prod16_AT" / "prod_AT").
*  - Otherwise the statement C = alpha * prod(A, B) + beta * C is handed to the kernel generator.
*
* @param A      First factor; size1(A) must equal size1(C)
* @param B      Transposed second factor; size2(B.lhs()) must equal size2(A), size1(B.lhs()) must equal size2(C)
* @param C      Result matrix (also read when scaled by beta on the generator path)
* @param alpha  Scaling factor for the product
* @param beta   Scaling factor for the previous contents of C
*/
00864       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
00865       void prod_impl(const matrix_base<NumericT, F1> & A,
00866                      const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
00867                            matrix_base<NumericT, F3> & C,
00868                      ScalarType alpha,
00869                      ScalarType beta)
00870       {
00871         assert( (viennacl::traits::size1(A)       == viennacl::traits::size1(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)"));
00872         assert( (viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)"));
00873         assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)"));
00874
              // An operand counts as "not aligned" if either internal dimension is not a multiple of the matrix alignment.
00875         bool A_not_aligned = (A.internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.internal_size2()%matrix_base<NumericT, F1>::alignment>0);
00876         bool B_not_aligned = (B.lhs().internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.lhs().internal_size2()%matrix_base<NumericT, F2>::alignment>0);
00877         bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
00878
00879         // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
00880         /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
00881               && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
00882               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
00883
              // Unaligned operands or sub-matrix views (offset or strided) take the hand-written kernels; everything else goes to the generator.
00884         if(A_not_aligned || A.start1() > 0 || A.start2() > 0 || A.stride1() > 1 || A.stride2() > 1
00885          ||B_not_aligned || B.lhs().start1() > 0 || B.lhs().start2() > 0 || B.lhs().stride1() > 1 || B.lhs().stride2() > 1
00886          ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
00887           detail::prod(A, B.lhs(), C, alpha, beta, "prod16_AT", "prod_AT");
00888         else{
00889           typedef const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> RhsType;
00890           typedef matrix_expression<const matrix_base<NumericT, F1>, RhsType, op_mat_mat_prod> ProdType;
00891           viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
00892         }
00893       }
00894 
00895 
00896 
/** @brief Matrix-matrix product with both factors transposed: enqueues C = alpha * prod(trans(A), trans(B)) + beta * C.
*
* A and B are transposition expressions; A.lhs() and B.lhs() are the matrices as stored.
* Two execution paths:
*  - If any operand (A.lhs(), B.lhs() or C) has an internal size that is not a multiple of matrix_base<...>::alignment, or
*    has a nonzero start offset or a stride larger than one, the hand-written kernels are used via detail::prod() with
*    A.lhs() and B.lhs() as factors ("prod16_TT" / "prod_TT").
*  - Otherwise the statement C = alpha * prod(A, B) + beta * C is handed to the kernel generator.
*
* @param A      Transposed first factor; size2(A.lhs()) must equal size1(C)
* @param B      Transposed second factor; size2(B.lhs()) must equal size1(A.lhs()), size1(B.lhs()) must equal size2(C)
* @param C      Result matrix (also read when scaled by beta on the generator path)
* @param alpha  Scaling factor for the product
* @param beta   Scaling factor for the previous contents of C
*/
00902       template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
00903       void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
00904                      const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
00905                      matrix_base<NumericT, F3> & C,
00906                      ScalarType alpha,
00907                      ScalarType beta)
00908       {
00909         assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
00910         assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
00911         assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)"));
00912
              // An operand counts as "not aligned" if either internal dimension is not a multiple of the matrix alignment.
00913         bool A_not_aligned = (A.lhs().internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.lhs().internal_size2()%matrix_base<NumericT, F1>::alignment>0);
00914         bool B_not_aligned = (B.lhs().internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.lhs().internal_size2()%matrix_base<NumericT, F2>::alignment>0);
00915         bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);
00916
00917         // Inplace matrix-matrix products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
00918         /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
00919               && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
00920               && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/
00921
              // Unaligned operands or sub-matrix views (offset or strided) take the hand-written kernels; everything else goes to the generator.
00922         if(A_not_aligned || A.lhs().start1() > 0 || A.lhs().start2() > 0 || A.lhs().stride1() > 1 || A.lhs().stride2() > 1
00923          ||B_not_aligned || B.lhs().start1() > 0 || B.lhs().start2() > 0 || B.lhs().stride1() > 1 || B.lhs().stride2() > 1
00924          ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
00925           detail::prod(A.lhs(), B.lhs(), C, alpha, beta, "prod16_TT", "prod_TT");
00926         else{
00927           typedef const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> LhsType;
00928           typedef const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> RhsType;
00929           typedef matrix_expression<LhsType, RhsType, op_mat_mat_prod> ProdType;
00930           viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
00931         }
00932       }
00933 
00934 
00935 
00936 
00937       //
00939       //
00940 
00941 
/** @brief Enqueues the scaled rank-1 update kernel for mat1 with the scalar alpha and the vectors vec1 and vec2.
*
* Presumably this performs mat1 += alpha * vec1 * vec2^T (inferred from the function name; the kernel source is not part of this file).
* The kernel "scaled_rank1_update_cpu" is used when S1 is a host scalar (viennacl::is_cpu_scalar<S1>::value),
* otherwise "scaled_rank1_update_gpu".
*
* @param mat1              The matrix to be updated; its OpenCL context is used to fetch and launch the kernel
* @param alpha             The scaling factor (host scalar or device scalar)
* @param len_alpha         Folded into the options word via detail::make_options(); exact meaning is defined there
* @param reciprocal_alpha  Folded into the options word; presumably selects 1/alpha instead of alpha - confirm in make_options()
* @param flip_sign_alpha   Folded into the options word; presumably selects -alpha instead of alpha - confirm in make_options()
* @param vec1              The first vector; must have size1(mat1) entries
* @param vec2              The second vector; must have size2(mat1) entries
*/
00954       template <typename NumericT, typename F, typename S1>
00955       void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
00956                                 S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00957                                 const vector_base<NumericT> & vec1,
00958                                 const vector_base<NumericT> & vec2)
00959       {
00960         assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
00961         assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));
00962
00963         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
00964         typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
00965         KernelClass::init(ctx);
00966
              // The three alpha modifiers are packed into a single cl_uint that is passed to the kernel right after alpha.
00967         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00968
00969         viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), viennacl::is_cpu_scalar<S1>::value ? "scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");
00970
00971         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
00972                                  cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
00973                                  cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
00974                                  cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
00975                                  cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),
00976
00977                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
00978                                  options_alpha,
00979
00980                                  viennacl::traits::opencl_handle(vec1),
00981                                  cl_uint(viennacl::traits::start(vec1)),
00982                                  cl_uint(viennacl::traits::stride(vec1)),
00983                                  cl_uint(viennacl::traits::size(vec1)),
00984
00985                                  viennacl::traits::opencl_handle(vec2),
00986                                  cl_uint(viennacl::traits::start(vec2)),
00987                                  cl_uint(viennacl::traits::stride(vec2)),
00988                                  cl_uint(viennacl::traits::size(vec2))
00989                                 )
00990                               );
00991       }
00992 
00993     } // namespace opencl
00994   } //namespace linalg
00995 } //namespace viennacl
00996 
00997 
00998 #endif