ViennaCL - The Vienna Computing Library  1.5.2
viennacl/linalg/opencl/vector_operations.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
00002 #define VIENNACL_LINALG_OPENCL_VECTOR_OPERATIONS_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00025 #include <cmath>
00026 
00027 #include "viennacl/forwards.h"
00028 #include "viennacl/ocl/device.hpp"
00029 #include "viennacl/ocl/handle.hpp"
00030 #include "viennacl/ocl/kernel.hpp"
00031 #include "viennacl/scalar.hpp"
00032 #include "viennacl/tools/tools.hpp"
00033 #include "viennacl/linalg/opencl/common.hpp"
00034 #include "viennacl/linalg/opencl/kernels/vector.hpp"
00035 #include "viennacl/linalg/opencl/kernels/vector_element.hpp"
00036 #include "viennacl/meta/predicate.hpp"
00037 #include "viennacl/meta/enable_if.hpp"
00038 #include "viennacl/traits/size.hpp"
00039 #include "viennacl/traits/start.hpp"
00040 #include "viennacl/traits/handle.hpp"
00041 #include "viennacl/traits/stride.hpp"
00042 
00043 namespace viennacl
00044 {
00045   namespace linalg
00046   {
00047     namespace opencl
00048     {
00049       //
00050       // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend. No need to double-check again in here!
00051       //
00052 
00053 
00054       template <typename T, typename ScalarType1>
00055       void av(vector_base<T> & vec1,
00056               vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
00057       {
00058         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00059 
00060         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00061         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00062 
00063         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00064 
00065         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(),
00066                                                    (viennacl::is_cpu_scalar<ScalarType1>::value ? "av_cpu" : "av_gpu"));
00067         k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
00068                                                     viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
00069 
00070         viennacl::ocl::packed_cl_uint size_vec1;
00071         size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
00072         size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
00073         size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
00074         size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
00075 
00076         viennacl::ocl::packed_cl_uint size_vec2;
00077         size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
00078         size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
00079         size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
00080         size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
00081 
00082 
00083         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00084                                  size_vec1,
00085 
00086                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
00087                                  options_alpha,
00088                                  viennacl::traits::opencl_handle(vec2),
00089                                  size_vec2 )
00090                               );
00091       }
00092 
00093 
00094       template <typename T, typename ScalarType1, typename ScalarType2>
00095       void avbv(vector_base<T> & vec1,
00096                 vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00097                 vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00098       {
00099         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00100         assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00101 
00102         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00103         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00104 
00105         std::string kernel_name;
00106         if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
00107           kernel_name = "avbv_cpu_cpu";
00108         else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
00109           kernel_name = "avbv_cpu_gpu";
00110         else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
00111           kernel_name = "avbv_gpu_cpu";
00112         else
00113           kernel_name = "avbv_gpu_gpu";
00114 
00115         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00116         cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00117 
00118         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
00119         k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
00120                                                     viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
00121 
00122         viennacl::ocl::packed_cl_uint size_vec1;
00123         size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
00124         size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
00125         size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
00126         size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
00127 
00128         viennacl::ocl::packed_cl_uint size_vec2;
00129         size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
00130         size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
00131         size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
00132         size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
00133 
00134         viennacl::ocl::packed_cl_uint size_vec3;
00135         size_vec3.start  = cl_uint(viennacl::traits::start(vec3));
00136         size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
00137         size_vec3.size   = cl_uint(viennacl::traits::size(vec3));
00138         size_vec3.internal_size   = cl_uint(viennacl::traits::internal_size(vec3));
00139 
00140         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00141                                  size_vec1,
00142 
00143                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
00144                                  options_alpha,
00145                                  viennacl::traits::opencl_handle(vec2),
00146                                  size_vec2,
00147 
00148                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
00149                                  options_beta,
00150                                  viennacl::traits::opencl_handle(vec3),
00151                                  size_vec3 )
00152                               );
00153       }
00154 
00155 
00156       template <typename T, typename ScalarType1, typename ScalarType2>
00157       void avbv_v(vector_base<T> & vec1,
00158                   vector_base<T> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
00159                   vector_base<T> const & vec3, ScalarType2 const & beta,  vcl_size_t len_beta,  bool reciprocal_beta,  bool flip_sign_beta)
00160       {
00161         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00162         assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(vec3).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00163 
00164         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00165         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00166 
00167         std::string kernel_name;
00168         if (viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
00169           kernel_name = "avbv_v_cpu_cpu";
00170         else if (viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value)
00171           kernel_name = "avbv_v_cpu_gpu";
00172         else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value)
00173           kernel_name = "avbv_v_gpu_cpu";
00174         else
00175           kernel_name = "avbv_v_gpu_gpu";
00176 
00177         cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
00178         cl_uint options_beta  = detail::make_options(len_beta,  reciprocal_beta,  flip_sign_beta);
00179 
00180         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), kernel_name);
00181         k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
00182                                                     viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
00183 
00184         viennacl::ocl::packed_cl_uint size_vec1;
00185         size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
00186         size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
00187         size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
00188         size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
00189 
00190         viennacl::ocl::packed_cl_uint size_vec2;
00191         size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
00192         size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
00193         size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
00194         size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
00195 
00196         viennacl::ocl::packed_cl_uint size_vec3;
00197         size_vec3.start  = cl_uint(viennacl::traits::start(vec3));
00198         size_vec3.stride = cl_uint(viennacl::traits::stride(vec3));
00199         size_vec3.size   = cl_uint(viennacl::traits::size(vec3));
00200         size_vec3.internal_size   = cl_uint(viennacl::traits::internal_size(vec3));
00201 
00202         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00203                                  size_vec1,
00204 
00205                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(alpha)),
00206                                  options_alpha,
00207                                  viennacl::traits::opencl_handle(vec2),
00208                                  size_vec2,
00209 
00210                                  viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<T>(beta)),
00211                                  options_beta,
00212                                  viennacl::traits::opencl_handle(vec3),
00213                                  size_vec3 )
00214                               );
00215       }
00216 
00217 
00224       template <typename T>
00225       void vector_assign(vector_base<T> & vec1, const T & alpha, bool up_to_internal_size = false)
00226       {
00227         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00228         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00229 
00230         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "assign_cpu");
00231         k.global_work_size(0, std::min<vcl_size_t>(128 * k.local_work_size(),
00232                                                     viennacl::tools::align_to_multiple<vcl_size_t>(viennacl::traits::size(vec1), k.local_work_size()) ) );
00233 
00234         cl_uint size = up_to_internal_size ? cl_uint(vec1.internal_size()) : cl_uint(viennacl::traits::size(vec1));
00235         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00236                                  cl_uint(viennacl::traits::start(vec1)),
00237                                  cl_uint(viennacl::traits::stride(vec1)),
00238                                  size,
00239                                  cl_uint(vec1.internal_size()),     //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
00240                                  viennacl::traits::opencl_handle(T(alpha)) )
00241                               );
00242       }
00243 
00244 
00250       template <typename T>
00251       void vector_swap(vector_base<T> & vec1, vector_base<T> & vec2)
00252       {
00253         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00254 
00255         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00256         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00257 
00258         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "swap");
00259 
00260         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00261                                  cl_uint(viennacl::traits::start(vec1)),
00262                                  cl_uint(viennacl::traits::stride(vec1)),
00263                                  cl_uint(viennacl::traits::size(vec1)),
00264                                  viennacl::traits::opencl_handle(vec2),
00265                                  cl_uint(viennacl::traits::start(vec2)),
00266                                  cl_uint(viennacl::traits::stride(vec2)),
00267                                  cl_uint(viennacl::traits::size(vec2)))
00268                               );
00269       }
00270 
00272 
00278       template <typename T, typename OP>
00279       void element_op(vector_base<T> & vec1,
00280                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_binary<OP> > const & proxy)
00281       {
00282         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00283         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00284 
00285         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00286         viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
00287 
00288         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), "element_op");
00289 
00290         cl_uint op_type = 2; //0: product, 1: division, 2: power
00291         if (viennacl::is_division<OP>::value)
00292           op_type = 1;
00293         else if (viennacl::is_product<OP>::value)
00294           op_type = 0;
00295 
00296         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00297                                  cl_uint(viennacl::traits::start(vec1)),
00298                                  cl_uint(viennacl::traits::stride(vec1)),
00299                                  cl_uint(viennacl::traits::size(vec1)),
00300 
00301                                  viennacl::traits::opencl_handle(proxy.lhs()),
00302                                  cl_uint(viennacl::traits::start(proxy.lhs())),
00303                                  cl_uint(viennacl::traits::stride(proxy.lhs())),
00304 
00305                                  viennacl::traits::opencl_handle(proxy.rhs()),
00306                                  cl_uint(viennacl::traits::start(proxy.rhs())),
00307                                  cl_uint(viennacl::traits::stride(proxy.rhs())),
00308 
00309                                  op_type)
00310                               );
00311       }
00312 
00314 
00320       template <typename T, typename OP>
00321       void element_op(vector_base<T> & vec1,
00322                       vector_expression<const vector_base<T>, const vector_base<T>, op_element_unary<OP> > const & proxy)
00323       {
00324         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00325         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00326 
00327         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00328         viennacl::linalg::opencl::kernels::vector_element<T>::init(ctx);
00329 
00330         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_element<T>::program_name(), detail::op_to_string(OP()) + "_assign");
00331 
00332         viennacl::ocl::packed_cl_uint size_vec1;
00333         size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
00334         size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
00335         size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
00336         size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
00337 
00338         viennacl::ocl::packed_cl_uint size_vec2;
00339         size_vec2.start  = cl_uint(viennacl::traits::start(proxy.lhs()));
00340         size_vec2.stride = cl_uint(viennacl::traits::stride(proxy.lhs()));
00341         size_vec2.size   = cl_uint(viennacl::traits::size(proxy.lhs()));
00342         size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(proxy.lhs()));
00343 
00344         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00345                                  size_vec1,
00346                                  viennacl::traits::opencl_handle(proxy.lhs()),
00347                                  size_vec2)
00348                               );
00349       }
00350 
00352 
00359       template <typename T>
00360       void inner_prod_impl(vector_base<T> const & vec1,
00361                            vector_base<T> const & vec2,
00362                            vector_base<T> & partial_result)
00363       {
00364         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00365         assert(viennacl::traits::opencl_handle(vec2).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00366 
00367         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00368         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00369 
00370         assert( (viennacl::traits::size(vec1) == viennacl::traits::size(vec2))
00371               && bool("Incompatible vector sizes in inner_prod_impl()!"));
00372 
00373         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
00374 
00375         assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in inner_prod_impl()") );
00376 
00377         viennacl::ocl::packed_cl_uint size_vec1;
00378         size_vec1.start  = cl_uint(viennacl::traits::start(vec1));
00379         size_vec1.stride = cl_uint(viennacl::traits::stride(vec1));
00380         size_vec1.size   = cl_uint(viennacl::traits::size(vec1));
00381         size_vec1.internal_size   = cl_uint(viennacl::traits::internal_size(vec1));
00382 
00383         viennacl::ocl::packed_cl_uint size_vec2;
00384         size_vec2.start  = cl_uint(viennacl::traits::start(vec2));
00385         size_vec2.stride = cl_uint(viennacl::traits::stride(vec2));
00386         size_vec2.size   = cl_uint(viennacl::traits::size(vec2));
00387         size_vec2.internal_size   = cl_uint(viennacl::traits::internal_size(vec2));
00388 
00389         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00390                                  size_vec1,
00391                                  viennacl::traits::opencl_handle(vec2),
00392                                  size_vec2,
00393                                  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
00394                                  viennacl::traits::opencl_handle(partial_result)
00395                                 )
00396                               );
00397       }
00398 
00399 
00400       //implementation of inner product:
00401       //namespace {
00408       template <typename T>
00409       void inner_prod_impl(vector_base<T> const & vec1,
00410                            vector_base<T> const & vec2,
00411                            scalar<T> & result)
00412       {
00413         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00414         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00415 
00416         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00417 
00418         vcl_size_t work_groups = 128;
00419         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
00420         temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
00421 
00422         // Step 1: Compute partial inner products for each work group:
00423         inner_prod_impl(vec1, vec2, temp);
00424 
00425         // Step 2: Sum partial results:
00426         viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
00427 
00428         ksum.local_work_size(0, work_groups);
00429         ksum.global_work_size(0, work_groups);
00430         viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00431                                     cl_uint(viennacl::traits::start(temp)),
00432                                     cl_uint(viennacl::traits::stride(temp)),
00433                                     cl_uint(viennacl::traits::size(temp)),
00434                                     cl_uint(1),
00435                                     viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
00436                                     viennacl::traits::opencl_handle(result) )
00437                               );
00438       }
00439 
00440       namespace detail
00441       {
00442         template <typename ScalarT>
00443         viennacl::ocl::packed_cl_uint make_layout(vector_base<ScalarT> const & vec)
00444         {
00445           viennacl::ocl::packed_cl_uint ret;
00446           ret.start           = cl_uint(viennacl::traits::start(vec));
00447           ret.stride          = cl_uint(viennacl::traits::stride(vec));
00448           ret.size            = cl_uint(viennacl::traits::size(vec));
00449           ret.internal_size   = cl_uint(viennacl::traits::internal_size(vec));
00450           return ret;
00451         }
00452       }
00453 
00460       template <typename T>
00461       void inner_prod_impl(vector_base<T> const & x,
00462                            vector_tuple<T> const & vec_tuple,
00463                            vector_base<T> & result)
00464       {
00465         assert(viennacl::traits::opencl_handle(x).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00466 
00467         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(x).context());
00468         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00469         viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::init(ctx);
00470 
00471         vcl_size_t work_groups = 128;
00472 
00473         viennacl::vector<T> temp(work_groups, viennacl::traits::context(x));
00474         temp.resize(8 * work_groups, ctx); // bring default-constructed vectors to the correct size:
00475 
00476         viennacl::ocl::packed_cl_uint layout_x = detail::make_layout(x);
00477 
00478         viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "sum_inner_prod");
00479         viennacl::ocl::kernel & inner_prod_kernel_1 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "inner_prod1");
00480         viennacl::ocl::kernel & inner_prod_kernel_2 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod2");
00481         viennacl::ocl::kernel & inner_prod_kernel_3 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod3");
00482         viennacl::ocl::kernel & inner_prod_kernel_4 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod4");
00483         viennacl::ocl::kernel & inner_prod_kernel_8 = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector_multi_inner_prod<T>::program_name(), "inner_prod8");
00484 
00485         vcl_size_t current_index = 0;
00486         while (current_index < vec_tuple.const_size())
00487         {
00488           switch (vec_tuple.const_size() - current_index)
00489           {
00490             case 7:
00491             case 6:
00492             case 5:
00493             case 4:
00494             {
00495               vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
00496               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
00497               vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
00498               vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
00499               viennacl::ocl::enqueue(inner_prod_kernel_4( viennacl::traits::opencl_handle(x), layout_x,
00500                                                          viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
00501                                                          viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
00502                                                          viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
00503                                                          viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
00504                                                          viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 4 * inner_prod_kernel_4.local_work_size()),
00505                                                          viennacl::traits::opencl_handle(temp)
00506                                                         ) );
00507 
00508               ksum.local_work_size(0, work_groups);
00509               ksum.global_work_size(0, 4 * work_groups);
00510               viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00511                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 4 * ksum.local_work_size()),
00512                                           viennacl::traits::opencl_handle(result),
00513                                           cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
00514                                           cl_uint(viennacl::traits::stride(result))
00515                                           )
00516                                     );
00517             }
00518               current_index += 4;
00519               break;
00520 
00521             case 3:
00522             {
00523               vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
00524               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
00525               vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
00526               viennacl::ocl::enqueue(inner_prod_kernel_3( viennacl::traits::opencl_handle(x), layout_x,
00527                                                           viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
00528                                                           viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
00529                                                           viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
00530                                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 3 * inner_prod_kernel_3.local_work_size()),
00531                                                           viennacl::traits::opencl_handle(temp)
00532                                                          ) );
00533 
00534               ksum.local_work_size(0, work_groups);
00535               ksum.global_work_size(0, 3 * work_groups);
00536               viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00537                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 3 * ksum.local_work_size()),
00538                                           viennacl::traits::opencl_handle(result),
00539                                           cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
00540                                           cl_uint(viennacl::traits::stride(result))
00541                                           )
00542                                     );
00543             }
00544               current_index += 3;
00545               break;
00546 
00547             case 2:
00548             {
00549               vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
00550               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
00551               viennacl::ocl::enqueue(inner_prod_kernel_2( viennacl::traits::opencl_handle(x), layout_x,
00552                                                           viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
00553                                                           viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
00554                                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 2 * inner_prod_kernel_2.local_work_size()),
00555                                                           viennacl::traits::opencl_handle(temp)
00556                                                         ) );
00557 
00558               ksum.local_work_size(0, work_groups);
00559               ksum.global_work_size(0, 2 * work_groups);
00560               viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00561                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 2 * ksum.local_work_size()),
00562                                           viennacl::traits::opencl_handle(result),
00563                                           cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
00564                                           cl_uint(viennacl::traits::stride(result))
00565                                           )
00566                                     );
00567             }
00568               current_index += 2;
00569               break;
00570 
00571             case 1:
00572             {
00573               vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
00574               viennacl::ocl::enqueue(inner_prod_kernel_1( viennacl::traits::opencl_handle(x), layout_x,
00575                                                           viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
00576                                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 1 * inner_prod_kernel_1.local_work_size()),
00577                                                           viennacl::traits::opencl_handle(temp)
00578                                                         ) );
00579 
00580               ksum.local_work_size(0, work_groups);
00581               ksum.global_work_size(0, 1 * work_groups);
00582               viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00583                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 1 * ksum.local_work_size()),
00584                                           viennacl::traits::opencl_handle(result),
00585                                           cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
00586                                           cl_uint(viennacl::traits::stride(result))
00587                                           )
00588                                     );
00589             }
00590               current_index += 1;
00591               break;
00592 
00593             default: //8 or more vectors
00594             {
00595               vector_base<T> const & y0 = vec_tuple.const_at(current_index    );
00596               vector_base<T> const & y1 = vec_tuple.const_at(current_index + 1);
00597               vector_base<T> const & y2 = vec_tuple.const_at(current_index + 2);
00598               vector_base<T> const & y3 = vec_tuple.const_at(current_index + 3);
00599               vector_base<T> const & y4 = vec_tuple.const_at(current_index + 4);
00600               vector_base<T> const & y5 = vec_tuple.const_at(current_index + 5);
00601               vector_base<T> const & y6 = vec_tuple.const_at(current_index + 6);
00602               vector_base<T> const & y7 = vec_tuple.const_at(current_index + 7);
00603               viennacl::ocl::enqueue(inner_prod_kernel_8( viennacl::traits::opencl_handle(x), layout_x,
00604                                                           viennacl::traits::opencl_handle(y0), detail::make_layout(y0),
00605                                                           viennacl::traits::opencl_handle(y1), detail::make_layout(y1),
00606                                                           viennacl::traits::opencl_handle(y2), detail::make_layout(y2),
00607                                                           viennacl::traits::opencl_handle(y3), detail::make_layout(y3),
00608                                                           viennacl::traits::opencl_handle(y4), detail::make_layout(y4),
00609                                                           viennacl::traits::opencl_handle(y5), detail::make_layout(y5),
00610                                                           viennacl::traits::opencl_handle(y6), detail::make_layout(y6),
00611                                                           viennacl::traits::opencl_handle(y7), detail::make_layout(y7),
00612                                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 8 * inner_prod_kernel_8.local_work_size()),
00613                                                           viennacl::traits::opencl_handle(temp)
00614                                                         ) );
00615 
00616               ksum.local_work_size(0, work_groups);
00617               ksum.global_work_size(0, 8 * work_groups);
00618               viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00619                                           viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * 8 * ksum.local_work_size()),
00620                                           viennacl::traits::opencl_handle(result),
00621                                           cl_uint(viennacl::traits::start(result) + current_index * viennacl::traits::stride(result)),
00622                                           cl_uint(viennacl::traits::stride(result))
00623                                           )
00624                                     );
00625             }
00626               current_index += 8;
00627               break;
00628           }
00629         }
00630 
00631       }
00632 
00633 
      // Implementation of the inner product with the final reduction step carried out on the CPU:
00642       template <typename T>
00643       void inner_prod_cpu(vector_base<T> const & vec1,
00644                           vector_base<T> const & vec2,
00645                           T & result)
00646       {
00647         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00648 
00649         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00650 
00651         vcl_size_t work_groups = 128;
00652         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec1));
00653         temp.resize(work_groups, ctx); // bring default-constructed vectors to the correct size:
00654 
00655         // Step 1: Compute partial inner products for each work group:
00656         inner_prod_impl(vec1, vec2, temp);
00657 
00658         // Step 2: Sum partial results:
00659 
00660         // Now copy partial results from GPU back to CPU and run reduction there:
00661         std::vector<T> temp_cpu(work_groups);
00662         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
00663 
00664         result = 0;
00665         for (typename std::vector<T>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
00666           result += *it;
00667       }
00668 
00669 
00671 
00678       template <typename T>
00679       void norm_reduction_impl(vector_base<T> const & vec,
00680                                vector_base<T> & partial_result,
00681                                 cl_uint norm_id)
00682       {
00683         assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(partial_result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00684 
00685         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
00686         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00687 
00688         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "norm");
00689 
00690         assert( (k.global_work_size() / k.local_work_size() <= partial_result.size()) && bool("Size mismatch for partial reduction in norm_reduction_impl()") );
00691 
00692         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
00693                                  cl_uint(viennacl::traits::start(vec)),
00694                                  cl_uint(viennacl::traits::stride(vec)),
00695                                  cl_uint(viennacl::traits::size(vec)),
00696                                  cl_uint(norm_id),
00697                                  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
00698                                  viennacl::traits::opencl_handle(partial_result) )
00699                               );
00700       }
00701 
00702 
00704 
00710       template <typename T>
00711       void norm_1_impl(vector_base<T> const & vec,
00712                        scalar<T> & result)
00713       {
00714         assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00715 
00716         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
00717 
00718         vcl_size_t work_groups = 128;
00719         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
00720 
00721         // Step 1: Compute the partial work group results
00722         norm_reduction_impl(vec, temp, 1);
00723 
00724         // Step 2: Compute the partial reduction using OpenCL
00725         viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
00726 
00727         ksum.local_work_size(0, work_groups);
00728         ksum.global_work_size(0, work_groups);
00729         viennacl::ocl::enqueue(ksum(viennacl::traits::opencl_handle(temp),
00730                                     cl_uint(viennacl::traits::start(temp)),
00731                                     cl_uint(viennacl::traits::stride(temp)),
00732                                     cl_uint(viennacl::traits::size(temp)),
00733                                     cl_uint(1),
00734                                     viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
00735                                     result)
00736                               );
00737       }
00738 
00744       template <typename T>
00745       void norm_1_cpu(vector_base<T> const & vec,
00746                       T & result)
00747       {
00748         vcl_size_t work_groups = 128;
00749         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
00750 
00751         // Step 1: Compute the partial work group results
00752         norm_reduction_impl(vec, temp, 1);
00753 
00754         // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
00755         typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
00756 
00757         CPUVectorType temp_cpu(work_groups);
00758         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
00759 
00760         result = 0;
00761         for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
00762           result += static_cast<T>(*it);
00763       }
00764 
00765 
00766 
00768 
00769 
00775       template <typename T>
00776       void norm_2_impl(vector_base<T> const & vec,
00777                        scalar<T> & result)
00778       {
00779         assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00780 
00781         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
00782 
00783         vcl_size_t work_groups = 128;
00784         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
00785 
00786         // Step 1: Compute the partial work group results
00787         norm_reduction_impl(vec, temp, 2);
00788 
00789         // Step 2: Reduction via OpenCL
00790         viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
00791 
00792         ksum.local_work_size(0, work_groups);
00793         ksum.global_work_size(0, work_groups);
00794         viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
00795                                       cl_uint(viennacl::traits::start(temp)),
00796                                       cl_uint(viennacl::traits::stride(temp)),
00797                                       cl_uint(viennacl::traits::size(temp)),
00798                                       cl_uint(2),
00799                                       viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
00800                                       result)
00801                               );
00802       }
00803 
00809       template <typename T>
00810       void norm_2_cpu(vector_base<T> const & vec,
00811                       T & result)
00812       {
00813         vcl_size_t work_groups = 128;
00814         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
00815 
00816         // Step 1: Compute the partial work group results
00817         norm_reduction_impl(vec, temp, 2);
00818 
00819         // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
00820         typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
00821 
00822         CPUVectorType temp_cpu(work_groups);
00823         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
00824 
00825         result = 0;
00826         for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
00827           result += static_cast<T>(*it);
00828         result = std::sqrt(result);
00829       }
00830 
00831 
00832 
00834 
00840       template <typename T>
00841       void norm_inf_impl(vector_base<T> const & vec,
00842                          scalar<T> & result)
00843       {
00844         assert(viennacl::traits::opencl_handle(vec).context() == viennacl::traits::opencl_handle(result).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00845 
00846         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
00847 
00848         vcl_size_t work_groups = 128;
00849         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
00850 
00851         // Step 1: Compute the partial work group results
00852         norm_reduction_impl(vec, temp, 0);
00853 
00854         //part 2: parallel reduction of reduced kernel:
00855         viennacl::ocl::kernel & ksum = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "sum");
00856         ksum.local_work_size(0, work_groups);
00857         ksum.global_work_size(0, work_groups);
00858 
00859         viennacl::ocl::enqueue( ksum(viennacl::traits::opencl_handle(temp),
00860                                      cl_uint(viennacl::traits::start(temp)),
00861                                      cl_uint(viennacl::traits::stride(temp)),
00862                                      cl_uint(viennacl::traits::size(temp)),
00863                                      cl_uint(0),
00864                                      viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * ksum.local_work_size()),
00865                                      result)
00866                               );
00867       }
00868 
00874       template <typename T>
00875       void norm_inf_cpu(vector_base<T> const & vec,
00876                         T & result)
00877       {
00878         vcl_size_t work_groups = 128;
00879         viennacl::vector<T> temp(work_groups, viennacl::traits::context(vec));
00880 
00881         // Step 1: Compute the partial work group results
00882         norm_reduction_impl(vec, temp, 0);
00883 
00884         // Step 2: Now copy partial results from GPU back to CPU and run reduction there:
00885         typedef std::vector<typename viennacl::result_of::cl_type<T>::type>  CPUVectorType;
00886 
00887         CPUVectorType temp_cpu(work_groups);
00888         viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
00889 
00890         result = 0;
00891         for (typename CPUVectorType::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
00892           result = std::max(result, static_cast<T>(*it));
00893       }
00894 
00895 
00897 
00898       //This function should return a CPU scalar, otherwise statements like
00899       // vcl_rhs[index_norm_inf(vcl_rhs)]
00900       // are ambiguous
00906       template <typename T>
00907       cl_uint index_norm_inf(vector_base<T> const & vec)
00908       {
00909         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context());
00910         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00911 
00912         viennacl::ocl::handle<cl_mem> h = ctx.create_memory(CL_MEM_READ_WRITE, sizeof(cl_uint));
00913 
00914         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "index_norm_inf");
00915         //cl_uint size = static_cast<cl_uint>(vcl_vec.internal_size());
00916 
00917         //TODO: Use multi-group kernel for large vector sizes
00918 
00919         k.global_work_size(0, k.local_work_size());
00920         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec),
00921                                  cl_uint(viennacl::traits::start(vec)),
00922                                  cl_uint(viennacl::traits::stride(vec)),
00923                                  cl_uint(viennacl::traits::size(vec)),
00924                                  viennacl::ocl::local_mem(sizeof(typename viennacl::result_of::cl_type<T>::type) * k.local_work_size()),
00925                                  viennacl::ocl::local_mem(sizeof(cl_uint) * k.local_work_size()), h));
00926 
00927         //read value:
00928         cl_uint result;
00929         cl_int err = clEnqueueReadBuffer(ctx.get_queue().handle().get(), h.get(), CL_TRUE, 0, sizeof(cl_uint), &result, 0, NULL, NULL);
00930         VIENNACL_ERR_CHECK(err);
00931         return result;
00932       }
00933 
00934       //TODO: Special case vec1 == vec2 allows improvement!!
00944       template <typename T>
00945       void plane_rotation(vector_base<T> & vec1,
00946                           vector_base<T> & vec2,
00947                           T alpha, T beta)
00948       {
00949         assert(viennacl::traits::opencl_handle(vec1).context() == viennacl::traits::opencl_handle(vec2).context() && bool("Operands do not reside in the same OpenCL context. Automatic migration not yet supported!"));
00950 
00951         viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec1).context());
00952         viennacl::linalg::opencl::kernels::vector<T>::init(ctx);
00953 
00954         assert(viennacl::traits::size(vec1) == viennacl::traits::size(vec2));
00955         viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::vector<T>::program_name(), "plane_rotation");
00956 
00957         viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(vec1),
00958                                  cl_uint(viennacl::traits::start(vec1)),
00959                                  cl_uint(viennacl::traits::stride(vec1)),
00960                                  cl_uint(viennacl::traits::size(vec1)),
00961                                  viennacl::traits::opencl_handle(vec2),
00962                                  cl_uint(viennacl::traits::start(vec2)),
00963                                  cl_uint(viennacl::traits::stride(vec2)),
00964                                  cl_uint(viennacl::traits::size(vec2)),
00965                                  viennacl::traits::opencl_handle(alpha),
00966                                  viennacl::traits::opencl_handle(beta))
00967                               );
00968       }
00969 
00970     } //namespace opencl
00971   } //namespace linalg
00972 } //namespace viennacl
00973 
00974 
00975 #endif