ViennaCL - The Vienna Computing Library, version 1.5.2
00001 #ifndef VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_ 00002 #define VIENNACL_LINALG_OPENCL_MATRIX_OPERATIONS_HPP_ 00003 00004 /* ========================================================================= 00005 Copyright (c) 2010-2014, Institute for Microelectronics, 00006 Institute for Analysis and Scientific Computing, 00007 TU Wien. 00008 Portions of this software are copyright by UChicago Argonne, LLC. 00009 00010 ----------------- 00011 ViennaCL - The Vienna Computing Library 00012 ----------------- 00013 00014 Project Head: Karl Rupp rupp@iue.tuwien.ac.at 00015 00016 (A list of authors and contributors can be found in the PDF manual) 00017 00018 License: MIT (X11), see file LICENSE in the base directory 00019 ============================================================================= */ 00020 00025 #include "viennacl/forwards.h" 00026 #include "viennacl/ocl/device.hpp" 00027 #include "viennacl/ocl/handle.hpp" 00028 #include "viennacl/ocl/kernel.hpp" 00029 #include "viennacl/scalar.hpp" 00030 #include "viennacl/vector.hpp" 00031 #include "viennacl/vector_proxy.hpp" 00032 #include "viennacl/tools/tools.hpp" 00033 #include "viennacl/meta/enable_if.hpp" 00034 #include "viennacl/meta/predicate.hpp" 00035 #include "viennacl/meta/result_of.hpp" 00036 00037 #include "viennacl/scheduler/forwards.h" 00038 00039 #include "viennacl/generator/generate.hpp" 00040 00041 #include "viennacl/traits/size.hpp" 00042 #include "viennacl/traits/start.hpp" 00043 #include "viennacl/traits/handle.hpp" 00044 #include "viennacl/traits/stride.hpp" 00045 00046 #include "viennacl/linalg/opencl/common.hpp" 00047 00048 #include "viennacl/linalg/opencl/kernels/matrix.hpp" 00049 #include "viennacl/linalg/opencl/kernels/matrix_element.hpp" 00050 00051 #include "viennacl/linalg/opencl/kernels/matrix_prod.hpp" 00052 00053 00054 namespace viennacl 00055 { 00056 namespace linalg 00057 { 00058 namespace opencl 00059 { 00060 // 00061 // Introductory note: By convention, all dimensions are 
already checked in the dispatcher frontend. No need to double-check again in here! 00062 // 00063 00064 template <typename NumericT, typename F, 00065 typename ScalarType1> 00066 void am(matrix_base<NumericT, F> & mat1, 00067 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha) 00068 { 00069 typedef NumericT value_type; 00070 00071 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context()); 00072 typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00073 KernelClass::init(ctx); 00074 00075 cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00076 00077 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), 00078 (viennacl::is_cpu_scalar<ScalarType1>::value ? "am_cpu" : "am_gpu")); 00079 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1), 00080 cl_uint(viennacl::traits::start1(mat1)), cl_uint(viennacl::traits::start2(mat1)), 00081 cl_uint(viennacl::traits::stride1(mat1)), cl_uint(viennacl::traits::stride2(mat1)), 00082 cl_uint(viennacl::traits::size1(mat1)), cl_uint(viennacl::traits::size2(mat1)), 00083 cl_uint(viennacl::traits::internal_size1(mat1)), cl_uint(viennacl::traits::internal_size2(mat1)), 00084 00085 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)), 00086 options_alpha, 00087 viennacl::traits::opencl_handle(mat2), 00088 cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)), 00089 cl_uint(viennacl::traits::stride1(mat2)), cl_uint(viennacl::traits::stride2(mat2)), 00090 cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2)) 00091 ) 00092 ); 00093 } 00094 00095 00096 template <typename NumericT, typename F, 00097 typename ScalarType1, typename ScalarType2> 00098 void ambm(matrix_base<NumericT, F> & mat1, 00099 
matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 00100 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) 00101 { 00102 typedef NumericT value_type; 00103 00104 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context()); 00105 typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00106 KernelClass::init(ctx); 00107 00108 std::string kernel_name; 00109 if ( viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value) 00110 kernel_name = "ambm_cpu_cpu"; 00111 else if ( viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value) 00112 kernel_name = "ambm_cpu_gpu"; 00113 else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value) 00114 kernel_name = "ambm_gpu_cpu"; 00115 else 00116 kernel_name = "ambm_gpu_gpu"; 00117 00118 cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00119 cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); 00120 00121 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name); 00122 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1), 00123 cl_uint(viennacl::traits::start1(mat1)), cl_uint(viennacl::traits::start2(mat1)), 00124 cl_uint(viennacl::traits::stride1(mat1)), cl_uint(viennacl::traits::stride2(mat1)), 00125 cl_uint(viennacl::traits::size1(mat1)), cl_uint(viennacl::traits::size2(mat1)), 00126 cl_uint(viennacl::traits::internal_size1(mat1)), cl_uint(viennacl::traits::internal_size2(mat1)), 00127 00128 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)), 00129 options_alpha, 00130 viennacl::traits::opencl_handle(mat2), 00131 
cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)), 00132 cl_uint(viennacl::traits::stride1(mat2)), cl_uint(viennacl::traits::stride2(mat2)), 00133 cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2)), 00134 00135 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)), 00136 options_beta, 00137 viennacl::traits::opencl_handle(mat3), 00138 cl_uint(viennacl::traits::start1(mat3)), cl_uint(viennacl::traits::start2(mat3)), 00139 cl_uint(viennacl::traits::stride1(mat3)), cl_uint(viennacl::traits::stride2(mat3)), 00140 cl_uint(viennacl::traits::internal_size1(mat3)), cl_uint(viennacl::traits::internal_size2(mat3)) 00141 ) 00142 ); 00143 } 00144 00145 00146 template <typename NumericT, typename F, 00147 typename ScalarType1, typename ScalarType2> 00148 void ambm_m(matrix_base<NumericT, F> & mat1, 00149 matrix_base<NumericT, F> const & mat2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, 00150 matrix_base<NumericT, F> const & mat3, ScalarType2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta) 00151 { 00152 typedef NumericT value_type; 00153 00154 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context()); 00155 typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00156 KernelClass::init(ctx); 00157 00158 std::string kernel_name; 00159 if ( viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value) 00160 kernel_name = "ambm_m_cpu_cpu"; 00161 else if ( viennacl::is_cpu_scalar<ScalarType1>::value && !viennacl::is_cpu_scalar<ScalarType2>::value) 00162 kernel_name = "ambm_m_cpu_gpu"; 00163 else if (!viennacl::is_cpu_scalar<ScalarType1>::value && viennacl::is_cpu_scalar<ScalarType2>::value) 00164 kernel_name = "ambm_m_gpu_cpu"; 00165 else 00166 kernel_name = "ambm_m_gpu_gpu"; 00167 
00168 cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha); 00169 cl_uint options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta); 00170 00171 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name); 00172 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1), 00173 cl_uint(viennacl::traits::start1(mat1)), cl_uint(viennacl::traits::start2(mat1)), 00174 cl_uint(viennacl::traits::stride1(mat1)), cl_uint(viennacl::traits::stride2(mat1)), 00175 cl_uint(viennacl::traits::size1(mat1)), cl_uint(viennacl::traits::size2(mat1)), 00176 cl_uint(viennacl::traits::internal_size1(mat1)), cl_uint(viennacl::traits::internal_size2(mat1)), 00177 00178 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)), 00179 options_alpha, 00180 viennacl::traits::opencl_handle(mat2), 00181 cl_uint(viennacl::traits::start1(mat2)), cl_uint(viennacl::traits::start2(mat2)), 00182 cl_uint(viennacl::traits::stride1(mat2)), cl_uint(viennacl::traits::stride2(mat2)), 00183 cl_uint(viennacl::traits::internal_size1(mat2)), cl_uint(viennacl::traits::internal_size2(mat2)), 00184 00185 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(beta)), 00186 options_beta, 00187 viennacl::traits::opencl_handle(mat3), 00188 cl_uint(viennacl::traits::start1(mat3)), cl_uint(viennacl::traits::start2(mat3)), 00189 cl_uint(viennacl::traits::stride1(mat3)), cl_uint(viennacl::traits::stride2(mat3)), 00190 cl_uint(viennacl::traits::internal_size1(mat3)), cl_uint(viennacl::traits::internal_size2(mat3)) 00191 ) 00192 ); 00193 } 00194 00195 00196 00197 template <typename NumericT, typename F> 00198 void matrix_assign(matrix_base<NumericT, F> & mat, NumericT s, bool clear = false) 00199 { 00200 typedef NumericT value_type; 00201 00202 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00203 typedef 
viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00204 KernelClass::init(ctx); 00205 00206 value_type alpha = static_cast<value_type>(s); 00207 00208 cl_uint s1 = clear ? cl_uint(viennacl::traits::internal_size1(mat)) : cl_uint(viennacl::traits::size1(mat)); 00209 cl_uint s2 = clear ? cl_uint(viennacl::traits::internal_size2(mat)) : cl_uint(viennacl::traits::size2(mat)); 00210 00211 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "assign_cpu"); 00212 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat), 00213 cl_uint(viennacl::traits::start1(mat)), cl_uint(viennacl::traits::start2(mat)), 00214 cl_uint(viennacl::traits::stride1(mat)), cl_uint(viennacl::traits::stride2(mat)), 00215 s1, s2, 00216 cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)), 00217 viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)) 00218 ) 00219 ); 00220 } 00221 00222 template <typename NumericT, typename F> 00223 void matrix_diagonal_assign(matrix_base<NumericT, F> & mat, NumericT s) 00224 { 00225 typedef NumericT value_type; 00226 00227 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00228 typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00229 KernelClass::init(ctx); 00230 00231 value_type alpha = static_cast<value_type>(s); 00232 00233 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "diagonal_assign_cpu"); 00234 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat), 00235 cl_uint(viennacl::traits::start1(mat)), cl_uint(viennacl::traits::start2(mat)), 00236 cl_uint(viennacl::traits::stride1(mat)), cl_uint(viennacl::traits::stride2(mat)), 00237 cl_uint(viennacl::traits::size1(mat)), cl_uint(viennacl::traits::size2(mat)), 00238 cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)), 00239 
viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<value_type>(alpha)) 00240 ) 00241 ); 00242 } 00243 00244 template <typename NumericT, typename F> 00245 void matrix_diag_from_vector(const vector_base<NumericT> & vec, int k, matrix_base<NumericT, F> & mat) 00246 { 00247 // Step 1: set everything to zero 00248 matrix_assign(mat, NumericT(0)); 00249 00250 // Step 2: set the diagonal: 00251 00252 // reuse vector ambm kernel for assigning the elements: 00253 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00254 typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass; 00255 KernelClass::init(ctx); 00256 00257 cl_uint options_alpha = 0; 00258 viennacl::ocl::packed_cl_uint size_mat; 00259 if (viennacl::is_row_major<F>::value) 00260 { 00261 vcl_size_t first_row_index = 0; 00262 vcl_size_t first_col_index = 0; 00263 if (k < 0) 00264 first_row_index = vcl_size_t(-k); 00265 else 00266 first_col_index = vcl_size_t(k); 00267 size_mat.start = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) 00268 + viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)); 00269 size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat)); 00270 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00271 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00272 } 00273 else 00274 { 00275 vcl_size_t first_row_index = 0; 00276 vcl_size_t first_col_index = 0; 00277 if (k < 0) 00278 first_row_index = vcl_size_t(-k); 00279 else 00280 first_col_index = vcl_size_t(k); 00281 size_mat.start = cl_uint( viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat) 00282 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * 
viennacl::traits::internal_size1(mat)); 00283 size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat)); 00284 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00285 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00286 } 00287 00288 viennacl::ocl::packed_cl_uint size_vec; 00289 size_vec.start = cl_uint(viennacl::traits::start(vec)); 00290 size_vec.stride = cl_uint(viennacl::traits::stride(vec)); 00291 size_vec.size = cl_uint(viennacl::traits::size(vec)); 00292 size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00293 00294 viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu"); 00295 viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(mat), 00296 size_mat, 00297 00298 viennacl::traits::opencl_handle(NumericT(1)), 00299 options_alpha, 00300 viennacl::traits::opencl_handle(vec), 00301 size_vec) 00302 ); 00303 } 00304 00305 template <typename NumericT, typename F> 00306 void matrix_diag_to_vector(const matrix_base<NumericT, F> & mat, int k, vector_base<NumericT> & vec) 00307 { 00308 // reuse vector ambm kernel for assigning the elements: 00309 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00310 typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass; 00311 KernelClass::init(ctx); 00312 00313 cl_uint options_alpha = 0; 00314 viennacl::ocl::packed_cl_uint size_mat; 00315 if (viennacl::is_row_major<F>::value) 00316 { 00317 vcl_size_t first_row_index = 0; 00318 vcl_size_t first_col_index = 0; 00319 if (k < 0) 00320 first_row_index = vcl_size_t(-k); 00321 else 00322 first_col_index = vcl_size_t(k); 00323 size_mat.start = cl_uint( (viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) 00324 + viennacl::traits::start2(mat) + first_col_index * 
viennacl::traits::stride2(mat)); 00325 size_mat.stride = cl_uint(viennacl::traits::stride1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::stride2(mat)); 00326 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00327 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00328 } 00329 else 00330 { 00331 vcl_size_t first_row_index = 0; 00332 vcl_size_t first_col_index = 0; 00333 if (k < 0) 00334 first_row_index = vcl_size_t(-k); 00335 else 00336 first_col_index = vcl_size_t(k); 00337 size_mat.start = cl_uint( viennacl::traits::start1(mat) + first_row_index * viennacl::traits::stride1(mat) 00338 + (viennacl::traits::start2(mat) + first_col_index * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat)); 00339 size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat) + viennacl::traits::stride1(mat)); 00340 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00341 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00342 } 00343 00344 viennacl::ocl::packed_cl_uint size_vec; 00345 size_vec.start = cl_uint(viennacl::traits::start(vec)); 00346 size_vec.stride = cl_uint(viennacl::traits::stride(vec)); 00347 size_vec.size = cl_uint(viennacl::traits::size(vec)); 00348 size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00349 00350 00351 viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu"); 00352 viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec), 00353 size_vec, 00354 00355 viennacl::traits::opencl_handle(NumericT(1)), 00356 options_alpha, 00357 viennacl::traits::opencl_handle(mat), 00358 size_mat) 00359 ); 00360 } 00361 00362 template <typename NumericT, typename F> 00363 void matrix_row(const matrix_base<NumericT, F> & mat, unsigned int i, vector_base<NumericT> & vec) 00364 { 00365 // reuse vector ambm kernel for assigning the elements: 00366 viennacl::ocl::context & ctx = 
const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00367 typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass; 00368 KernelClass::init(ctx); 00369 00370 cl_uint options_alpha = 0; 00371 viennacl::ocl::packed_cl_uint size_mat; 00372 if (viennacl::is_row_major<F>::value) 00373 { 00374 size_mat.start = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat)); 00375 size_mat.stride = cl_uint(viennacl::traits::stride2(mat)); 00376 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00377 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00378 } 00379 else 00380 { 00381 size_mat.start = cl_uint((viennacl::traits::start1(mat) + i * viennacl::traits::stride1(mat)) + viennacl::traits::start2(mat) * viennacl::traits::internal_size1(mat)); 00382 size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size1(mat)); 00383 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00384 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00385 } 00386 00387 viennacl::ocl::packed_cl_uint size_vec; 00388 size_vec.start = cl_uint(viennacl::traits::start(vec)); 00389 size_vec.stride = cl_uint(viennacl::traits::stride(vec)); 00390 size_vec.size = cl_uint(viennacl::traits::size(vec)); 00391 size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00392 00393 00394 viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu"); 00395 viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec), 00396 size_vec, 00397 00398 viennacl::traits::opencl_handle(NumericT(1)), 00399 options_alpha, 00400 viennacl::traits::opencl_handle(mat), 00401 size_mat) 00402 ); 00403 } 00404 00405 template <typename NumericT, typename F> 00406 void matrix_column(const matrix_base<NumericT, F> & mat, unsigned int j, vector_base<NumericT> & vec) 00407 { 00408 
// reuse vector ambm kernel for assigning the elements: 00409 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00410 typedef viennacl::linalg::opencl::kernels::vector<NumericT> KernelClass; 00411 KernelClass::init(ctx); 00412 00413 cl_uint options_alpha = 0; 00414 viennacl::ocl::packed_cl_uint size_mat; 00415 if (viennacl::is_row_major<F>::value) 00416 { 00417 size_mat.start = cl_uint(viennacl::traits::start1(mat) * viennacl::traits::internal_size2(mat) + viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)); 00418 size_mat.stride = cl_uint(viennacl::traits::stride2(mat) * viennacl::traits::internal_size2(mat)); 00419 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00420 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00421 } 00422 else 00423 { 00424 size_mat.start = cl_uint(viennacl::traits::start1(mat) + (viennacl::traits::start2(mat) + j * viennacl::traits::stride2(mat)) * viennacl::traits::internal_size1(mat)); 00425 size_mat.stride = cl_uint(viennacl::traits::stride2(mat)); 00426 size_mat.size = cl_uint(viennacl::traits::size(vec)); 00427 size_mat.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00428 } 00429 00430 viennacl::ocl::packed_cl_uint size_vec; 00431 size_vec.start = cl_uint(viennacl::traits::start(vec)); 00432 size_vec.stride = cl_uint(viennacl::traits::stride(vec)); 00433 size_vec.size = cl_uint(viennacl::traits::size(vec)); 00434 size_vec.internal_size = cl_uint(viennacl::traits::internal_size(vec)); 00435 00436 00437 viennacl::ocl::kernel & kern = ctx.get_kernel(KernelClass::program_name(), "av_cpu"); 00438 viennacl::ocl::enqueue(kern(viennacl::traits::opencl_handle(vec), 00439 size_vec, 00440 00441 viennacl::traits::opencl_handle(NumericT(1)), 00442 options_alpha, 00443 viennacl::traits::opencl_handle(mat), 00444 size_mat) 00445 ); 00446 } 00447 00448 00449 // 00451 // 00452 00453 // Binary operations A = B .* C and A = B 
./ C 00459 template <typename T, typename F, typename OP> 00460 void element_op(matrix_base<T, F> & A, 00461 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_binary<OP> > const & proxy) 00462 { 00463 assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); 00464 assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); 00465 00466 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); 00467 typedef viennacl::linalg::opencl::kernels::matrix<T, F> KernelClass; 00468 KernelClass::init(ctx); 00469 00470 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "element_op"); 00471 00472 cl_uint op_type = 2; //0: product, 1: division, 2: power 00473 if (viennacl::is_division<OP>::value) 00474 op_type = 1; 00475 else if (viennacl::is_product<OP>::value) 00476 op_type = 0; 00477 00478 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A), 00479 cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)), 00480 cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)), 00481 cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)), 00482 cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)), 00483 00484 viennacl::traits::opencl_handle(proxy.lhs()), 00485 cl_uint(viennacl::traits::start1(proxy.lhs())), cl_uint(viennacl::traits::start2(proxy.lhs())), 00486 cl_uint(viennacl::traits::stride1(proxy.lhs())), cl_uint(viennacl::traits::stride2(proxy.lhs())), 00487 cl_uint(viennacl::traits::internal_size1(proxy.lhs())), cl_uint(viennacl::traits::internal_size2(proxy.lhs())), 00488 
00489 viennacl::traits::opencl_handle(proxy.rhs()), 00490 cl_uint(viennacl::traits::start1(proxy.rhs())), cl_uint(viennacl::traits::start2(proxy.rhs())), 00491 cl_uint(viennacl::traits::stride1(proxy.rhs())), cl_uint(viennacl::traits::stride2(proxy.rhs())), 00492 cl_uint(viennacl::traits::internal_size1(proxy.rhs())), cl_uint(viennacl::traits::internal_size2(proxy.rhs())), 00493 00494 op_type) 00495 ); 00496 } 00497 00498 00499 // Unary operations 00500 00506 template <typename T, typename F, typename OP> 00507 void element_op(matrix_base<T, F> & A, 00508 matrix_expression<const matrix_base<T, F>, const matrix_base<T, F>, op_element_unary<OP> > const & proxy) 00509 { 00510 assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.lhs()).context() && bool("Vectors do not reside in the same OpenCL context. Automatic migration not yet supported!")); 00511 assert(viennacl::traits::opencl_handle(A).context() == viennacl::traits::opencl_handle(proxy.rhs()).context() && bool("Vectors do not reside in the same OpenCL context. 
Automatic migration not yet supported!")); 00512 00513 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context()); 00514 00515 viennacl::linalg::opencl::kernels::matrix_element<T, F>::init(ctx); 00516 viennacl::ocl::kernel & k = ctx.get_kernel(viennacl::linalg::opencl::kernels::matrix_element<T, F>::program_name(), detail::op_to_string(OP()) + "_assign"); 00517 00518 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(A), 00519 cl_uint(viennacl::traits::start1(A)), cl_uint(viennacl::traits::start2(A)), 00520 cl_uint(viennacl::traits::stride1(A)), cl_uint(viennacl::traits::stride2(A)), 00521 cl_uint(viennacl::traits::size1(A)), cl_uint(viennacl::traits::size2(A)), 00522 cl_uint(viennacl::traits::internal_size1(A)), cl_uint(viennacl::traits::internal_size2(A)), 00523 00524 viennacl::traits::opencl_handle(proxy.lhs()), 00525 cl_uint(viennacl::traits::start1(proxy.lhs())), cl_uint(viennacl::traits::start2(proxy.lhs())), 00526 cl_uint(viennacl::traits::stride1(proxy.lhs())), cl_uint(viennacl::traits::stride2(proxy.lhs())), 00527 cl_uint(viennacl::traits::internal_size1(proxy.lhs())), cl_uint(viennacl::traits::internal_size2(proxy.lhs()))) 00528 ); 00529 } 00530 00531 00532 // 00534 // 00535 00536 // A * x 00537 00546 template <typename NumericT, typename F> 00547 void prod_impl(const matrix_base<NumericT, F> & mat, 00548 const vector_base<NumericT> & vec, 00549 vector_base<NumericT> & result) 00550 { 00551 typedef NumericT value_type; 00552 00553 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat).context()); 00554 typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00555 KernelClass::init(ctx); 00556 00557 assert(mat.size2() == vec.size()); 00558 // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead 00559 assert(viennacl::traits::handle(vec) != 
viennacl::traits::handle(result) && bool("No direct inplace matrix-vector product possible. Introduce a temporary!")); 00560 //result.resize(mat.size1()); 00561 00562 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "vec_mul"); 00563 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat), 00564 cl_uint(viennacl::traits::start1(mat)), cl_uint(viennacl::traits::start2(mat)), 00565 cl_uint(viennacl::traits::stride1(mat)), cl_uint(viennacl::traits::stride2(mat)), 00566 cl_uint(viennacl::traits::size1(mat)), cl_uint(viennacl::traits::size2(mat)), 00567 cl_uint(viennacl::traits::internal_size1(mat)), cl_uint(viennacl::traits::internal_size2(mat)), 00568 00569 viennacl::traits::opencl_handle(vec), 00570 cl_uint(viennacl::traits::start(vec)), 00571 cl_uint(viennacl::traits::stride(vec)), 00572 cl_uint(viennacl::traits::size(vec)), 00573 00574 viennacl::traits::opencl_handle(result), 00575 cl_uint(viennacl::traits::start(result)), 00576 cl_uint(viennacl::traits::stride(result)), 00577 cl_uint(viennacl::traits::size(result)), 00578 00579 viennacl::ocl::local_mem(sizeof(value_type) * k.local_work_size()) 00580 ) ); 00581 } 00582 00583 00584 // trans(A) * x 00585 00594 template <typename NumericT, typename F> 00595 void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F>, const matrix_base<NumericT, F>, op_trans> & mat_trans, 00596 const vector_base<NumericT> & vec, 00597 vector_base<NumericT> & result) 00598 { 00599 assert( (viennacl::traits::size1(mat_trans) == viennacl::traits::size(result)) && bool("Size check failed for transposed matrix-vector product: size1(A^T) == size(result)")); 00600 assert( (viennacl::traits::size2(mat_trans) == viennacl::traits::size(vec)) && bool("Size check failed for transposed matrix-vector product: size2(A^T) == size(x)")); //remember: mat is transposed! 
00601 00602 00603 viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(vec).context()); 00604 typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F> KernelClass; 00605 KernelClass::init(ctx); 00606 00607 00608 // Inplace matrix-vector products like x = prod(A, x) are currently illegal: Introduce a temporary like y = prod(A, x); x = y; instead 00609 assert(viennacl::traits::handle(vec) != viennacl::traits::handle(result) && bool("No direct inplace transposed matrix-vector product possible. Introduce a temporary!")); 00610 00611 viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), "trans_vec_mul"); 00612 00613 viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat_trans.lhs()), 00614 cl_uint(viennacl::traits::start1(mat_trans.lhs())), cl_uint(viennacl::traits::start2(mat_trans.lhs())), 00615 cl_uint(viennacl::traits::stride1(mat_trans.lhs())), cl_uint(viennacl::traits::stride2(mat_trans.lhs())), 00616 cl_uint(viennacl::traits::size1(mat_trans.lhs())), cl_uint(viennacl::traits::size2(mat_trans.lhs())), 00617 cl_uint(viennacl::traits::internal_size1(mat_trans.lhs())), cl_uint(viennacl::traits::internal_size2(mat_trans.lhs())), 00618 00619 viennacl::traits::opencl_handle(vec), 00620 cl_uint(viennacl::traits::start(vec)), 00621 cl_uint(viennacl::traits::stride(vec)), 00622 cl_uint(viennacl::traits::size(vec)), 00623 00624 viennacl::traits::opencl_handle(result), 00625 cl_uint(viennacl::traits::start(result)), 00626 cl_uint(viennacl::traits::stride(result)), 00627 cl_uint(viennacl::traits::size(result)), 00628 00629 viennacl::ocl::local_mem(sizeof(NumericT) * k.local_work_size()) 00630 ) ); 00631 } 00632 00633 00634 // 00636 // 00637 00638 namespace detail 00639 { 00640 // C = A * B and possibly transposed variants 00641 template <typename T1, typename T2, typename T3, typename ScalarType > 00642 void prod_slow_kernel(const T1 & A, 00643 const T2 & B, 00644 T3 & C, 00645 ScalarType alpha, 
ScalarType beta,
                      std::string kernel_name)
{
  // (continuation of prod_slow_kernel: generic fallback GEMM kernel, C = alpha * A * B + beta * C)
  typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
  typedef typename viennacl::result_of::orientation_functor<T1>::type    orientation_A;
  typedef typename viennacl::result_of::orientation_functor<T2>::type    orientation_B;
  typedef typename viennacl::result_of::orientation_functor<T3>::type    orientation_C;

  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

  // Kernel source is selected by value type and the row/column-major orientation of all three operands:
  typedef viennacl::linalg::opencl::kernels::matrix_prod<cpu_value_type, orientation_A, orientation_B, orientation_C>  KernelClass;
  KernelClass::init(ctx);

  //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
  viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);

  // 16x16 work groups; global sizes rounded up so every entry of C is covered
  // (the kernel is expected to guard against out-of-range indices — confirm in kernels/matrix_prod.hpp).
  k.global_work_size(0, viennacl::tools::align_to_multiple<unsigned int>(static_cast<unsigned int>(viennacl::traits::size1(C)), 16));
  k.global_work_size(1, viennacl::tools::align_to_multiple<unsigned int>(static_cast<unsigned int>(viennacl::traits::size2(C)), 16));
  k.local_work_size(0, 16);
  k.local_work_size(1, 16);

  cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
  cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);

  // Argument order must match the generated kernel signature exactly:
  // alpha, A (handle, start1/2, stride1/2, size1/2, internal_size1/2), B (same layout), beta, C (same layout).
  viennacl::ocl::enqueue(k(cl_alpha,
                           viennacl::traits::opencl_handle(A),
                           cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
                           cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
                           cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
                           cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),

                           viennacl::traits::opencl_handle(B),
                           cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
                           cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
                           cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
                           cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),

                           cl_beta,
                           viennacl::traits::opencl_handle(C),
                           cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
                           cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
                           cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
                           cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
                          )
                        );
}

// C = A * B, using fast kernel for NVIDIA
/** @brief Dispatches the blocked GEMM kernel (C = alpha * A * B + beta * C).
  *
  * Uses a 16x4 work group where each work item handles a 4x4 tile of C,
  * hence global sizes of size2(C)/4 columns by size1(C)/4 rows. Callers
  * (detail::prod below) only take this path when all matrix dimensions are
  * multiples of 64, so the divisions are exact.
  *
  * @param kernel_name  Name of the kernel inside the generated matrix_prod program (e.g. "prod16_AA").
  */
template <typename T1, typename T2, typename T3, typename ScalarType >
void prod_fast_kernel(const T1 & A,
                      const T2 & B,
                      T3 & C,
                      ScalarType alpha,
                      ScalarType beta,
                      std::string kernel_name)
{
  typedef typename viennacl::result_of::cpu_value_type< typename T1::value_type >::type   cpu_value_type;
  typedef typename viennacl::result_of::orientation_functor<T1>::type    orientation_A;
  typedef typename viennacl::result_of::orientation_functor<T2>::type    orientation_B;
  typedef typename viennacl::result_of::orientation_functor<T3>::type    orientation_C;

  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());

  typedef viennacl::linalg::opencl::kernels::matrix_prod<cpu_value_type, orientation_A, orientation_B, orientation_C>  KernelClass;
  KernelClass::init(ctx);

  //std::cout << "KernelClass::program_name() : " << KernelClass::program_name() << std::endl;
  viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), kernel_name);

  k.global_work_size(0, viennacl::traits::size2(C) / 4); //column blocks
  k.global_work_size(1, viennacl::traits::size1(C) / 4); //row blocks
  k.local_work_size(0, 16);  //columns
  k.local_work_size(1, 4);   //rows

  cpu_value_type cl_alpha = static_cast<cpu_value_type>(alpha);
  cpu_value_type cl_beta  = static_cast<cpu_value_type>(beta);

  // Same argument layout as prod_slow_kernel: alpha, A-descriptor, B-descriptor, beta, C-descriptor.
  viennacl::ocl::enqueue(k(cl_alpha,
                           viennacl::traits::opencl_handle(A),
                           cl_uint(viennacl::traits::start1(A)),           cl_uint(viennacl::traits::start2(A)),
                           cl_uint(viennacl::traits::stride1(A)),          cl_uint(viennacl::traits::stride2(A)),
                           cl_uint(viennacl::traits::size1(A)),            cl_uint(viennacl::traits::size2(A)),
                           cl_uint(viennacl::traits::internal_size1(A)),   cl_uint(viennacl::traits::internal_size2(A)),

                           viennacl::traits::opencl_handle(B),
                           cl_uint(viennacl::traits::start1(B)),           cl_uint(viennacl::traits::start2(B)),
                           cl_uint(viennacl::traits::stride1(B)),          cl_uint(viennacl::traits::stride2(B)),
                           cl_uint(viennacl::traits::size1(B)),            cl_uint(viennacl::traits::size2(B)),
                           cl_uint(viennacl::traits::internal_size1(B)),   cl_uint(viennacl::traits::internal_size2(B)),

                           cl_beta,
                           viennacl::traits::opencl_handle(C),
                           cl_uint(viennacl::traits::start1(C)),           cl_uint(viennacl::traits::start2(C)),
                           cl_uint(viennacl::traits::stride1(C)),          cl_uint(viennacl::traits::stride2(C)),
                           cl_uint(viennacl::traits::size1(C)),            cl_uint(viennacl::traits::size2(C)),
                           cl_uint(viennacl::traits::internal_size1(C)),   cl_uint(viennacl::traits::internal_size2(C))
                          )
                        );
}

/** @brief Heuristic dispatcher between the fast (blocked) and slow (generic) GEMM kernels.
  *
  * Small problems (any dimension < 64) go straight to the slow kernel, since
  * kernel launch overhead would dominate the blocked kernel's benefit.
  * The fast kernel is only used when every dimension of A and B is a multiple
  * of 64, which the blocked kernel requires; everything else falls back to the
  * slow kernel (TODO in the original code: cover the general case with four kernels).
  */
template <typename T1, typename T2, typename T3, typename ScalarType >
void prod(const T1 & A,
          const T2 & B,
          T3 & C,
          ScalarType alpha,
          ScalarType beta,
          std::string fast_kernel_name,
          std::string slow_kernel_name)
{
  if (   (viennacl::traits::size1(A) < 64)
      || (viennacl::traits::size2(A) < 64)
      || (viennacl::traits::size1(B) < 64)
      || (viennacl::traits::size2(B) < 64) )   //there is most likely not enough to compute, rendering kernel launch overhead considerable
  {
    prod_slow_kernel(A, B, C, alpha, beta, slow_kernel_name);
  }
  else if (   (viennacl::traits::size1(A) % 64 == 0)
           && (viennacl::traits::size2(A) % 64 == 0)
           && (viennacl::traits::size1(B) % 64 == 0)
           && (viennacl::traits::size2(B) % 64 == 0) )   // allows the use of the fast NVIDIA kernel
  {
    prod_fast_kernel(A, B, C, alpha, beta, fast_kernel_name);
    //prod_slow_kernel(A, B, C, slow_kernel_name);
  }
  else //TODO: use four kernels
  {
    prod_slow_kernel(A, B, C, alpha, beta, slow_kernel_name);
  }
}

} // namespace detail


/** @brief Carries out matrix-matrix multiplication: C = alpha * prod(A, B) + beta * C.
  *
  * Matrices with nontrivial starts/strides or unaligned internal sizes are
  * routed to the hand-written kernels via detail::prod; the plain aligned case
  * is handed to the OpenCL kernel generator instead.
  */
template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
void prod_impl(const matrix_base<NumericT, F1> & A,
               const matrix_base<NumericT, F2> & B,
                     matrix_base<NumericT, F3> & C,
               ScalarType alpha,
               ScalarType beta)
{
  assert( (viennacl::traits::size1(A) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(A, B): size1(A) != size1(C)"));
  assert( (viennacl::traits::size2(A) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(A, B): size2(A) != size1(B)"));
  assert( (viennacl::traits::size2(B) == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(A, B): size2(B) != size2(C)"));

  // "not aligned" = internal (padded) sizes are not multiples of the matrix type's alignment;
  // such operands cannot be handled by the generated kernels below.
  bool A_not_aligned = (A.internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.internal_size2()%matrix_base<NumericT, F1>::alignment>0);
  bool B_not_aligned = (B.internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.internal_size2()%matrix_base<NumericT, F2>::alignment>0);
  bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);

  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
  /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
        && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
        && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/

  if(A_not_aligned || A.start1() > 0 || A.start2() > 0 || A.stride1() > 1 || A.stride2() > 1
    ||B_not_aligned || B.start1() > 0 || B.start2() > 0 || B.stride1() > 1 || B.stride2() > 1
    ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
    detail::prod(A, B, C, alpha, beta, "prod16_AA", "prod_AA");
  else{
    // Aligned, offset-free case: build the statement C = alpha*A*B + beta*C and let the generator emit the kernel.
    typedef matrix_expression<const matrix_base<NumericT, F1>, const matrix_base<NumericT, F2>, op_mat_mat_prod> ProdType;
    viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
  }
}


/** @brief Carries out C = alpha * prod(trans(A), B) + beta * C.
  *
  * A arrives as a transpose expression; the underlying matrix is A.lhs().
  * Dispatch logic mirrors the non-transposed overload, with the "_TA" kernels.
  */
template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>,
                                                  const matrix_base<NumericT, F1>,
                                                  op_trans> & A,
               const matrix_base<NumericT, F2> & B,
                     matrix_base<NumericT, F3> & C,
               ScalarType alpha,
               ScalarType beta)
{
  //std::cout << "size2(A): " << viennacl::traits::size2(A.lhs()) << std::endl;
  //std::cout << "size1(C): " << viennacl::traits::size1(C) << std::endl;
  assert( (viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(A) != size1(C)"));
  assert( (viennacl::traits::size1(A.lhs()) == viennacl::traits::size1(B)) && bool("Size mismatch in C = prod(trans(A), B): size1(A) != size1(B)"));
  assert( (viennacl::traits::size2(B)       == viennacl::traits::size2(C)) && bool("Size mismatch in C = prod(trans(A), B): size2(B) != size2(C)"));

  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
  /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
        && (viennacl::traits::handle(C) != viennacl::traits::handle(B))
        && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/

  bool A_not_aligned = (A.lhs().internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.lhs().internal_size2()%matrix_base<NumericT, F1>::alignment>0);
  bool B_not_aligned = (B.internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.internal_size2()%matrix_base<NumericT, F2>::alignment>0);
  bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);

  if(A_not_aligned || A.lhs().start1() > 0 || A.lhs().start2() > 0 || A.lhs().stride1() > 1 || A.lhs().stride2() > 1
    ||B_not_aligned || B.start1() > 0 || B.start2() > 0 || B.stride1() > 1 || B.stride2() > 1
    ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
    detail::prod(A.lhs(), B, C, alpha, beta, "prod16_TA", "prod_TA");
  else{
    typedef const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> LhsType;
    typedef matrix_expression<LhsType, const matrix_base<NumericT, F2>, op_mat_mat_prod> ProdType;
    viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
  }
}


/** @brief Carries out C = alpha * prod(A, trans(B)) + beta * C.
  *
  * B arrives as a transpose expression; the underlying matrix is B.lhs().
  * Dispatch logic mirrors the non-transposed overload, with the "_AT" kernels.
  */
template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
void prod_impl(const matrix_base<NumericT, F1> & A,
               const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
                     matrix_base<NumericT, F3> & C,
               ScalarType alpha,
               ScalarType beta)
{
  assert( (viennacl::traits::size1(A)       == viennacl::traits::size1(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(A) != size1(C)"));
  assert( (viennacl::traits::size2(A)       == viennacl::traits::size2(B.lhs())) && bool("Size mismatch in C = prod(A, trans(B)): size2(A) != size2(B)"));
  assert( (viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C))       && bool("Size mismatch in C = prod(A, trans(B)): size1(B) != size2(C)"));

  bool A_not_aligned = (A.internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.internal_size2()%matrix_base<NumericT, F1>::alignment>0);
  bool B_not_aligned = (B.lhs().internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.lhs().internal_size2()%matrix_base<NumericT, F2>::alignment>0);
  bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);

  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
  /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A))
        && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
        && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/

  if(A_not_aligned || A.start1() > 0 || A.start2() > 0 || A.stride1() > 1 || A.stride2() > 1
    ||B_not_aligned || B.lhs().start1() > 0 || B.lhs().start2() > 0 || B.lhs().stride1() > 1 || B.lhs().stride2() > 1
    ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
    detail::prod(A, B.lhs(), C, alpha, beta, "prod16_AT", "prod_AT");
  else{
    typedef const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> RhsType;
    typedef matrix_expression<const matrix_base<NumericT, F1>, RhsType, op_mat_mat_prod> ProdType;
    viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
  }
}


/** @brief Carries out C = alpha * prod(trans(A), trans(B)) + beta * C.
  *
  * Both operands arrive as transpose expressions; dispatch uses the "_TT" kernels.
  */
template <typename NumericT, typename F1, typename F2, typename F3, typename ScalarType >
void prod_impl(const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> & A,
               const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> & B,
                     matrix_base<NumericT, F3> & C,
               ScalarType alpha,
               ScalarType beta)
{
  assert(viennacl::traits::size2(A.lhs()) == viennacl::traits::size1(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size2(A) != size1(C)"));
  assert(viennacl::traits::size1(A.lhs()) == viennacl::traits::size2(B.lhs()) && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(A) != size2(B)"));
  assert(viennacl::traits::size1(B.lhs()) == viennacl::traits::size2(C)       && bool("Size mismatch in C = prod(trans(A), trans(B)): size1(B) != size2(C)"));

  bool A_not_aligned = (A.lhs().internal_size1()%matrix_base<NumericT, F1>::alignment>0) ||(A.lhs().internal_size2()%matrix_base<NumericT, F1>::alignment>0);
  bool B_not_aligned = (B.lhs().internal_size1()%matrix_base<NumericT, F2>::alignment>0) ||(B.lhs().internal_size2()%matrix_base<NumericT, F2>::alignment>0);
  bool C_not_aligned = (C.internal_size1()%matrix_base<NumericT, F3>::alignment>0) ||(C.internal_size2()%matrix_base<NumericT, F3>::alignment>0);

  // Inplace matrix-vector products like B = prod(A, B) are currently illegal: Introduce a temporary like C = prod(A, B); B = C; instead
  /*assert(  (viennacl::traits::handle(C) != viennacl::traits::handle(A.lhs()))
        && (viennacl::traits::handle(C) != viennacl::traits::handle(B.lhs()))
        && bool("No direct inplace matrix-matrix product possible. Introduce a temporary!"));*/

  if(A_not_aligned || A.lhs().start1() > 0 || A.lhs().start2() > 0 || A.lhs().stride1() > 1 || A.lhs().stride2() > 1
    ||B_not_aligned || B.lhs().start1() > 0 || B.lhs().start2() > 0 || B.lhs().stride1() > 1 || B.lhs().stride2() > 1
    ||C_not_aligned || C.start1() > 0 || C.start2() > 0 || C.stride1() > 1 || C.stride2() > 1)
    detail::prod(A.lhs(), B.lhs(), C, alpha, beta, "prod16_TT", "prod_TT");
  else{
    typedef const viennacl::matrix_expression< const matrix_base<NumericT, F1>, const matrix_base<NumericT, F1>, op_trans> LhsType;
    typedef const viennacl::matrix_expression< const matrix_base<NumericT, F2>, const matrix_base<NumericT, F2>, op_trans> RhsType;
    typedef matrix_expression<LhsType, RhsType, op_mat_mat_prod> ProdType;
    viennacl::generator::generate_enqueue_statement(viennacl::scheduler::statement(C, viennacl::op_assign(),alpha*ProdType(A,B)+beta*C));
  }
}




//
//
//

/** @brief The scaled rank-1 update: mat1 += alpha * vec1 * vec2^T.
  *
  * @param mat1                 The matrix to be updated.
  * @param alpha                The scaling factor (CPU or GPU scalar).
  * @param len_alpha            Length of the scalar buffer (> 1 if alpha is a reduction result); packed into options below.
  * @param reciprocal_alpha     Use 1/alpha instead of alpha.
  * @param flip_sign_alpha      Use -alpha instead of alpha.
  * @param vec1                 The first vector.
  * @param vec2                 The second vector.
  */
template <typename NumericT, typename F, typename S1>
void scaled_rank_1_update(matrix_base<NumericT, F> & mat1,
                          S1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
                          const vector_base<NumericT> & vec1,
                          const vector_base<NumericT> & vec2)
{
  assert( (viennacl::traits::size1(mat1) == viennacl::traits::size(vec1)) && bool("Size mismatch in scaled_rank_1_update: size1(A) != size(v1)"));
  assert( (viennacl::traits::size2(mat1) == viennacl::traits::size(vec2)) && bool("Size mismatch in scaled_rank_1_update: size2(A) != size(v2)"));

  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(mat1).context());
  typedef viennacl::linalg::opencl::kernels::matrix<NumericT, F>  KernelClass;
  KernelClass::init(ctx);

  // Encode (len_alpha, reciprocal, flip_sign) into a single option bitfield for the kernel:
  cl_uint options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);

  // Separate kernels for host-side vs device-side alpha:
  viennacl::ocl::kernel & k = ctx.get_kernel(KernelClass::program_name(), viennacl::is_cpu_scalar<S1>::value ? "scaled_rank1_update_cpu" : "scaled_rank1_update_gpu");

  viennacl::ocl::enqueue(k(viennacl::traits::opencl_handle(mat1),
                           cl_uint(viennacl::traits::start1(mat1)),           cl_uint(viennacl::traits::start2(mat1)),
                           cl_uint(viennacl::traits::stride1(mat1)),          cl_uint(viennacl::traits::stride2(mat1)),
                           cl_uint(viennacl::traits::size1(mat1)),            cl_uint(viennacl::traits::size2(mat1)),
                           cl_uint(viennacl::traits::internal_size1(mat1)),   cl_uint(viennacl::traits::internal_size2(mat1)),

                           viennacl::traits::opencl_handle(viennacl::tools::promote_if_host_scalar<NumericT>(alpha)),
                           options_alpha,

                           viennacl::traits::opencl_handle(vec1),
                           cl_uint(viennacl::traits::start(vec1)),
                           cl_uint(viennacl::traits::stride(vec1)),
                           cl_uint(viennacl::traits::size(vec1)),

                           viennacl::traits::opencl_handle(vec2),
                           cl_uint(viennacl::traits::start(vec2)),
                           cl_uint(viennacl::traits::stride(vec2)),
                           cl_uint(viennacl::traits::size(vec2))
                          )
                        );
}

} // namespace opencl
} //namespace linalg
} //namespace viennacl


#endif