ViennaCL - The Vienna Computing Library 1.5.2
#ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_
#define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_ROW_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

namespace viennacl
{
namespace linalg
{
namespace cuda
{

//
// am
//

// alpha on CPU
template <typename T>
__global__ void am_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    T fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // options2: bit 0 flips the sign of the scalar, bit 1 selects division by the scalar
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  // one block per (grid-strided) row; the threads of a block stride over the columns of that row
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
  }
  else
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
  }
}

// alpha on GPU
template <typename T>
__global__ void am_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha;
  }
  else
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha;
  }
}
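The kernels in this file share one thread-to-matrix mapping: row_gid reduces to blockIdx.x and col_gid to threadIdx.x, so each block walks over rows with stride gridDim.x while its threads stride across the columns of the current row. The following host-side sketch only illustrates how such a kernel could be launched; the wrapper name and the 128/128 launch configuration are assumptions for illustration, not the configuration used by ViennaCL itself.

// Illustrative sketch: A = alpha * B for contiguous rows x cols row-major matrices.
template <typename T>
void am_row_launch_sketch(T * d_A, const T * d_B, unsigned int rows, unsigned int cols, T alpha)
{
  am_row_kernel<<<128, 128>>>(d_A,
                              0, 0,        // A_start1, A_start2
                              1, 1,        // A_inc1, A_inc2
                              rows, cols,  // A_size1, A_size2
                              rows, cols,  // A_internal_size1, A_internal_size2
                              alpha,
                              0,           // options2: no sign flip, multiply by alpha
                              d_B,
                              0, 0, 1, 1,
                              rows, cols);
}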

//
// ambm
//

// alpha and beta on CPU
template <typename T>
__global__ void ambm_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    T fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    T fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha on CPU, beta on GPU
template <typename T>
__global__ void ambm_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    T fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const T * fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}

// alpha on GPU, beta on CPU
template <typename T>
__global__ void ambm_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    T fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha and beta on GPU
template <typename T>
__global__ void ambm_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const T * fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
            + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}
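All of the scaling kernels above interpret their options2/options3 arguments the same way: bit 0 requests a sign flip of the scalar, and bit 1 requests division by the scalar instead of multiplication. The small helper below merely illustrates that encoding; the function name make_scalar_options is a hypothetical example and not part of the ViennaCL API.

// Illustrative helper matching the checks (options & (1 << 0)) and (options & (1 << 1)) above.
inline unsigned int make_scalar_options(bool flip_sign, bool reciprocal)
{
  return (flip_sign ? 1u : 0u) | (reciprocal ? 2u : 0u);
}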

//
// ambm_m
//

// alpha and beta on CPU
template <typename T>
__global__ void ambm_m_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    T fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    T fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha on CPU, beta on GPU
template <typename T>
__global__ void ambm_m_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    T fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const T * fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}

// alpha on GPU, beta on CPU
template <typename T>
__global__ void ambm_m_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    T fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}


// alpha and beta on GPU
template <typename T>
__global__ void ambm_m_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * fac2,
    unsigned int options2,
    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const T * fac3,
    unsigned int options3,
    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  T alpha = *fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  T beta = *fac3;
  if (options3 & (1 << 0))
    beta = -beta;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (options2 & (1 << 1))
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] / alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
  else
  {
    if (options3 & (1 << 1))
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] / beta;
    }
    else
    {
      for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
        for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
          A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
            += B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2] * alpha
             + C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2] * beta;
    }
  }
}

//
// assignments
//

template <typename T>
__global__ void matrix_row_assign_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,
    T alpha)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = alpha;
}


template <typename T>
__global__ void matrix_row_diagonal_assign_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,
    T alpha)
{
  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);

  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
    A[(row * A_inc1 + A_start1) * A_internal_size2 + row * A_inc2 + A_start2] = alpha;
}

//
// binary element-wise operations
//

template <typename T>
__global__ void element_op_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2,

    unsigned int op_type) // 0: product, 1: division, 2: pow
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (op_type == 2)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2],
                C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2]);
  }
  else if (op_type == 1)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
          / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
  else if (op_type == 0)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
          * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
}

template <typename T>
__global__ void element_op_int_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2,

    const T * C,
    unsigned int C_start1, unsigned int C_start2,
    unsigned int C_inc1, unsigned int C_inc2,
    unsigned int C_internal_size1, unsigned int C_internal_size2,

    unsigned int op_type) // 0: product, 1: division, 2: pow
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  if (op_type == 1)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
          / C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
  else if (op_type == 0)
  {
    for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
      for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
        A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2]
          = B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]
          * C[(row * C_inc1 + C_start1) * C_internal_size2 + col * C_inc2 + C_start2];
  }
}
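element_op_row_kernel selects the binary operation at run time through its op_type argument (0: product, 1: division, 2: pow); the integer variant above handles only product and division, since pow is not meaningful for integer element types. The sketch below is a hedged illustration of a host-side call computing an element-wise quotient; the wrapper name and the 128/128 launch configuration are assumptions.

// Sketch only: writes the element-wise quotient B ./ C into A (contiguous rows x cols, row-major).
template <typename T>
void element_div_sketch(T * d_A, const T * d_B, const T * d_C, unsigned int rows, unsigned int cols)
{
  element_op_row_kernel<<<128, 128>>>(d_A, 0, 0, 1, 1, rows, cols, rows, cols,
                                      d_B, 0, 0, 1, 1, rows, cols,
                                      d_C, 0, 0, 1, 1, rows, cols,
                                      1);  // op_type == 1: element-wise division
}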

//
// unary element-wise operations
//

// abs
template <typename T>
__global__ void matrix_row_element_abs_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = abs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// acos
template <typename T>
__global__ void matrix_row_element_acos_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = acos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// asin
template <typename T>
__global__ void matrix_row_element_asin_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = asin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// atan
template <typename T>
__global__ void matrix_row_element_atan_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = atan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// ceil
template <typename T>
__global__ void matrix_row_element_ceil_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = ceil(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// cos
template <typename T>
__global__ void matrix_row_element_cos_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cos(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// cosh
template <typename T>
__global__ void matrix_row_element_cosh_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = cosh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// exp
template <typename T>
__global__ void matrix_row_element_exp_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = exp(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// fabs
template <typename T>
__global__ void matrix_row_element_fabs_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = fabs(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// floor
template <typename T>
__global__ void matrix_row_element_floor_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = floor(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// log
template <typename T>
__global__ void matrix_row_element_log_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// log10
template <typename T>
__global__ void matrix_row_element_log10_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = log10(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// sin
template <typename T>
__global__ void matrix_row_element_sin_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sin(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// sinh
template <typename T>
__global__ void matrix_row_element_sinh_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sinh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// sqrt
template <typename T>
__global__ void matrix_row_element_sqrt_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = sqrt(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// tan
template <typename T>
__global__ void matrix_row_element_tan_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tan(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


// tanh
template <typename T>
__global__ void matrix_row_element_tanh_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * B,
    unsigned int B_start1, unsigned int B_start2,
    unsigned int B_inc1, unsigned int B_inc2,
    unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] = tanh(B[(row * B_inc1 + B_start1) * B_internal_size2 + col * B_inc2 + B_start2]);
}


//
// matrix-vector product
//

template <typename T>
__global__ void vec_mul_row_kernel(
    const T * A,
    unsigned int A_row_start,
    unsigned int A_col_start,
    unsigned int A_row_inc,
    unsigned int A_col_inc,
    unsigned int A_row_size,
    unsigned int A_col_size,
    unsigned int A_internal_rows,
    unsigned int A_internal_cols,
    const T * v,
    unsigned int v_start,
    unsigned int v_inc,
    unsigned int v_size,
    T * result,
    unsigned int result_start,
    unsigned int result_inc,
    unsigned int result_size)
{
  __shared__ T work[128];

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int lid = threadIdx.x;

  for (unsigned int row = row_gid; row < A_row_size; row += gridDim.x)
  {
    T dot_prod = 0;
    for (unsigned int col = col_gid; col < A_col_size; col += blockDim.x)
      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col];
    work[lid] = dot_prod;

    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid + stride];
    }

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];
  }
}
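vec_mul_row_kernel computes one dot product per matrix row: each thread of a block accumulates a partial sum over a strided subset of the columns, the partials are combined in a shared-memory tree reduction, and thread 0 writes the row result. Because the shared buffer work[128] holds one entry per thread and the reduction halves the stride each step, launches of this kernel need a power-of-two block size of at most 128. The sketch below is only an illustration under those assumptions; the wrapper name and grid size are not ViennaCL's own launch configuration.

// Sketch only: y = A * x for a contiguous rows x cols row-major matrix A.
template <typename T>
void vec_mul_row_sketch(const T * d_A, const T * d_x, T * d_y, unsigned int rows, unsigned int cols)
{
  vec_mul_row_kernel<<<128, 128>>>(d_A,
                                   0, 0, 1, 1,       // row/col start and increment
                                   rows, cols,       // logical size
                                   rows, cols,       // internal (padded) size
                                   d_x, 0, 1, cols,  // vector x
                                   d_y, 0, 1, rows); // result y
}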

template <typename T>
__global__ void trans_vec_mul_row_kernel(
    const T * A,
    unsigned int A_row_start,
    unsigned int A_col_start,
    unsigned int A_row_inc,
    unsigned int A_col_inc,
    unsigned int A_row_size,
    unsigned int A_col_size,
    unsigned int A_internal_rows,
    unsigned int A_internal_cols,
    const T * v,
    unsigned int v_start,
    unsigned int v_inc,
    unsigned int v_size,
    T * result,
    unsigned int result_start,
    unsigned int result_inc,
    unsigned int result_size)
{
  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_col_size; row += gridDim.x * blockDim.x)
  {
    T dot_prod = 0;
    for (unsigned int col = 0; col < A_row_size; ++col)
      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col];
    result[row * result_inc + result_start] = dot_prod;
  }
}


//
// matrix-matrix products
//


//
// scaled rank-1-update
//

// alpha on CPU
template <typename T>
__global__ void scaled_rank1_update_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    T val,
    unsigned int options2,

    const T * vec1,
    unsigned int start1,
    unsigned int inc1,
    unsigned int size1,

    const T * vec2,
    unsigned int start2,
    unsigned int inc2,
    unsigned int size2)
{
  T alpha = val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = ((T)(1)) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    T tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}


// alpha on GPU
template <typename T>
__global__ void scaled_rank1_update_row_kernel(
    T * A,
    unsigned int A_start1, unsigned int A_start2,
    unsigned int A_inc1, unsigned int A_inc2,
    unsigned int A_size1, unsigned int A_size2,
    unsigned int A_internal_size1, unsigned int A_internal_size2,

    const T * val,
    unsigned int options2,

    const T * vec1,
    unsigned int start1,
    unsigned int inc1,
    unsigned int size1,

    const T * vec2,
    unsigned int start2,
    unsigned int inc2,
    unsigned int size2)
{
  T alpha = *val;
  if (options2 & (1 << 0))
    alpha = -alpha;
  if (options2 & (1 << 1))
    alpha = ((T)(1)) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;

  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    T tmp = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2];
  }
}


} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif
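scaled_rank1_update_row_kernel adds alpha * vec1 * vec2^T to A, with options2 again encoding a sign flip (bit 0) and use of the reciprocal scalar (bit 1). A final hedged usage sketch, with a hypothetical wrapper name and an illustrative launch configuration:

// Sketch only: A += alpha * x * y^T for a contiguous rows x cols row-major matrix A.
template <typename T>
void rank1_update_sketch(T * d_A, const T * d_x, const T * d_y,
                         unsigned int rows, unsigned int cols, T alpha)
{
  viennacl::linalg::cuda::scaled_rank1_update_row_kernel<<<128, 128>>>(
      d_A, 0, 0, 1, 1, rows, cols, rows, cols,
      alpha, 0,         // no sign flip, no reciprocal
      d_x, 0, 1, rows,  // vec1
      d_y, 0, 1, cols); // vec2
}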