1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
37 template<
typename NumericT>
39 unsigned int A_start1,
unsigned int A_start2,
40 unsigned int A_inc1,
unsigned int A_inc2,
41 unsigned int A_size1,
unsigned int A_size2,
42 unsigned int A_internal_size1,
unsigned int A_internal_size2,
45 unsigned int options2,
47 unsigned int B_start1,
unsigned int B_start2,
48 unsigned int B_inc1,
unsigned int B_inc2,
49 unsigned int B_internal_size1,
unsigned int B_internal_size2)
51 NumericT alpha = fac2;
52 if (options2 & (1 << 0))
55 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
56 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
58 if (options2 & (1 << 1))
60 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
61 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
62 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
66 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
67 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
68 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
73 template<
typename NumericT>
75 unsigned int A_start1,
unsigned int A_start2,
76 unsigned int A_inc1,
unsigned int A_inc2,
77 unsigned int A_size1,
unsigned int A_size2,
78 unsigned int A_internal_size1,
unsigned int A_internal_size2,
80 const NumericT * fac2,
81 unsigned int options2,
83 unsigned int B_start1,
unsigned int B_start2,
84 unsigned int B_inc1,
unsigned int B_inc2,
85 unsigned int B_internal_size1,
unsigned int B_internal_size2)
87 NumericT alpha = *fac2;
88 if (options2 & (1 << 0))
91 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
92 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
94 if (options2 & (1 << 1))
96 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
97 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
98 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
102 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
103 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
104 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
114 template<
typename NumericT>
116 unsigned int A_start1,
unsigned int A_start2,
117 unsigned int A_inc1,
unsigned int A_inc2,
118 unsigned int A_size1,
unsigned int A_size2,
119 unsigned int A_internal_size1,
unsigned int A_internal_size2,
122 unsigned int options2,
124 unsigned int B_start1,
unsigned int B_start2,
125 unsigned int B_inc1,
unsigned int B_inc2,
126 unsigned int B_internal_size1,
unsigned int B_internal_size2,
129 unsigned int options3,
131 unsigned int C_start1,
unsigned int C_start2,
132 unsigned int C_inc1,
unsigned int C_inc2,
133 unsigned int C_internal_size1,
unsigned int C_internal_size2)
135 NumericT alpha = fac2;
136 if (options2 & (1 << 0))
139 NumericT beta = fac3;
140 if (options3 & (1 << 0))
143 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
144 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
146 if (options2 & (1 << 1))
148 if (options3 & (1 << 1))
150 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
151 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
152 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
153 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
154 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
158 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
159 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
160 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
161 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
162 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
167 if (options3 & (1 << 1))
169 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
170 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
171 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
172 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
173 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
177 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
178 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
179 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
180 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
181 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
188 template<
typename NumericT>
190 unsigned int A_start1,
unsigned int A_start2,
191 unsigned int A_inc1,
unsigned int A_inc2,
192 unsigned int A_size1,
unsigned int A_size2,
193 unsigned int A_internal_size1,
unsigned int A_internal_size2,
196 unsigned int options2,
198 unsigned int B_start1,
unsigned int B_start2,
199 unsigned int B_inc1,
unsigned int B_inc2,
200 unsigned int B_internal_size1,
unsigned int B_internal_size2,
202 const NumericT * fac3,
203 unsigned int options3,
205 unsigned int C_start1,
unsigned int C_start2,
206 unsigned int C_inc1,
unsigned int C_inc2,
207 unsigned int C_internal_size1,
unsigned int C_internal_size2)
209 NumericT alpha = fac2;
210 if (options2 & (1 << 0))
213 NumericT beta = *fac3;
214 if (options3 & (1 << 0))
217 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
218 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
220 if (options2 & (1 << 1))
222 if (options3 & (1 << 1))
224 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
225 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
226 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
227 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
228 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
232 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
233 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
234 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
235 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
236 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
241 if (options3 & (1 << 1))
243 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
244 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
245 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
246 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
247 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
251 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
252 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
253 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
254 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
255 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
261 template<
typename NumericT>
263 unsigned int A_start1,
unsigned int A_start2,
264 unsigned int A_inc1,
unsigned int A_inc2,
265 unsigned int A_size1,
unsigned int A_size2,
266 unsigned int A_internal_size1,
unsigned int A_internal_size2,
268 const NumericT * fac2,
269 unsigned int options2,
271 unsigned int B_start1,
unsigned int B_start2,
272 unsigned int B_inc1,
unsigned int B_inc2,
273 unsigned int B_internal_size1,
unsigned int B_internal_size2,
276 unsigned int options3,
278 unsigned int C_start1,
unsigned int C_start2,
279 unsigned int C_inc1,
unsigned int C_inc2,
280 unsigned int C_internal_size1,
unsigned int C_internal_size2)
282 NumericT alpha = *fac2;
283 if (options2 & (1 << 0))
286 NumericT beta = fac3;
287 if (options3 & (1 << 0))
290 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
291 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
293 if (options2 & (1 << 1))
295 if (options3 & (1 << 1))
297 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
298 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
299 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
300 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
301 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
305 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
306 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
307 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
308 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
309 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
314 if (options3 & (1 << 1))
316 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
317 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
318 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
319 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
320 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
324 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
325 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
326 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
327 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
328 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
335 template<
typename NumericT>
338 unsigned int A_start1,
unsigned int A_start2,
339 unsigned int A_inc1,
unsigned int A_inc2,
340 unsigned int A_size1,
unsigned int A_size2,
341 unsigned int A_internal_size1,
unsigned int A_internal_size2,
343 const NumericT * fac2,
344 unsigned int options2,
346 unsigned int B_start1,
unsigned int B_start2,
347 unsigned int B_inc1,
unsigned int B_inc2,
348 unsigned int B_internal_size1,
unsigned int B_internal_size2,
350 const NumericT * fac3,
351 unsigned int options3,
353 unsigned int C_start1,
unsigned int C_start2,
354 unsigned int C_inc1,
unsigned int C_inc2,
355 unsigned int C_internal_size1,
unsigned int C_internal_size2)
357 NumericT alpha = *fac2;
358 if (options2 & (1 << 0))
361 NumericT beta = *fac3;
362 if (options3 & (1 << 0))
365 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
366 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
368 if (options2 & (1 << 1))
370 if (options3 & (1 << 1))
372 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
373 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
374 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
375 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
376 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
380 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
381 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
382 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
383 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
384 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
389 if (options3 & (1 << 1))
391 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
392 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
393 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
394 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
395 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
399 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
400 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
401 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
402 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
403 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
414 template<
typename NumericT>
417 unsigned int A_start1,
unsigned int A_start2,
418 unsigned int A_inc1,
unsigned int A_inc2,
419 unsigned int A_size1,
unsigned int A_size2,
420 unsigned int A_internal_size1,
unsigned int A_internal_size2,
423 unsigned int options2,
425 unsigned int B_start1,
unsigned int B_start2,
426 unsigned int B_inc1,
unsigned int B_inc2,
427 unsigned int B_internal_size1,
unsigned int B_internal_size2,
430 unsigned int options3,
432 unsigned int C_start1,
unsigned int C_start2,
433 unsigned int C_inc1,
unsigned int C_inc2,
434 unsigned int C_internal_size1,
unsigned int C_internal_size2)
436 NumericT alpha = fac2;
437 if (options2 & (1 << 0))
440 NumericT beta = fac3;
441 if (options3 & (1 << 0))
444 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
445 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
447 if (options2 & (1 << 1))
449 if (options3 & (1 << 1))
451 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
452 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
453 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
454 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
455 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
459 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
460 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
461 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
462 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
463 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
468 if (options3 & (1 << 1))
470 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
471 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
472 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
473 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
474 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
478 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
479 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
480 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
481 += B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
482 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
489 template<
typename NumericT>
492 unsigned int A_start1,
unsigned int A_start2,
493 unsigned int A_inc1,
unsigned int A_inc2,
494 unsigned int A_size1,
unsigned int A_size2,
495 unsigned int A_internal_size1,
unsigned int A_internal_size2,
498 unsigned int options2,
500 unsigned int B_start1,
unsigned int B_start2,
501 unsigned int B_inc1,
unsigned int B_inc2,
502 unsigned int B_internal_size1,
unsigned int B_internal_size2,
504 const NumericT * fac3,
505 unsigned int options3,
507 unsigned int C_start1,
unsigned int C_start2,
508 unsigned int C_inc1,
unsigned int C_inc2,
509 unsigned int C_internal_size1,
unsigned int C_internal_size2)
511 NumericT alpha = fac2;
512 if (options2 & (1 << 0))
515 NumericT beta = *fac3;
516 if (options3 & (1 << 0))
519 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
520 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
522 if (options2 & (1 << 1))
524 if (options3 & (1 << 1))
526 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
527 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
528 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
529 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
530 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
534 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
535 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
536 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
537 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
538 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
543 if (options3 & (1 << 1))
545 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
546 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
547 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
548 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
549 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
553 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
554 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
555 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
556 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
557 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
563 template<
typename NumericT>
566 unsigned int A_start1,
unsigned int A_start2,
567 unsigned int A_inc1,
unsigned int A_inc2,
568 unsigned int A_size1,
unsigned int A_size2,
569 unsigned int A_internal_size1,
unsigned int A_internal_size2,
571 const NumericT * fac2,
572 unsigned int options2,
574 unsigned int B_start1,
unsigned int B_start2,
575 unsigned int B_inc1,
unsigned int B_inc2,
576 unsigned int B_internal_size1,
unsigned int B_internal_size2,
579 unsigned int options3,
581 unsigned int C_start1,
unsigned int C_start2,
582 unsigned int C_inc1,
unsigned int C_inc2,
583 unsigned int C_internal_size1,
unsigned int C_internal_size2)
585 NumericT alpha = *fac2;
586 if (options2 & (1 << 0))
589 NumericT beta = fac3;
590 if (options3 & (1 << 0))
593 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
594 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
596 if (options2 & (1 << 1))
598 if (options3 & (1 << 1))
600 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
601 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
602 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
603 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
604 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
608 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
609 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
610 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
611 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
612 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
617 if (options3 & (1 << 1))
619 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
620 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
621 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
622 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
623 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
627 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
628 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
629 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
630 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
631 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
638 template<
typename NumericT>
641 unsigned int A_start1,
unsigned int A_start2,
642 unsigned int A_inc1,
unsigned int A_inc2,
643 unsigned int A_size1,
unsigned int A_size2,
644 unsigned int A_internal_size1,
unsigned int A_internal_size2,
646 const NumericT * fac2,
647 unsigned int options2,
649 unsigned int B_start1,
unsigned int B_start2,
650 unsigned int B_inc1,
unsigned int B_inc2,
651 unsigned int B_internal_size1,
unsigned int B_internal_size2,
653 const NumericT * fac3,
654 unsigned int options3,
656 unsigned int C_start1,
unsigned int C_start2,
657 unsigned int C_inc1,
unsigned int C_inc2,
658 unsigned int C_internal_size1,
unsigned int C_internal_size2)
660 NumericT alpha = *fac2;
661 if (options2 & (1 << 0))
664 NumericT beta = *fac3;
665 if (options3 & (1 << 0))
668 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
669 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
671 if (options2 & (1 << 1))
673 if (options3 & (1 << 1))
675 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
676 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
677 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
678 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
679 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
683 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
684 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
685 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
686 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha
687 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
692 if (options3 & (1 << 1))
694 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
695 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
696 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
697 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
698 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] / beta;
702 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
703 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
704 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
705 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha
706 + C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] * beta;
717 template<
typename NumericT>
720 unsigned int A_start1,
unsigned int A_start2,
721 unsigned int A_inc1,
unsigned int A_inc2,
722 unsigned int A_size1,
unsigned int A_size2,
723 unsigned int A_internal_size1,
unsigned int A_internal_size2,
726 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
727 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
729 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
730 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
731 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = alpha;
735 template<
typename NumericT>
738 unsigned int A_start1,
unsigned int A_start2,
739 unsigned int A_inc1,
unsigned int A_inc2,
740 unsigned int A_size1,
unsigned int A_size2,
741 unsigned int A_internal_size1,
unsigned int A_internal_size2,
744 unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
746 for (
unsigned int row = gid;
row < A_size1;
row += blockDim.x * gridDim.x)
747 A[(
row * A_inc1 + A_start1) + (
row * A_inc2 + A_start2) * A_internal_size1] = alpha;
754 template<
typename NumericT>
757 unsigned int A_start1,
unsigned int A_start2,
758 unsigned int A_inc1,
unsigned int A_inc2,
759 unsigned int A_size1,
unsigned int A_size2,
760 unsigned int A_internal_size1,
unsigned int A_internal_size2,
763 unsigned int B_start1,
unsigned int B_start2,
764 unsigned int B_inc1,
unsigned int B_inc2,
765 unsigned int B_internal_size1,
unsigned int B_internal_size2,
768 unsigned int C_start1,
unsigned int C_start2,
769 unsigned int C_inc1,
unsigned int C_inc2,
770 unsigned int C_internal_size1,
unsigned int C_internal_size2,
772 unsigned int op_type)
774 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
775 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
779 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
780 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
781 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
782 = pow(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1],
783 C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1]);
785 else if (op_type == 1)
787 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
788 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
789 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
790 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
791 / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
793 else if (op_type == 0)
795 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
796 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
797 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
798 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
799 * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
803 template<
typename NumericT>
806 unsigned int A_start1,
unsigned int A_start2,
807 unsigned int A_inc1,
unsigned int A_inc2,
808 unsigned int A_size1,
unsigned int A_size2,
809 unsigned int A_internal_size1,
unsigned int A_internal_size2,
812 unsigned int B_start1,
unsigned int B_start2,
813 unsigned int B_inc1,
unsigned int B_inc2,
814 unsigned int B_internal_size1,
unsigned int B_internal_size2,
817 unsigned int C_start1,
unsigned int C_start2,
818 unsigned int C_inc1,
unsigned int C_inc2,
819 unsigned int C_internal_size1,
unsigned int C_internal_size2,
821 unsigned int op_type)
823 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
824 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
828 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
829 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
830 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
831 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
832 / C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
834 else if (op_type == 0)
836 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
837 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
838 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
839 = B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]
840 * C[(
row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1];
850 template<
typename NumericT>
853 unsigned int A_start1,
unsigned int A_start2,
854 unsigned int A_inc1,
unsigned int A_inc2,
855 unsigned int A_size1,
unsigned int A_size2,
856 unsigned int A_internal_size1,
unsigned int A_internal_size2,
859 unsigned int B_start1,
unsigned int B_start2,
860 unsigned int B_inc1,
unsigned int B_inc2,
861 unsigned int B_internal_size1,
unsigned int B_internal_size2)
863 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
864 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
866 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
867 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
868 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = abs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
873 template<
typename NumericT>
876 unsigned int A_start1,
unsigned int A_start2,
877 unsigned int A_inc1,
unsigned int A_inc2,
878 unsigned int A_size1,
unsigned int A_size2,
879 unsigned int A_internal_size1,
unsigned int A_internal_size2,
882 unsigned int B_start1,
unsigned int B_start2,
883 unsigned int B_inc1,
unsigned int B_inc2,
884 unsigned int B_internal_size1,
unsigned int B_internal_size2)
886 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
887 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
889 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
890 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
891 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = acos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
896 template<
typename NumericT>
899 unsigned int A_start1,
unsigned int A_start2,
900 unsigned int A_inc1,
unsigned int A_inc2,
901 unsigned int A_size1,
unsigned int A_size2,
902 unsigned int A_internal_size1,
unsigned int A_internal_size2,
905 unsigned int B_start1,
unsigned int B_start2,
906 unsigned int B_inc1,
unsigned int B_inc2,
907 unsigned int B_internal_size1,
unsigned int B_internal_size2)
909 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
910 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
912 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
913 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
914 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = asin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
919 template<
typename NumericT>
922 unsigned int A_start1,
unsigned int A_start2,
923 unsigned int A_inc1,
unsigned int A_inc2,
924 unsigned int A_size1,
unsigned int A_size2,
925 unsigned int A_internal_size1,
unsigned int A_internal_size2,
928 unsigned int B_start1,
unsigned int B_start2,
929 unsigned int B_inc1,
unsigned int B_inc2,
930 unsigned int B_internal_size1,
unsigned int B_internal_size2)
932 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
933 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
935 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
936 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
937 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = atan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
942 template<
typename NumericT>
945 unsigned int A_start1,
unsigned int A_start2,
946 unsigned int A_inc1,
unsigned int A_inc2,
947 unsigned int A_size1,
unsigned int A_size2,
948 unsigned int A_internal_size1,
unsigned int A_internal_size2,
951 unsigned int B_start1,
unsigned int B_start2,
952 unsigned int B_inc1,
unsigned int B_inc2,
953 unsigned int B_internal_size1,
unsigned int B_internal_size2)
955 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
956 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
958 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
959 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
960 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = ceil(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
965 template<
typename NumericT>
968 unsigned int A_start1,
unsigned int A_start2,
969 unsigned int A_inc1,
unsigned int A_inc2,
970 unsigned int A_size1,
unsigned int A_size2,
971 unsigned int A_internal_size1,
unsigned int A_internal_size2,
974 unsigned int B_start1,
unsigned int B_start2,
975 unsigned int B_inc1,
unsigned int B_inc2,
976 unsigned int B_internal_size1,
unsigned int B_internal_size2)
978 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
979 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
981 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
982 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
983 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cos(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
988 template<
typename NumericT>
991 unsigned int A_start1,
unsigned int A_start2,
992 unsigned int A_inc1,
unsigned int A_inc2,
993 unsigned int A_size1,
unsigned int A_size2,
994 unsigned int A_internal_size1,
unsigned int A_internal_size2,
997 unsigned int B_start1,
unsigned int B_start2,
998 unsigned int B_inc1,
unsigned int B_inc2,
999 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1001 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1002 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1004 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1005 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1006 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = cosh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1011 template<
typename NumericT>
1014 unsigned int A_start1,
unsigned int A_start2,
1015 unsigned int A_inc1,
unsigned int A_inc2,
1016 unsigned int A_size1,
unsigned int A_size2,
1017 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1020 unsigned int B_start1,
unsigned int B_start2,
1021 unsigned int B_inc1,
unsigned int B_inc2,
1022 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1024 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1025 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1027 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1028 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1029 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = exp(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1034 template<
typename NumericT>
1037 unsigned int A_start1,
unsigned int A_start2,
1038 unsigned int A_inc1,
unsigned int A_inc2,
1039 unsigned int A_size1,
unsigned int A_size2,
1040 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1043 unsigned int B_start1,
unsigned int B_start2,
1044 unsigned int B_inc1,
unsigned int B_inc2,
1045 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1047 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1048 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1050 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1051 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1052 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = fabs(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1057 template<
typename NumericT>
1060 unsigned int A_start1,
unsigned int A_start2,
1061 unsigned int A_inc1,
unsigned int A_inc2,
1062 unsigned int A_size1,
unsigned int A_size2,
1063 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1066 unsigned int B_start1,
unsigned int B_start2,
1067 unsigned int B_inc1,
unsigned int B_inc2,
1068 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1070 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1071 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1073 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1074 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1075 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = floor(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1080 template<
typename NumericT>
1083 unsigned int A_start1,
unsigned int A_start2,
1084 unsigned int A_inc1,
unsigned int A_inc2,
1085 unsigned int A_size1,
unsigned int A_size2,
1086 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1089 unsigned int B_start1,
unsigned int B_start2,
1090 unsigned int B_inc1,
unsigned int B_inc2,
1091 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1093 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1094 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1096 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1097 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1098 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1103 template<
typename NumericT>
1106 unsigned int A_start1,
unsigned int A_start2,
1107 unsigned int A_inc1,
unsigned int A_inc2,
1108 unsigned int A_size1,
unsigned int A_size2,
1109 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1112 unsigned int B_start1,
unsigned int B_start2,
1113 unsigned int B_inc1,
unsigned int B_inc2,
1114 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1116 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1117 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1119 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1120 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1121 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = log10(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1126 template<
typename NumericT>
1129 unsigned int A_start1,
unsigned int A_start2,
1130 unsigned int A_inc1,
unsigned int A_inc2,
1131 unsigned int A_size1,
unsigned int A_size2,
1132 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1135 unsigned int B_start1,
unsigned int B_start2,
1136 unsigned int B_inc1,
unsigned int B_inc2,
1137 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1139 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1140 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1142 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1143 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1144 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sin(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1149 template<
typename NumericT>
1152 unsigned int A_start1,
unsigned int A_start2,
1153 unsigned int A_inc1,
unsigned int A_inc2,
1154 unsigned int A_size1,
unsigned int A_size2,
1155 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1158 unsigned int B_start1,
unsigned int B_start2,
1159 unsigned int B_inc1,
unsigned int B_inc2,
1160 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1162 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1163 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1165 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1166 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1167 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sinh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1172 template<
typename NumericT>
1175 unsigned int A_start1,
unsigned int A_start2,
1176 unsigned int A_inc1,
unsigned int A_inc2,
1177 unsigned int A_size1,
unsigned int A_size2,
1178 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1181 unsigned int B_start1,
unsigned int B_start2,
1182 unsigned int B_inc1,
unsigned int B_inc2,
1183 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1185 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1186 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1188 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1189 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1190 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = sqrt(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1195 template<
typename NumericT>
1198 unsigned int A_start1,
unsigned int A_start2,
1199 unsigned int A_inc1,
unsigned int A_inc2,
1200 unsigned int A_size1,
unsigned int A_size2,
1201 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1204 unsigned int B_start1,
unsigned int B_start2,
1205 unsigned int B_inc1,
unsigned int B_inc2,
1206 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1208 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1209 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1211 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1212 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1213 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tan(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1218 template<
typename NumericT>
1221 unsigned int A_start1,
unsigned int A_start2,
1222 unsigned int A_inc1,
unsigned int A_inc2,
1223 unsigned int A_size1,
unsigned int A_size2,
1224 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1227 unsigned int B_start1,
unsigned int B_start2,
1228 unsigned int B_inc1,
unsigned int B_inc2,
1229 unsigned int B_internal_size1,
unsigned int B_internal_size2)
1231 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1232 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1234 for (
unsigned int col = col_gid; col < A_size2; col += gridDim.x)
1235 for (
unsigned int row = row_gid;
row < A_size1;
row += blockDim.x)
1236 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = tanh(B[(
row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
1245 template<
typename NumericT>
1248 unsigned int A_row_start,
1249 unsigned int A_col_start,
1250 unsigned int A_row_inc,
1251 unsigned int A_col_inc,
1252 unsigned int A_row_size,
1253 unsigned int A_col_size,
1254 unsigned int A_internal_rows,
1255 unsigned int A_internal_cols,
1257 unsigned int v_start,
1259 unsigned int v_size,
1261 unsigned int result_start,
1262 unsigned int result_inc,
1263 unsigned int result_size)
1266 for (
unsigned int row = blockIdx.x * blockDim.x + threadIdx.x;
row < A_row_size;
row += gridDim.x * blockDim.x)
1269 for (
unsigned int col = 0; col < A_col_size; ++col)
1270 dot_prod += A[(
row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col];
1271 result[
row * result_inc + result_start] =
dot_prod;
1276 template<
typename NumericT>
1279 unsigned int A_row_start,
1280 unsigned int A_col_start,
1281 unsigned int A_row_inc,
1282 unsigned int A_col_inc,
1283 unsigned int A_row_size,
1284 unsigned int A_col_size,
1285 unsigned int A_internal_rows,
1286 unsigned int A_internal_cols,
1288 unsigned int v_start,
1290 unsigned int v_size,
1292 unsigned int result_start,
1293 unsigned int result_inc,
1294 unsigned int result_size)
1296 __shared__ NumericT work[128];
1298 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1299 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1300 unsigned int lid = threadIdx.x;
1302 for (
unsigned int row = row_gid;
row < A_col_size;
row += gridDim.x)
1305 for (
unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
1306 dot_prod += A[(
row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col];
1312 work[lid] += work[lid+
stride];
1316 result[
row * result_inc + result_start] = work[0];
1333 template<
typename NumericT>
1336 unsigned int A_start1,
unsigned int A_start2,
1337 unsigned int A_inc1,
unsigned int A_inc2,
1338 unsigned int A_size1,
unsigned int A_size2,
1339 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1342 unsigned int options2,
1344 const NumericT * vec1,
1349 const NumericT * vec2,
1354 NumericT alpha = val;
1355 if (options2 & (1 << 0))
1357 if (options2 & (1 << 1))
1358 alpha = NumericT(1) / alpha;
1360 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1361 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1363 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1365 NumericT tmp = alpha * vec1[
row * inc1 +
start1];
1366 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1367 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
1373 template<
typename NumericT>
1376 unsigned int A_start1,
unsigned int A_start2,
1377 unsigned int A_inc1,
unsigned int A_inc2,
1378 unsigned int A_size1,
unsigned int A_size2,
1379 unsigned int A_internal_size1,
unsigned int A_internal_size2,
1381 const NumericT * val,
1382 unsigned int options2,
1384 const NumericT * vec1,
1389 const NumericT * vec2,
1394 NumericT alpha = *val;
1395 if (options2 & (1 << 0))
1397 if (options2 & (1 << 1))
1398 alpha = NumericT(1) / alpha;
1400 unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;
1401 unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
1403 for (
unsigned int row = row_gid;
row < A_size1;
row += gridDim.x)
1405 NumericT tmp = alpha * vec1[
row * inc1 +
start1];
1406 for (
unsigned int col = col_gid; col < A_size2; col += blockDim.x)
1407 A[(
row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2];
1412 template <
typename T>
1421 unsigned int size =
min(size1, size2);
1422 if(blockIdx.x * blockDim.x + threadIdx.x == 0)
1425 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
1427 i += gridDim.x * blockDim.x)
1429 D[i] = A[i*stride + i];
1430 S[i+1] = (i + 1 <
size2) ? A[i*stride + (i + 1)] : 0;
1434 template <
typename T>
1443 unsigned int size =
min(size1, size2);
1444 if(blockIdx.x * blockDim.x + threadIdx.x == 0)
1447 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x;
1449 i += gridDim.x * blockDim.x)
1451 D[i] = A[i*stride + i];
1452 S[i+1] = (i + 1 <
size2) ? A[i + (i + 1) *
stride] : 0;
1458 template<
typename T>
1462 unsigned int row_start,
1463 unsigned int col_start,
1467 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1468 unsigned int sz = gridDim.x * blockDim.x;
1470 for(
unsigned int i = row_start + x; i <
size; i += sz)
1472 V[i - row_start] = A[i * stride + col_start];
1476 template<
typename T>
1480 unsigned int row_start,
1481 unsigned int col_start,
1485 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1486 unsigned int sz = gridDim.x * blockDim.x;
1488 for(
unsigned int i = row_start + x; i <
size; i += sz)
1490 V[i - row_start] = A[i + col_start *
stride];
1494 template<
typename T>
1498 unsigned int row_start,
1499 unsigned int col_start,
1503 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1504 unsigned int sz = gridDim.x * blockDim.x;
1506 for(
unsigned int i = col_start + x; i <
size; i += sz)
1508 V[i - col_start] = A[row_start * stride + i];
1513 template<
typename T>
1517 unsigned int row_start,
1518 unsigned int col_start,
1522 unsigned int x = blockIdx.x * blockDim.x + threadIdx.x;
1523 unsigned int sz = gridDim.x * blockDim.x;
1525 for(
unsigned int i = col_start + x; i <
size; i += sz)
1527 V[i - col_start] = A[row_start + i *
stride];
1534 template<
typename T>
1538 unsigned int row_start,
1539 unsigned int col_start,
1546 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
1548 i += gridDim.x * blockDim.x)
1551 for(
unsigned int j = row_start; j <
size1; j++)
1552 ss = ss +(V[j] * A[j * stride + i]);
1554 for(
unsigned int j = row_start; j <
size1; j++)
1555 A[j * stride + i] = A[j * stride + i] - (2 * V[j] * ss);
1559 template<
typename T>
1563 unsigned int row_start,
1564 unsigned int col_start,
1571 for(
unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start;
1573 i += gridDim.x * blockDim.x)
1576 for(
unsigned int j = row_start; j <
size1; j++)
1577 ss = ss +(V[j] * A[j + i * stride]);
1579 for(
unsigned int j = row_start; j <
size1; j++)
1580 A[j + i * stride] = A[j + i * stride] - (2 * V[j] * ss);
1586 template<
typename T>
1590 unsigned int row_start,
1591 unsigned int col_start,
1596 __shared__ T sums[128];
1599 for(
unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
1602 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1603 ss = ss + (V[j] * A[i * stride + j]);
1604 sums[threadIdx.x] = ss;
1612 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1613 A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
1617 template<
typename T>
1621 unsigned int row_start,
1622 unsigned int col_start,
1627 __shared__ T sums[128];
1630 for(
unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
1633 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1634 ss = ss + (V[j] * A[i + j *
stride]);
1635 sums[threadIdx.x] = ss;
1643 for(
unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1644 A[i + j *
stride] = A[i + j *
stride] - (2 * V[j] * sum_Av);
1650 template<
typename T>
1653 unsigned int th_Idx,
1654 unsigned int bl_Dim)
1656 unsigned int step = bl_Dim >> 1;
1661 sums[th_Idx] += sums[th_Idx +
step];
1668 template <
typename T>
1673 unsigned int strideQ)
1675 __shared__ T sums[128];
1677 for(
unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
1680 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1681 ss = ss + (V[j] * QL[i * strideQ + j]);
1682 sums[threadIdx.x] = ss;
1690 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1691 QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
1695 template <
typename T>
1700 unsigned int strideQ)
1702 __shared__ T sums[128];
1704 for(
unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
1707 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1708 ss = ss + (V[j] * QL[i + j * strideQ]);
1709 sums[threadIdx.x] = ss;
1717 for(
unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
1718 QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
1723 template <
typename T>
1730 unsigned int start_i,
1733 unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
1734 __shared__ T cs_lcl[256];
1735 __shared__ T ss_lcl[256];
1737 T x = (j <
size) ? matr[(end_i + 1) + j *
stride] : 0;
1739 unsigned int elems_num = end_i - start_i + 1;
1740 unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
1742 for(
unsigned int block_id = 0; block_id < block_num; block_id++)
1744 unsigned int to =
min(elems_num - block_id * blockDim.x, blockDim.x);
1746 if(threadIdx.x < to)
1748 cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
1749 ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
1754 for(
unsigned int ind = 0; ind < to; ind++)
1756 unsigned int i = end_i - (ind + block_id * blockDim.x);
1757 T z = matr[i + j *
stride];
1758 T cs_val = cs_lcl[ind];
1759 T ss_val = ss_lcl[ind];
1760 matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
1761 x = -x * ss_val + z * cs_val;
1767 matr[(start_i) + j * stride] = x;
1770 template <
typename T>
1777 unsigned int start_i,
1780 unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
1781 __shared__ T cs_lcl[256];
1782 __shared__ T ss_lcl[256];
1784 T x = (j <
size) ? matr[(end_i + 1) *stride + j] : 0;
1786 unsigned int elems_num = end_i - start_i + 1;
1787 unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;
1789 for(
unsigned int block_id = 0; block_id < block_num; block_id++)
1791 unsigned int to =
min(elems_num - block_id * blockDim.x, blockDim.x);
1793 if(threadIdx.x < to)
1795 cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
1796 ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
1801 for(
unsigned int ind = 0; ind < to; ind++)
1803 unsigned int i = end_i - (ind + block_id * blockDim.x);
1804 T z = matr[i *stride + j];
1805 T cs_val = cs_lcl[ind];
1806 T ss_val = ss_lcl[ind];
1807 matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
1808 x = -x * ss_val + z * cs_val;
1814 matr[(start_i) * stride + j] = x;
1820 #define VIENNACL_SECTION_SIZE 512
1821 template <
typename T>
1824 unsigned int startX,
1826 unsigned int InputSize,
1829 unsigned int startY,
1833 unsigned int startS,
1838 int i = blockIdx.x * blockDim.x + threadIdx.x;
1840 XY[threadIdx.x] = X[i * incX + startX];
1845 int index = (threadIdx.x + 1) * 2 *
stride - 1;
1846 if(index < blockDim.x)
1847 XY[index] += XY[index -
stride];
1853 int index = (threadIdx.x + 1) * 2 *
stride - 1;
1854 if(index +
stride < blockDim.x)
1855 XY[index +
stride] += XY[index];
1858 Y[i * incY + startY] = XY[threadIdx.x];
1860 if(threadIdx.x == 0)
1867 template <
typename T>
1870 unsigned int startX,
1872 unsigned int InputSize,
1875 unsigned int startY,
1879 unsigned int startS,
1884 int i = blockIdx.x * blockDim.x + threadIdx.x;
1885 if(i < InputSize + 1 && i != 0)
1886 XY[threadIdx.x] = X[(i - 1) * incX + startX];
1893 int index = (threadIdx.x + 1) * 2 *
stride - 1;
1894 if(index < blockDim.x)
1895 XY[index] += XY[index -
stride];
1901 int index = (threadIdx.x + 1) * 2 *
stride - 1;
1902 if(index +
stride < blockDim.x)
1903 XY[index +
stride] += XY[index];
1907 Y[i * incY + startY] = XY[threadIdx.x];
1910 if(threadIdx.x == 0)
1917 template <
typename T>
1920 unsigned int startS_ref,
1921 unsigned int incS_ref,
1924 unsigned int startS,
1926 unsigned int InputSize)
1932 int i = blockIdx.x * blockDim.x + threadIdx.x;
1934 XY[threadIdx.x] = S[i * incS + startS];
1939 int index = (threadIdx.x + 1) * 2 *
stride - 1;
1940 if(index < blockDim.x)
1941 XY[index] += XY[index -
stride];
1947 int index = (threadIdx.x + 1) * 2 *
stride - 1;
1948 if(index +
stride < blockDim.x)
1949 XY[index +
stride] += XY[index];
1954 S[i * incS + startS] = XY[threadIdx.x];
1955 S_ref[i * incS_ref + startS_ref] = XY[threadIdx.x];
1959 template <
typename T>
1962 unsigned int startS_ref,
1963 unsigned int incS_ref,
1966 unsigned int startS,
1970 int i = blockIdx.x * blockDim.x + threadIdx.x;
1971 for(
int j = 1; j <= blockIdx.x; j++)
1973 S[i * incS + startS] += S_ref[(j * blockDim.x - 1) * incS_ref + startS_ref];
1978 template <
typename T>
1981 unsigned int startS,
1985 unsigned int startY,
1987 unsigned int OutputSize)
1991 unsigned int i = (blockIdx.x + 1) * blockDim.x + threadIdx.x;
1994 Y[i * incY + startY] += S[blockIdx.x * incS + startS];
1997 #undef VIENNACL_SECTION_SIZE
__global__ void matrix_col_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void house_update_A_right_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void scan_kernel_2(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS, unsigned int InputSize)
__global__ void copy_row_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
__global__ void house_update_A_left_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
result_of::size_type< T >::type start1(T const &obj)
__global__ void matrix_col_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void bidiag_pack_column_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void house_update_A_left_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
__global__ void matrix_col_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void matrix_col_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void scan_kernel_4(T *S, unsigned int startS, unsigned int incS, T *Y, unsigned int startY, unsigned int incY, unsigned int OutputSize)
__global__ void house_update_QL_row_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void scaled_rank1_update_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void matrix_col_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_m_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
__global__ void matrix_col_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void matrix_col_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
#define VIENNACL_SECTION_SIZE
__global__ void matrix_col_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void inclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
__global__ void matrix_col_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
__global__ void scan_kernel_3(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS)
__device__ void col_reduce_lcl_array(T *sums, unsigned int th_Idx, unsigned int bl_Dim)
__global__ void matrix_col_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void element_op_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void house_update_A_right_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void matrix_col_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void element_op_int_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_col_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void trans_vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void house_update_QL_column_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void exclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
__global__ void givens_next_column_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void bidiag_pack_row_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
NumericT min(std::vector< NumericT > const &v1)