ViennaCL - The Vienna Computing Library  1.6.2
Free open-source GPU-accelerated linear algebra and solver library.
vector_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_VECTOR_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include <cmath>
26 #include "viennacl/forwards.h"
27 #include "viennacl/scalar.hpp"
28 #include "viennacl/tools/tools.hpp"
29 #include "viennacl/linalg/cuda/common.hpp"
30 
31 #include "viennacl/traits/size.hpp"
32 #include "viennacl/traits/start.hpp"
33 #include "viennacl/traits/stride.hpp"
34 
35 namespace viennacl
36 {
37 namespace linalg
38 {
39 namespace cuda
40 {
41 
42 //
43 // Introductory note: By convention, all dimensions are already checked in the dispatcher frontend, so there is no need to check them again here.
44 //
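For orientation, a minimal usage sketch (not part of this header; it assumes the standard viennacl/vector.hpp frontend and a build with nvcc and -DVIENNACL_WITH_CUDA): the dimension checks mentioned above happen in the viennacl::linalg frontend before control reaches the cuda:: backend functions defined below.

#include "viennacl/vector.hpp"   // frontend; dispatches to viennacl::linalg::cuda when CUDA is active

void frontend_dispatch_sketch()
{
  viennacl::vector<float> x(1000), y(1000), z(1000);

  x  = 2.0f * y;                // dispatched to av() below
  x  = 2.0f * y + 3.0f * z;     // dispatched to avbv() below
  x += 2.0f * y + 3.0f * z;     // dispatched to avbv_v() below
}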
45 
46 
48 
49 // gpu scalar
50 template<typename NumericT>
51 __global__ void av_kernel(NumericT * vec1,
52  unsigned int start1,
53  unsigned int inc1,
54  unsigned int size1,
55 
56  const NumericT * fac2,
57  unsigned int options2,
58  const NumericT * vec2,
59  unsigned int start2,
60  unsigned int inc2)
61 {
62  NumericT alpha = *fac2;
63  if (options2 & (1 << 0))
64  alpha = -alpha;
65 
66  if (options2 & (1 << 1))
67  {
68  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
69  i < size1;
70  i += gridDim.x * blockDim.x)
71  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
72  }
73  else
74  {
75  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
76  i < size1;
77  i += gridDim.x * blockDim.x)
78  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
79  }
80 }
81 
82 // cpu scalar
83 template<typename NumericT>
84 __global__ void av_kernel(NumericT * vec1,
85  unsigned int start1,
86  unsigned int inc1,
87  unsigned int size1,
88 
89  NumericT fac2,
90  unsigned int options2,
91  const NumericT * vec2,
92  unsigned int start2,
93  unsigned int inc2)
94 {
95  NumericT alpha = fac2;
96  if (options2 & (1 << 0))
97  alpha = -alpha;
98 
99  if (options2 & (1 << 1))
100  {
101  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
102  i < size1;
103  i += gridDim.x * blockDim.x)
104  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha;
105  }
106  else
107  {
108  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
109  i < size1;
110  i += gridDim.x * blockDim.x)
111  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha;
112  }
113 }
114 
115 
116 
117 template<typename NumericT, typename ScalarType1>
118 void av(vector_base<NumericT> & vec1,
119         vector_base<NumericT> const & vec2, ScalarType1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
120 {
121  typedef NumericT value_type;
122 
123  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
124 
125  value_type data_alpha = alpha;
126  if (flip_sign_alpha)
127  data_alpha = -data_alpha;
128  if (reciprocal_alpha)
129  data_alpha = static_cast<value_type>(1) / data_alpha;
130 
131  value_type temporary_alpha = 0;
132  if (viennacl::is_cpu_scalar<ScalarType1>::value)
133    temporary_alpha = alpha;
134 
135  av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
136  static_cast<unsigned int>(viennacl::traits::start(vec1)),
137  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
138  static_cast<unsigned int>(viennacl::traits::size(vec1)),
139 
140  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
141  options_alpha,
142  detail::cuda_arg<value_type>(vec2),
143  static_cast<unsigned int>(viennacl::traits::start(vec2)),
144  static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
145  VIENNACL_CUDA_LAST_ERROR_CHECK("av_kernel");
146 }
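The options bits consumed by av_kernel are produced by detail::make_options() above: bit 0 requests sign negation and bit 1 selects the division (reciprocal) branch, as the kernel code shows. A hedged sketch of an equivalent direct backend call, mirroring what the frontend does for x = y / alpha (the vector names are placeholders, not part of this file):

void av_direct_call_sketch(viennacl::vector<float> & x, viennacl::vector<float> const & y)
{
  float alpha = 2.0f;
  // len_alpha = 1 (plain scalar), reciprocal_alpha = true -> kernel divides by alpha,
  // flip_sign_alpha = false -> no negation.
  viennacl::linalg::cuda::av(x, y, alpha, 1, /*reciprocal_alpha=*/true, /*flip_sign_alpha=*/false);
}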
147 
148 
150 
151 // alpha and beta on GPU
152 template<typename NumericT>
153 __global__ void avbv_kernel(NumericT * vec1,
154  unsigned int start1,
155  unsigned int inc1,
156  unsigned int size1,
157 
158  const NumericT * fac2,
159  unsigned int options2,
160  const NumericT * vec2,
161  unsigned int start2,
162  unsigned int inc2,
163 
164  const NumericT * fac3,
165  unsigned int options3,
166  const NumericT * vec3,
167  unsigned int start3,
168  unsigned int inc3)
169 {
170  NumericT alpha = *fac2;
171  if (options2 & (1 << 0))
172  alpha = -alpha;
173 
174  NumericT beta = *fac3;
175  if (options3 & (1 << 0))
176  beta = -beta;
177 
178  if (options2 & (1 << 1))
179  {
180  if (options3 & (1 << 1))
181  {
182  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
183  i < size1;
184  i += gridDim.x * blockDim.x)
185  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
186  }
187  else
188  {
189  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
190  i < size1;
191  i += gridDim.x * blockDim.x)
192  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
193  }
194  }
195  else
196  {
197  if (options3 & (1 << 1))
198  {
199  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
200  i < size1;
201  i += gridDim.x * blockDim.x)
202  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
203  }
204  else
205  {
206  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
207  i < size1;
208  i += gridDim.x * blockDim.x)
209  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
210  }
211  }
212 }
213 
214 // alpha on CPU, beta on GPU
215 template<typename NumericT>
216 __global__ void avbv_kernel(NumericT * vec1,
217  unsigned int start1,
218  unsigned int inc1,
219  unsigned int size1,
220 
221  NumericT fac2,
222  unsigned int options2,
223  const NumericT * vec2,
224  unsigned int start2,
225  unsigned int inc2,
226 
227  const NumericT * fac3,
228  unsigned int options3,
229  const NumericT * vec3,
230  unsigned int start3,
231  unsigned int inc3)
232 {
233  NumericT alpha = fac2;
234  if (options2 & (1 << 0))
235  alpha = -alpha;
236 
237  NumericT beta = *fac3;
238  if (options3 & (1 << 0))
239  beta = -beta;
240 
241  if (options2 & (1 << 1))
242  {
243  if (options3 & (1 << 1))
244  {
245  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
246  i < size1;
247  i += gridDim.x * blockDim.x)
248  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
249  }
250  else
251  {
252  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
253  i < size1;
254  i += gridDim.x * blockDim.x)
255  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
256  }
257  }
258  else
259  {
260  if (options3 & (1 << 1))
261  {
262  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
263  i < size1;
264  i += gridDim.x * blockDim.x)
265  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
266  }
267  else
268  {
269  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
270  i < size1;
271  i += gridDim.x * blockDim.x)
272  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
273  }
274  }
275 }
276 
277 // alpha on GPU, beta on CPU
278 template<typename NumericT>
279 __global__ void avbv_kernel(NumericT * vec1,
280  unsigned int start1,
281  unsigned int inc1,
282  unsigned int size1,
283 
284  const NumericT * fac2,
285  unsigned int options2,
286  const NumericT * vec2,
287  unsigned int start2,
288  unsigned int inc2,
289 
290  NumericT fac3,
291  unsigned int options3,
292  const NumericT * vec3,
293  unsigned int start3,
294  unsigned int inc3)
295 {
296  NumericT alpha = *fac2;
297  if (options2 & (1 << 0))
298  alpha = -alpha;
299 
300  NumericT beta = fac3;
301  if (options3 & (1 << 0))
302  beta = -beta;
303 
304  if (options2 & (1 << 1))
305  {
306  if (options3 & (1 << 1))
307  {
308  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
309  i < size1;
310  i += gridDim.x * blockDim.x)
311  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
312  }
313  else
314  {
315  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
316  i < size1;
317  i += gridDim.x * blockDim.x)
318  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
319  }
320  }
321  else
322  {
323  if (options3 & (1 << 1))
324  {
325  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
326  i < size1;
327  i += gridDim.x * blockDim.x)
328  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
329  }
330  else
331  {
332  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
333  i < size1;
334  i += gridDim.x * blockDim.x)
335  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
336  }
337  }
338 }
339 
340 // alpha and beta on CPU
341 template<typename NumericT>
342 __global__ void avbv_kernel(NumericT * vec1,
343  unsigned int start1,
344  unsigned int inc1,
345  unsigned int size1,
346 
347  NumericT fac2,
348  unsigned int options2,
349  const NumericT * vec2,
350  unsigned int start2,
351  unsigned int inc2,
352 
353  NumericT fac3,
354  unsigned int options3,
355  const NumericT * vec3,
356  unsigned int start3,
357  unsigned int inc3)
358 {
359  NumericT alpha = fac2;
360  if (options2 & (1 << 0))
361  alpha = -alpha;
362 
363  NumericT beta = fac3;
364  if (options3 & (1 << 0))
365  beta = -beta;
366 
367  if (options2 & (1 << 1))
368  {
369  if (options3 & (1 << 1))
370  {
371  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
372  i < size1;
373  i += gridDim.x * blockDim.x)
374  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
375  }
376  else
377  {
378  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
379  i < size1;
380  i += gridDim.x * blockDim.x)
381  vec1[i*inc1+start1] = vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
382  }
383  }
384  else
385  {
386  if (options3 & (1 << 1))
387  {
388  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
389  i < size1;
390  i += gridDim.x * blockDim.x)
391  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
392  }
393  else
394  {
395  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
396  i < size1;
397  i += gridDim.x * blockDim.x)
398  vec1[i*inc1+start1] = vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
399  }
400  }
401 }
402 
403 
404 
405 
406 template<typename NumericT, typename ScalarT1, typename ScalarT2>
407 void avbv(vector_base<NumericT> & vec1,
408           vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
409  vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
410 {
411  typedef NumericT value_type;
412 
413  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
414 
415  value_type data_alpha = alpha;
416  if (flip_sign_alpha)
417  data_alpha = -data_alpha;
418  if (reciprocal_alpha)
419  data_alpha = static_cast<value_type>(1) / data_alpha;
420 
421  value_type temporary_alpha = 0;
422  if (viennacl::is_cpu_scalar<ScalarT1>::value)
423    temporary_alpha = alpha;
424 
425  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
426 
427  value_type temporary_beta = 0;
428  if (viennacl::is_cpu_scalar<ScalarT2>::value)
429    temporary_beta = beta;
430 
431 
432  avbv_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
433  static_cast<unsigned int>(viennacl::traits::start(vec1)),
434  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
435  static_cast<unsigned int>(viennacl::traits::size(vec1)),
436 
437  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
438  options_alpha,
439  detail::cuda_arg<value_type>(vec2),
440  static_cast<unsigned int>(viennacl::traits::start(vec2)),
441  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
442 
443  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
444  options_beta,
445  detail::cuda_arg<value_type>(vec3),
446  static_cast<unsigned int>(viennacl::traits::start(vec3)),
447  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
448  VIENNACL_CUDA_LAST_ERROR_CHECK("avbv_kernel");
449 }
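The four avbv_kernel overloads exist because alpha and beta may each live on the host or on the device; detail::arg_reference() passes either a device pointer or the host copy accordingly. A small sketch (standard ViennaCL API assumed) that exercises the mixed case, "alpha on GPU, beta on CPU":

void avbv_mixed_scalars_sketch(viennacl::vector<float> & x,
                               viennacl::vector<float> const & y,
                               viennacl::vector<float> const & z)
{
  viennacl::scalar<float> alpha(2.0f);   // device-side scalar
  float                   beta = 3.0f;   // host-side scalar
  x = alpha * y + beta * z;              // handled by the "alpha on GPU, beta on CPU" overload above
}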
450 
451 
453 
454 
455 // alpha and beta on GPU
456 template<typename NumericT>
457 __global__ void avbv_v_kernel(NumericT * vec1,
458  unsigned int start1,
459  unsigned int inc1,
460  unsigned int size1,
461 
462  const NumericT * fac2,
463  unsigned int options2,
464  const NumericT * vec2,
465  unsigned int start2,
466  unsigned int inc2,
467 
468  const NumericT * fac3,
469  unsigned int options3,
470  const NumericT * vec3,
471  unsigned int start3,
472  unsigned int inc3)
473 {
474  NumericT alpha = *fac2;
475  if (options2 & (1 << 0))
476  alpha = -alpha;
477 
478  NumericT beta = *fac3;
479  if (options3 & (1 << 0))
480  beta = -beta;
481 
482  if (options2 & (1 << 1))
483  {
484  if (options3 & (1 << 1))
485  {
486  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
487  i < size1;
488  i += gridDim.x * blockDim.x)
489  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
490  }
491  else
492  {
493  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
494  i < size1;
495  i += gridDim.x * blockDim.x)
496  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
497  }
498  }
499  else
500  {
501  if (options3 & (1 << 1))
502  {
503  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
504  i < size1;
505  i += gridDim.x * blockDim.x)
506  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
507  }
508  else
509  {
510  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
511  i < size1;
512  i += gridDim.x * blockDim.x)
513  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
514  }
515  }
516 }
517 
518 // alpha on CPU, beta on GPU
519 template<typename NumericT>
520 __global__ void avbv_v_kernel(NumericT * vec1,
521  unsigned int start1,
522  unsigned int inc1,
523  unsigned int size1,
524 
525  NumericT fac2,
526  unsigned int options2,
527  const NumericT * vec2,
528  unsigned int start2,
529  unsigned int inc2,
530 
531  const NumericT * fac3,
532  unsigned int options3,
533  const NumericT * vec3,
534  unsigned int start3,
535  unsigned int inc3)
536 {
537  NumericT alpha = fac2;
538  if (options2 & (1 << 0))
539  alpha = -alpha;
540 
541  NumericT beta = *fac3;
542  if (options3 & (1 << 0))
543  beta = -beta;
544 
545  if (options2 & (1 << 1))
546  {
547  if (options3 & (1 << 1))
548  {
549  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
550  i < size1;
551  i += gridDim.x * blockDim.x)
552  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
553  }
554  else
555  {
556  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
557  i < size1;
558  i += gridDim.x * blockDim.x)
559  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
560  }
561  }
562  else
563  {
564  if (options3 & (1 << 1))
565  {
566  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
567  i < size1;
568  i += gridDim.x * blockDim.x)
569  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
570  }
571  else
572  {
573  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
574  i < size1;
575  i += gridDim.x * blockDim.x)
576  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
577  }
578  }
579 }
580 
581 // alpha on GPU, beta on CPU
582 template<typename NumericT>
583 __global__ void avbv_v_kernel(NumericT * vec1,
584  unsigned int start1,
585  unsigned int inc1,
586  unsigned int size1,
587 
588  const NumericT * fac2,
589  unsigned int options2,
590  const NumericT * vec2,
591  unsigned int start2,
592  unsigned int inc2,
593 
594  NumericT fac3,
595  unsigned int options3,
596  const NumericT * vec3,
597  unsigned int start3,
598  unsigned int inc3)
599 {
600  NumericT alpha = *fac2;
601  if (options2 & (1 << 0))
602  alpha = -alpha;
603 
604  NumericT beta = fac3;
605  if (options3 & (1 << 0))
606  beta = -beta;
607 
608  if (options2 & (1 << 1))
609  {
610  if (options3 & (1 << 1))
611  {
612  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
613  i < size1;
614  i += gridDim.x * blockDim.x)
615  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
616  }
617  else
618  {
619  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
620  i < size1;
621  i += gridDim.x * blockDim.x)
622  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
623  }
624  }
625  else
626  {
627  if (options3 & (1 << 1))
628  {
629  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
630  i < size1;
631  i += gridDim.x * blockDim.x)
632  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
633  }
634  else
635  {
636  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
637  i < size1;
638  i += gridDim.x * blockDim.x)
639  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
640  }
641  }
642 }
643 
644 // alpha and beta on CPU
645 template<typename NumericT>
646 __global__ void avbv_v_kernel(NumericT * vec1,
647  unsigned int start1,
648  unsigned int inc1,
649  unsigned int size1,
650 
651  NumericT fac2,
652  unsigned int options2,
653  const NumericT * vec2,
654  unsigned int start2,
655  unsigned int inc2,
656 
657  NumericT fac3,
658  unsigned int options3,
659  const NumericT * vec3,
660  unsigned int start3,
661  unsigned int inc3)
662 {
663  NumericT alpha = fac2;
664  if (options2 & (1 << 0))
665  alpha = -alpha;
666 
667  NumericT beta = fac3;
668  if (options3 & (1 << 0))
669  beta = -beta;
670 
671  if (options2 & (1 << 1))
672  {
673  if (options3 & (1 << 1))
674  {
675  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
676  i < size1;
677  i += gridDim.x * blockDim.x)
678  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] / beta;
679  }
680  else
681  {
682  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
683  i < size1;
684  i += gridDim.x * blockDim.x)
685  vec1[i*inc1+start1] += vec2[i*inc2+start2] / alpha + vec3[i*inc3+start3] * beta;
686  }
687  }
688  else
689  {
690  if (options3 & (1 << 1))
691  {
692  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
693  i < size1;
694  i += gridDim.x * blockDim.x)
695  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] / beta;
696  }
697  else
698  {
699  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
700  i < size1;
701  i += gridDim.x * blockDim.x)
702  vec1[i*inc1+start1] += vec2[i*inc2+start2] * alpha + vec3[i*inc3+start3] * beta;
703  }
704  }
705 }
706 
707 
708 template<typename NumericT, typename ScalarT1, typename ScalarT2>
709 void avbv_v(vector_base<NumericT> & vec1,
710             vector_base<NumericT> const & vec2, ScalarT1 const & alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha,
711  vector_base<NumericT> const & vec3, ScalarT2 const & beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
712 {
713  typedef NumericT value_type;
714 
715  unsigned int options_alpha = detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
716 
717  value_type data_alpha = alpha;
718  if (flip_sign_alpha)
719  data_alpha = -data_alpha;
720  if (reciprocal_alpha)
721  data_alpha = static_cast<value_type>(1) / data_alpha;
722 
723  value_type temporary_alpha = 0;
724  if (viennacl::is_cpu_scalar<ScalarT1>::value)
725    temporary_alpha = alpha;
726 
727  unsigned int options_beta = detail::make_options(len_beta, reciprocal_beta, flip_sign_beta);
728 
729  value_type temporary_beta = 0;
730  if (viennacl::is_cpu_scalar<ScalarT2>::value)
731    temporary_beta = beta;
732 
733 
734  avbv_v_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
735  static_cast<unsigned int>(viennacl::traits::start(vec1)),
736  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
737  static_cast<unsigned int>(viennacl::traits::size(vec1)),
738 
739  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
740  options_alpha,
741  detail::cuda_arg<value_type>(vec2),
742  static_cast<unsigned int>(viennacl::traits::start(vec2)),
743  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
744 
745  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)),
746  options_beta,
747  detail::cuda_arg<value_type>(vec3),
748  static_cast<unsigned int>(viennacl::traits::start(vec3)),
749  static_cast<unsigned int>(viennacl::traits::stride(vec3)) );
750 }
751 
752 
754 
755 template<typename NumericT>
756 __global__ void vector_assign_kernel(NumericT * vec1,
757  unsigned int start1,
758  unsigned int inc1,
759  unsigned int size1,
760  unsigned int internal_size1,
761 
762  NumericT alpha)
763 {
764  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
765  i < size1;
766  i += gridDim.x * blockDim.x)
767  vec1[i*inc1+start1] = (i < size1) ? alpha : 0;
768 }
769 
776 template<typename NumericT, typename ScalarT1>
777 void vector_assign(vector_base<NumericT> & vec1, ScalarT1 const & alpha, bool up_to_internal_size = false)
778 {
779  typedef NumericT value_type;
780 
781  value_type temporary_alpha = 0;
782  if (viennacl::is_cpu_scalar<ScalarT1>::value)
783    temporary_alpha = alpha;
784 
785  unsigned int size = up_to_internal_size ? static_cast<unsigned int>(vec1.internal_size()) : static_cast<unsigned int>(viennacl::traits::size(vec1));
786 
787  vector_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
788  static_cast<unsigned int>(viennacl::traits::start(vec1)),
789  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
790  size,
791  static_cast<unsigned int>(vec1.internal_size()), //Note: Do NOT use traits::internal_size() here, because vector proxies don't require padding.
792 
793  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)) );
794  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_assign_kernel");
795 }
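A sketch of frontend operations that plausibly end up in vector_assign() (assuming the usual ViennaCL entry points; clear() uses the up_to_internal_size path so that padding elements are written as well):

void vector_assign_sketch(viennacl::vector<float> & v)
{
  v.clear();                                           // assign 0 to all entries (incl. padding)
  v = viennacl::scalar_vector<float>(v.size(), 1.5f);  // assign a constant to every logical entry
}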
796 
798 
799 template<typename NumericT>
800 __global__ void vector_swap_kernel(NumericT * vec1,
801  unsigned int start1,
802  unsigned int inc1,
803  unsigned int size1,
804 
805  NumericT * vec2,
806  unsigned int start2,
807  unsigned int inc2)
808 {
809  NumericT tmp;
810  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
811  i < size1;
812  i += gridDim.x * blockDim.x)
813  {
814  tmp = vec2[i*inc2+start2];
815  vec2[i*inc2+start2] = vec1[i*inc1+start1];
816  vec1[i*inc1+start1] = tmp;
817  }
818 }
819 
820 
826 template<typename NumericT>
827 void vector_swap(vector_base<NumericT> & vec1, vector_base<NumericT> & vec2)
828 {
829  typedef NumericT value_type;
830 
831  vector_swap_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
832  static_cast<unsigned int>(viennacl::traits::start(vec1)),
833  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
834  static_cast<unsigned int>(viennacl::traits::size(vec1)),
835 
836  detail::cuda_arg<value_type>(vec2),
837  static_cast<unsigned int>(viennacl::traits::start(vec2)),
838  static_cast<unsigned int>(viennacl::traits::stride(vec2)) );
839  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_swap_kernel");
840 }
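Usage sketch for the swap operation (assuming the viennacl::swap() frontend, which forwards to the active backend):

void swap_sketch(viennacl::vector<float> & a, viennacl::vector<float> & b)
{
  viennacl::swap(a, b);   // element-wise swap on the device, backed by vector_swap_kernel
}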
841 
843 
844 template<typename NumericT>
845 __global__ void element_op_kernel(NumericT * vec1,
846  unsigned int start1,
847  unsigned int inc1,
848  unsigned int size1,
849 
850  NumericT const * vec2,
851  unsigned int start2,
852  unsigned int inc2,
853 
854  NumericT const * vec3,
855  unsigned int start3,
856  unsigned int inc3,
857 
858  unsigned int op_type
859  )
860 {
861  if (op_type == 2)
862  {
863  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
864  i < size1;
865  i += gridDim.x * blockDim.x)
866  {
867  vec1[i*inc1+start1] = pow(vec2[i*inc2+start2], vec3[i*inc3+start3]);
868  }
869  }
870  else if (op_type == 1)
871  {
872  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
873  i < size1;
874  i += gridDim.x * blockDim.x)
875  {
876  vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
877  }
878  }
879  else if (op_type == 0)
880  {
881  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
882  i < size1;
883  i += gridDim.x * blockDim.x)
884  {
885  vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
886  }
887  }
888 }
889 
890 template<typename NumericT>
891 __global__ void element_op_int_kernel(NumericT * vec1,
892  unsigned int start1,
893  unsigned int inc1,
894  unsigned int size1,
895 
896  NumericT const * vec2,
897  unsigned int start2,
898  unsigned int inc2,
899 
900  NumericT const * vec3,
901  unsigned int start3,
902  unsigned int inc3,
903 
904  unsigned int op_type
905  )
906 {
907  if (op_type == 1)
908  {
909  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
910  i < size1;
911  i += gridDim.x * blockDim.x)
912  {
913  vec1[i*inc1+start1] = vec2[i*inc2+start2] / vec3[i*inc3+start3];
914  }
915  }
916  else if (op_type == 0)
917  {
918  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
919  i < size1;
920  i += gridDim.x * blockDim.x)
921  {
922  vec1[i*inc1+start1] = vec2[i*inc2+start2] * vec3[i*inc3+start3];
923  }
924  }
925 }
926 
932 template<typename NumericT, typename OpT>
933 void element_op(vector_base<NumericT> & vec1,
934                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_binary<OpT> > const & proxy)
935 {
936  typedef NumericT value_type;
937 
938  unsigned int op_type = 2; //0: product, 1: division, 2: power
939  if (viennacl::is_division<OpT>::value)
940    op_type = 1;
941  else if (viennacl::is_product<OpT>::value)
942    op_type = 0;
943 
944  element_op_int_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
945  static_cast<unsigned int>(viennacl::traits::start(vec1)),
946  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
947  static_cast<unsigned int>(viennacl::traits::size(vec1)),
948 
949  detail::cuda_arg<value_type>(proxy.lhs()),
950  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
951  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
952 
953  detail::cuda_arg<value_type>(proxy.rhs()),
954  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
955  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
956 
957  op_type
958  );
959  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
960 }
961 
962 template<typename OpT>
963 void element_op(vector_base<float> & vec1,
964                 vector_expression<const vector_base<float>, const vector_base<float>, op_element_binary<OpT> > const & proxy)
965 {
966  typedef float value_type;
967 
968  unsigned int op_type = 2; //0: product, 1: division, 2: power
969  if (viennacl::is_division<OpT>::value)
970    op_type = 1;
971  else if (viennacl::is_product<OpT>::value)
972    op_type = 0;
973 
974  element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
975  static_cast<unsigned int>(viennacl::traits::start(vec1)),
976  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
977  static_cast<unsigned int>(viennacl::traits::size(vec1)),
978 
979  detail::cuda_arg<value_type>(proxy.lhs()),
980  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
981  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
982 
983  detail::cuda_arg<value_type>(proxy.rhs()),
984  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
985  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
986 
987  op_type
988  );
989  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
990 }
991 
992 template<typename OpT>
993 void element_op(vector_base<double> & vec1,
994                 vector_expression<const vector_base<double>, const vector_base<double>, op_element_binary<OpT> > const & proxy)
995 {
996  typedef double value_type;
997 
998  unsigned int op_type = 2; //0: product, 1: division, 2: power
999  if (viennacl::is_division<OpT>::value)
1000    op_type = 1;
1001  else if (viennacl::is_product<OpT>::value)
1002    op_type = 0;
1003 
1004  element_op_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1005  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1006  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1007  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1008 
1009  detail::cuda_arg<value_type>(proxy.lhs()),
1010  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1011  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs())),
1012 
1013  detail::cuda_arg<value_type>(proxy.rhs()),
1014  static_cast<unsigned int>(viennacl::traits::start(proxy.rhs())),
1015  static_cast<unsigned int>(viennacl::traits::stride(proxy.rhs())),
1016 
1017  op_type
1018  );
1019  VIENNACL_CUDA_LAST_ERROR_CHECK("element_op_kernel");
1020 }
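The op_type codes used above correspond to the user-facing element-wise functions; a short sketch (assuming the viennacl::linalg::element_* frontend, with the pow variant available for float/double only):

void element_op_binary_sketch(viennacl::vector<float> & w,
                              viennacl::vector<float> const & u,
                              viennacl::vector<float> const & v)
{
  w = viennacl::linalg::element_prod(u, v); // op_type == 0
  w = viennacl::linalg::element_div(u, v);  // op_type == 1
  w = viennacl::linalg::element_pow(u, v);  // op_type == 2
}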
1021 
1023 
1024 // Note: Trying to automate things with macros or template metaprogramming failed (preprocessor with nvcc did not work as expected), so this is terribly hand-rolled code
1025 // Question (Karl Rupp): Why is CUDA code always such a hassle when trying to use it in a library context?
1026 
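These hand-rolled kernels back the unary element-wise frontend functions; a brief sketch of the user-facing calls (assuming the viennacl::linalg::element_* free functions):

void element_op_unary_sketch(viennacl::vector<float> & w, viennacl::vector<float> const & v)
{
  w = viennacl::linalg::element_exp(v);   // vec_element_exp_kernel
  w = viennacl::linalg::element_sqrt(v);  // vec_element_sqrt_kernel
  w = viennacl::linalg::element_fabs(v);  // vec_element_fabs_kernel
}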
1027 // acos
1028 template<typename NumericT>
1029 __global__ void vec_element_acos_kernel(
1030  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1031  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1032 {
1033  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1034  vec1[i*inc1+start1] = acos(vec2[i*inc2+start2]);
1035 }
1036 
1037 template<typename NumericT>
1038 void element_op(vector_base<NumericT> & vec1,
1039                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_acos> > const & proxy)
1040 {
1041  typedef NumericT value_type;
1042 
1043  vec_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1044  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1045  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1046  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1047  detail::cuda_arg<value_type>(proxy.lhs()),
1048  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1049  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1050  );
1051  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_acos_kernel");
1052 }
1053 
1054 // asin
1055 template<typename NumericT>
1056 __global__ void vec_element_asin_kernel(
1057  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1058  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1059 {
1060  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1061  vec1[i*inc1+start1] = asin(vec2[i*inc2+start2]);
1062 }
1063 
1064 template<typename NumericT>
1065 void element_op(vector_base<NumericT> & vec1,
1066                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_asin> > const & proxy)
1067 {
1068  typedef NumericT value_type;
1069 
1070  vec_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1071  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1072  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1073  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1074  detail::cuda_arg<value_type>(proxy.lhs()),
1075  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1076  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1077  );
1078  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_asin_kernel");
1079 }
1080 
1081 
1082 // atan
1083 template<typename NumericT>
1084 __global__ void vec_element_atan_kernel(
1085  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1086  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1087 {
1088  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1089  vec1[i*inc1+start1] = atan(vec2[i*inc2+start2]);
1090 }
1091 
1092 template<typename NumericT>
1093 void element_op(vector_base<NumericT> & vec1,
1094                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_atan> > const & proxy)
1095 {
1096  typedef NumericT value_type;
1097 
1098  vec_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1099  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1100  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1101  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1102  detail::cuda_arg<value_type>(proxy.lhs()),
1103  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1104  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1105  );
1106  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_atan_kernel");
1107 }
1108 
1109 
1110 // ceil
1111 template<typename NumericT>
1112 __global__ void vec_element_ceil_kernel(
1113  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1114  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1115 {
1116  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1117  vec1[i*inc1+start1] = ceil(vec2[i*inc2+start2]);
1118 }
1119 
1120 template<typename NumericT>
1121 void element_op(vector_base<NumericT> & vec1,
1122                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_ceil> > const & proxy)
1123 {
1124  typedef NumericT value_type;
1125 
1126  vec_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1127  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1128  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1129  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1130  detail::cuda_arg<value_type>(proxy.lhs()),
1131  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1132  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1133  );
1134  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_ceil_kernel");
1135 }
1136 
1137 
1138 // cos
1139 template<typename NumericT>
1140 __global__ void vec_element_cos_kernel(
1141  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1142  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1143 {
1144  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1145  vec1[i*inc1+start1] = cos(vec2[i*inc2+start2]);
1146 }
1147 
1148 template<typename NumericT>
1149 void element_op(vector_base<NumericT> & vec1,
1150                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cos> > const & proxy)
1151 {
1152  typedef NumericT value_type;
1153 
1154  vec_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1155  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1156  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1157  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1158  detail::cuda_arg<value_type>(proxy.lhs()),
1159  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1160  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1161  );
1162  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cos_kernel");
1163 }
1164 
1165 
1166 // cosh
1167 template<typename NumericT>
1168 __global__ void vec_element_cosh_kernel(
1169  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1170  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1171 {
1172  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1173  vec1[i*inc1+start1] = cosh(vec2[i*inc2+start2]);
1174 }
1175 
1176 template<typename NumericT>
1177 void element_op(vector_base<NumericT> & vec1,
1178                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_cosh> > const & proxy)
1179 {
1180  typedef NumericT value_type;
1181 
1182  vec_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1183  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1184  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1185  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1186  detail::cuda_arg<value_type>(proxy.lhs()),
1187  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1188  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1189  );
1190  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_cosh_kernel");
1191 }
1192 
1193 
1194 // exp
1195 template<typename NumericT>
1196 __global__ void vec_element_exp_kernel(
1197  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1198  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1199 {
1200  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1201  vec1[i*inc1+start1] = exp(vec2[i*inc2+start2]);
1202 }
1203 
1204 template<typename NumericT>
1205 void element_op(vector_base<NumericT> & vec1,
1206                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_exp> > const & proxy)
1207 {
1208  typedef NumericT value_type;
1209 
1210  vec_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1211  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1212  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1213  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1214  detail::cuda_arg<value_type>(proxy.lhs()),
1215  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1216  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1217  );
1218  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_exp_kernel");
1219 }
1220 
1221 
1222 // fabs
1223 template<typename NumericT>
1224 __global__ void vec_element_fabs_kernel(
1225  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1226  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1227 {
1228  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1229  vec1[i*inc1+start1] = fabs(vec2[i*inc2+start2]);
1230 }
1231 
1232 template<typename NumericT>
1233 void element_op(vector_base<NumericT> & vec1,
1234                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_fabs> > const & proxy)
1235 {
1236  typedef NumericT value_type;
1237 
1238  vec_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1239  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1240  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1241  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1242  detail::cuda_arg<value_type>(proxy.lhs()),
1243  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1244  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1245  );
1246  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_fabs_kernel");
1247 }
1248 
1249 // abs
1250 template<typename NumericT>
1251 __global__ void vec_element_abs_kernel(
1252  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1253  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1254 {
1255  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1256  vec1[i*inc1+start1] = abs(vec2[i*inc2+start2]);
1257 }
1258 
1259 template<typename NumericT>
1260 void element_op(vector_base<NumericT> & vec1,
1261                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_abs> > const & proxy)
1262 {
1263  typedef NumericT value_type;
1264 
1265  vec_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1266  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1267  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1268  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1269  detail::cuda_arg<value_type>(proxy.lhs()),
1270  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1271  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1272  );
1273  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_abs_kernel");
1274 }
1275 
1276 
1277 
1278 // floor
1279 template<typename NumericT>
1280 __global__ void vec_element_floor_kernel(
1281  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1282  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1283 {
1284  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1285  vec1[i*inc1+start1] = floor(vec2[i*inc2+start2]);
1286 }
1287 
1288 template<typename NumericT>
1289 void element_op(vector_base<NumericT> & vec1,
1290                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_floor> > const & proxy)
1291 {
1292  typedef NumericT value_type;
1293 
1294  vec_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1295  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1296  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1297  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1298  detail::cuda_arg<value_type>(proxy.lhs()),
1299  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1300  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1301  );
1302  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_floor_kernel");
1303 }
1304 
1305 
1306 // log
1307 template<typename NumericT>
1308 __global__ void vec_element_log_kernel(
1309  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1310  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1311 {
1312  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1313  vec1[i*inc1+start1] = log(vec2[i*inc2+start2]);
1314 }
1315 
1316 template<typename NumericT>
1317 void element_op(vector_base<NumericT> & vec1,
1318                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log> > const & proxy)
1319 {
1320  typedef NumericT value_type;
1321 
1322  vec_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1323  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1324  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1325  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1326  detail::cuda_arg<value_type>(proxy.lhs()),
1327  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1328  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1329  );
1330  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log_kernel");
1331 }
1332 
1333 
1334 // log10
1335 template<typename NumericT>
1336 __global__ void vec_element_log10_kernel(
1337  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1338  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1339 {
1340  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1341  vec1[i*inc1+start1] = log10(vec2[i*inc2+start2]);
1342 }
1343 
1344 template<typename NumericT>
1345 void element_op(vector_base<NumericT> & vec1,
1346                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_log10> > const & proxy)
1347 {
1348  typedef NumericT value_type;
1349 
1350  vec_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1351  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1352  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1353  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1354  detail::cuda_arg<value_type>(proxy.lhs()),
1355  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1356  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1357  );
1358  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_log10_kernel");
1359 }
1360 
1361 
1362 // sin
1363 template<typename NumericT>
1364 __global__ void vec_element_sin_kernel(
1365  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1366  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1367 {
1368  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1369  vec1[i*inc1+start1] = sin(vec2[i*inc2+start2]);
1370 }
1371 
1372 template<typename NumericT>
1373 void element_op(vector_base<NumericT> & vec1,
1374                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sin> > const & proxy)
1375 {
1376  typedef NumericT value_type;
1377 
1378  vec_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1379  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1380  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1381  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1382  detail::cuda_arg<value_type>(proxy.lhs()),
1383  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1384  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1385  );
1386  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sin_kernel");
1387 }
1388 
1389 
1390 // sinh
1391 template<typename NumericT>
1392 __global__ void vec_element_sinh_kernel(
1393  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1394  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1395 {
1396  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1397  vec1[i*inc1+start1] = sinh(vec2[i*inc2+start2]);
1398 }
1399 
1400 template<typename NumericT>
1401 void element_op(vector_base<NumericT> & vec1,
1402                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sinh> > const & proxy)
1403 {
1404  typedef NumericT value_type;
1405 
1406  vec_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1407  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1408  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1409  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1410  detail::cuda_arg<value_type>(proxy.lhs()),
1411  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1412  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1413  );
1414  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sinh_kernel");
1415 }
1416 
1417 
1418 // sqrt
1419 template<typename NumericT>
1420 __global__ void vec_element_sqrt_kernel(
1421  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1422  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1423 {
1424  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1425  vec1[i*inc1+start1] = sqrt(vec2[i*inc2+start2]);
1426 }
1427 
1428 template<typename NumericT>
1429 void element_op(vector_base<NumericT> & vec1,
1430                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_sqrt> > const & proxy)
1431 {
1432  typedef NumericT value_type;
1433 
1434  vec_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1435  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1436  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1437  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1438  detail::cuda_arg<value_type>(proxy.lhs()),
1439  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1440  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1441  );
1442  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_sqrt_kernel");
1443 }
1444 
1445 
1446 // tan
1447 template<typename NumericT>
1448 __global__ void vec_element_tan_kernel(
1449  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1450  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1451 {
1452  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1453  vec1[i*inc1+start1] = tan(vec2[i*inc2+start2]);
1454 }
1455 
1456 template<typename NumericT>
1457 void element_op(vector_base<NumericT> & vec1,
1458                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tan> > const & proxy)
1459 {
1460  typedef NumericT value_type;
1461 
1462  vec_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1463  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1464  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1465  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1466  detail::cuda_arg<value_type>(proxy.lhs()),
1467  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1468  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1469  );
1470  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tan_kernel");
1471 }
1472 
1473 
1474 // tanh
1475 template<typename NumericT>
1476 __global__ void vec_element_tanh_kernel(
1477  NumericT * vec1, unsigned int start1, unsigned int inc1, unsigned int size1,
1478  NumericT const * vec2, unsigned int start2, unsigned int inc2)
1479 {
1480  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += gridDim.x * blockDim.x)
1481  vec1[i*inc1+start1] = tanh(vec2[i*inc2+start2]);
1482 }
1483 
1484 template<typename NumericT>
1485 void element_op(vector_base<NumericT> & vec1,
1486                 vector_expression<const vector_base<NumericT>, const vector_base<NumericT>, op_element_unary<op_tanh> > const & proxy)
1487 {
1488  typedef NumericT value_type;
1489 
1490  vec_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1491  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1492  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1493  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1494  detail::cuda_arg<value_type>(proxy.lhs()),
1495  static_cast<unsigned int>(viennacl::traits::start(proxy.lhs())),
1496  static_cast<unsigned int>(viennacl::traits::stride(proxy.lhs()))
1497  );
1498  VIENNACL_CUDA_LAST_ERROR_CHECK("vec_element_tanh_kernel");
1499 }
1500 
1501 
1502 
1504 
1505 
1506 template<typename NumericT>
1507 __global__ void inner_prod_kernel(const NumericT * vec1,
1508  unsigned int start1,
1509  unsigned int inc1,
1510  unsigned int size1,
1511  const NumericT * vec2,
1512  unsigned int start2,
1513  unsigned int inc2,
1514  unsigned int size2,
1515  NumericT * group_buffer)
1516 {
1517  __shared__ NumericT tmp_buffer[128];
1518  unsigned int group_start1 = (blockIdx.x * size1) / (gridDim.x) * inc1 + start1;
1519  unsigned int group_start2 = (blockIdx.x * size2) / (gridDim.x) * inc2 + start2;
1520 
1521  unsigned int group_size1 = ((blockIdx.x + 1) * size1) / (gridDim.x)
1522  - ( blockIdx.x * size1) / (gridDim.x);
1523 
1524 
1525  NumericT tmp = 0;
1526  for (unsigned int i = threadIdx.x; i < group_size1; i += blockDim.x)
1527  tmp += vec1[i*inc1+group_start1] * vec2[i*inc2+group_start2];
1528  tmp_buffer[threadIdx.x] = tmp;
1529 
1530  // parallel reduction
1531  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1532  {
1533  __syncthreads();
1534  if (threadIdx.x < stride)
1535  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
1536  }
1537 
1538  if (threadIdx.x == 0)
1539  group_buffer[blockIdx.x] = tmp_buffer[0];
1540 
1541 }
1542 
1543 
1544 
1545 // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
1546 template<typename NumericT>
1547 __global__ void vector_sum_kernel_floats(
1548  const NumericT * vec1,
1549  unsigned int start1,
1550  unsigned int inc1,
1551  unsigned int size1,
1552  unsigned int option, //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
1553  NumericT * result)
1554 {
1555  __shared__ NumericT tmp_buffer[128];
1556  NumericT thread_sum = 0;
1557  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1558  {
1559  if (option > 0)
1560  thread_sum += vec1[i*inc1+start1];
1561  else
1562  thread_sum = fmax(thread_sum, fabs(vec1[i*inc1+start1]));
1563  }
1564 
1565  tmp_buffer[threadIdx.x] = thread_sum;
1566 
1567  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1568  {
1569  __syncthreads();
1570  if (threadIdx.x < stride)
1571  {
1572  if (option > 0)
1573  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1574  else
1575  tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x + stride]);
1576  }
1577  }
1578 
1579  if (threadIdx.x == 0)
1580  {
1581  if (option == 2)
1582  *result = sqrt(tmp_buffer[0]);
1583  else
1584  *result = tmp_buffer[0];
1585  }
1586 }
1587 
1588 template<typename NumericT>
1589 __global__ void vector_sum_kernel_integers(
1590  const NumericT * vec1,
1591  unsigned int start1,
1592  unsigned int inc1,
1593  unsigned int size1,
1594  unsigned int option, //0: use max, 1: just sum
1595  NumericT * result)
1596 {
1597  __shared__ NumericT tmp_buffer[128];
1598  NumericT thread_sum = 0;
1599  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1600  {
1601  if (option > 0)
1602  thread_sum += vec1[i*inc1+start1];
1603  else
1604  thread_sum = thread_sum > abs(vec1[i*inc1+start1]) ? thread_sum : abs(vec1[i*inc1+start1]);
1605  }
1606 
1607  tmp_buffer[threadIdx.x] = thread_sum;
1608 
1609  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1610  {
1611  __syncthreads();
1612  if (threadIdx.x < stride)
1613  {
1614  if (option > 0)
1615  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1616  else
1617  tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
1618  }
1619  }
1620 
1621  if (threadIdx.x == 0)
1622  *result = tmp_buffer[0];
1623 }
1624 
1625 template<typename NumericT>
1626 __global__ void vector_sum_kernel_unsigned_integers(
1627  const NumericT * vec1,
1628  unsigned int start1,
1629  unsigned int inc1,
1630  unsigned int size1,
1631  unsigned int option, //0: use max, 1: just sum
1632  NumericT * result)
1633 {
1634  __shared__ NumericT tmp_buffer[128];
1635  NumericT thread_sum = 0;
1636  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
1637  {
1638  if (option > 0)
1639  thread_sum += vec1[i*inc1+start1];
1640  else
1641  thread_sum = (thread_sum > vec1[i*inc1+start1]) ? thread_sum : vec1[i*inc1+start1];
1642  }
1643 
1644  tmp_buffer[threadIdx.x] = thread_sum;
1645 
1646  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
1647  {
1648  __syncthreads();
1649  if (threadIdx.x < stride)
1650  {
1651  if (option > 0)
1652  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
1653  else
1654  tmp_buffer[threadIdx.x] = tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x + stride] ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x + stride];
1655  }
1656  }
1657 
1658  if (threadIdx.x == 0)
1659  *result = tmp_buffer[0];
1660 }
1661 
1662 namespace detail
1663 {
1665  struct vector_sum_kernel_launcher_integers
1666  {
1667  template<typename NumericT, typename ScalarT>
1668  static void apply(vector_base<NumericT> const & temp,
1669  unsigned int option,
1670  ScalarT & result)
1671  {
1672  typedef NumericT value_type;
1673  vector_sum_kernel_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
1674  static_cast<unsigned int>(viennacl::traits::start(temp)),
1675  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1676  static_cast<unsigned int>(viennacl::traits::size(temp)),
1677  static_cast<unsigned int>(option),
1678  detail::cuda_arg<value_type>(result) );
1679  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1680  }
1681  };
1682 
1683  struct vector_sum_kernel_launcher_unsigned_integers
1684  {
1685  template<typename NumericT, typename ScalarT>
1686  static void apply(vector_base<NumericT> const & temp,
1687  unsigned int option,
1688  ScalarT & result)
1689  {
1690  typedef NumericT value_type;
1691  vector_sum_kernel_unsigned_integers<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
1692  static_cast<unsigned int>(viennacl::traits::start(temp)),
1693  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1694  static_cast<unsigned int>(viennacl::traits::size(temp)),
1695  static_cast<unsigned int>(option),
1696  detail::cuda_arg<value_type>(result) );
1697  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1698  }
1699  };
1700 
1701  struct vector_sum_kernel_launcher_floats
1702  {
1703  template<typename NumericT, typename ScalarT>
1704  static void apply(vector_base<NumericT> const & temp,
1705  unsigned int option,
1706  ScalarT & result)
1707  {
1708  typedef NumericT value_type;
1709  vector_sum_kernel_floats<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
1710  static_cast<unsigned int>(viennacl::traits::start(temp)),
1711  static_cast<unsigned int>(viennacl::traits::stride(temp)),
1712  static_cast<unsigned int>(viennacl::traits::size(temp)),
1713  static_cast<unsigned int>(option),
1714  detail::cuda_arg<value_type>(result) );
1715  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_sum_kernel");
1716  }
1717  };
1718 
1719  template<typename NumericT>
1720  struct vector_sum_kernel_launcher : public vector_sum_kernel_launcher_integers {};
1721 
1722  template<>
1723  struct vector_sum_kernel_launcher<unsigned char> : public vector_sum_kernel_launcher_unsigned_integers {};
1724 
1725  template<>
1726  struct vector_sum_kernel_launcher<unsigned short> : public vector_sum_kernel_launcher_unsigned_integers {};
1727 
1728  template<>
1729  struct vector_sum_kernel_launcher<unsigned int> : public vector_sum_kernel_launcher_unsigned_integers {};
1730 
1731  template<>
1732  struct vector_sum_kernel_launcher<unsigned long> : public vector_sum_kernel_launcher_unsigned_integers {};
1733 
1734  template<>
1735  struct vector_sum_kernel_launcher<float> : public vector_sum_kernel_launcher_floats {};
1736 
1737  template<>
1738  struct vector_sum_kernel_launcher<double> : public vector_sum_kernel_launcher_floats {};
1739 
1741 }
1742 
1743 
1744 //implementation of inner product:
1745 //namespace {
1752 template<typename NumericT, typename ScalarT>
1753 void inner_prod_impl(vector_base<NumericT> const & vec1,
1754  vector_base<NumericT> const & vec2,
1755  ScalarT & result)
1756 {
1757  typedef NumericT value_type;
1758 
1759  static const unsigned int work_groups = 128;
1760  static viennacl::vector<value_type> temp(work_groups);
1761 
1762  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1763  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1764  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1765  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1766  detail::cuda_arg<value_type>(vec2),
1767  static_cast<unsigned int>(viennacl::traits::start(vec2)),
1768  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
1769  static_cast<unsigned int>(viennacl::traits::size(vec2)),
1770  detail::cuda_arg<value_type>(temp)
1771  );
1772  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
1773 
1774  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
1775 }
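inner_prod_impl() performs a two-stage reduction: inner_prod_kernel writes one partial sum per block (128 partials with the <<<128, 128>>> launch above) into the temporary vector, and the single-block vector_sum_kernel_* launcher reduces those partials into the final scalar. A usage sketch (standard viennacl::linalg::inner_prod frontend assumed): assigning the result to a device scalar ends up here, while assigning to a host value goes through inner_prod_cpu() below.

#include "viennacl/linalg/inner_prod.hpp"

void inner_prod_sketch(viennacl::vector<float> const & x, viennacl::vector<float> const & y)
{
  viennacl::scalar<float> s_dev = viennacl::linalg::inner_prod(x, y); // result stays on the GPU
  float                   s_cpu = viennacl::linalg::inner_prod(x, y); // partial sums reduced on the host
}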
1776 
1777 
1784 template<typename NumericT>
1785 void inner_prod_cpu(vector_base<NumericT> const & vec1,
1786  vector_base<NumericT> const & vec2,
1787  NumericT & result)
1788 {
1789  typedef NumericT value_type;
1790 
1791  const unsigned int work_groups = 128;
1792  viennacl::vector<value_type> temp(work_groups);
1793 
1794  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
1795  static_cast<unsigned int>(viennacl::traits::start(vec1)),
1796  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
1797  static_cast<unsigned int>(viennacl::traits::size(vec1)),
1798  detail::cuda_arg<value_type>(vec2),
1799  static_cast<unsigned int>(viennacl::traits::start(vec2)),
1800  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
1801  static_cast<unsigned int>(viennacl::traits::size(vec2)),
1802  detail::cuda_arg<value_type>(temp)
1803  );
1804  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
1805 
1806  // Now copy partial results from GPU back to CPU and run reduction there:
1807  std::vector<value_type> temp_cpu(work_groups);
1808  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
1809 
1810  result = 0;
1811  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
1812  result += *it;
1813 }
1814 
1816 
1817 #define VIENNACL_MDOT_WORKGROUP_SIZE 128
1818 #define VIENNACL_MDOT_WORKGROUP_NUM 128
1819 // M = 2:
1820 template<typename NumericT>
1821 __global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1822  const NumericT *y0, unsigned int start0, unsigned int stride0,
1823  const NumericT *y1, unsigned int start1, unsigned int stride1,
1824  NumericT *group_results)
1825 {
1826  __shared__ NumericT tmp_buffer[2*VIENNACL_MDOT_WORKGROUP_SIZE];
1827  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1828  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1829  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond size of x
1830 
1831  NumericT entry_x = 0;
1832  NumericT group_sum0 = 0;
1833  NumericT group_sum1 = 0;
1834  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1835  entry_x = x[i * stridex + startx]; // load only once from global memory!
1836  group_sum0 += entry_x * y0[i * stride0 + start0];
1837  group_sum1 += entry_x * y1[i * stride1 + start1];
1838  }
1839  tmp_buffer[threadIdx.x] = group_sum0;
1840  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1841 
1842  // parallel reduction
1843  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1844  __syncthreads();
1845  if (threadIdx.x < stride) {
1846  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1847  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1848  }
1849  }
1850 
1851  // write result of group to group_results
1852  if (threadIdx.x == 0) {
1853  group_results[blockIdx.x] = tmp_buffer[0];
1854  group_results[blockIdx.x + gridDim.x] = tmp_buffer[blockDim.x];
1855  }
1856 }
1857 
1858 // M = 3:
1859 template<typename NumericT>
1860 __global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1861  const NumericT *y0, unsigned int start0, unsigned int stride0,
1862  const NumericT *y1, unsigned int start1, unsigned int stride1,
1863  const NumericT *y2, unsigned int start2, unsigned int stride2,
1864  NumericT *group_results)
1865 {
1866  __shared__ NumericT tmp_buffer[3*VIENNACL_MDOT_WORKGROUP_SIZE];
1867  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1868  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1869  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1870 
1871  NumericT entry_x = 0;
1872  NumericT group_sum0 = 0;
1873  NumericT group_sum1 = 0;
1874  NumericT group_sum2 = 0;
1875  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1876  entry_x = x[i * stridex + startx]; // load only once from global memory!
1877  group_sum0 += entry_x * y0[i * stride0 + start0];
1878  group_sum1 += entry_x * y1[i * stride1 + start1];
1879  group_sum2 += entry_x * y2[i * stride2 + start2];
1880  }
1881  tmp_buffer[threadIdx.x] = group_sum0;
1882  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1883  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1884 
1885  // parallel reduction
1886  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1887  __syncthreads();
1888  if (threadIdx.x < stride) {
1889  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1890  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1891  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1892  }
1893  }
1894 
1895  // write result of group to group_results
1896  if (threadIdx.x == 0) {
1897  group_results[blockIdx.x ] = tmp_buffer[0];
1898  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
1899  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
1900  }
1901 }
1902 
1903 // M = 4:
1904 template<typename NumericT>
1905 __global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1906  const NumericT *y0, unsigned int start0, unsigned int stride0,
1907  const NumericT *y1, unsigned int start1, unsigned int stride1,
1908  const NumericT *y2, unsigned int start2, unsigned int stride2,
1909  const NumericT *y3, unsigned int start3, unsigned int stride3,
1910  NumericT *group_results)
1911 {
1912  __shared__ NumericT tmp_buffer[4*VIENNACL_MDOT_WORKGROUP_SIZE];
1913  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1914  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1915  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1916 
1917  NumericT entry_x = 0;
1918  NumericT group_sum0 = 0;
1919  NumericT group_sum1 = 0;
1920  NumericT group_sum2 = 0;
1921  NumericT group_sum3 = 0;
1922  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1923  entry_x = x[i * stridex + startx]; // load only once from global memory!
1924  group_sum0 += entry_x * y0[i * stride0 + start0];
1925  group_sum1 += entry_x * y1[i * stride1 + start1];
1926  group_sum2 += entry_x * y2[i * stride2 + start2];
1927  group_sum3 += entry_x * y3[i * stride3 + start3];
1928  }
1929  tmp_buffer[threadIdx.x] = group_sum0;
1930  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1931  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1932  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
1933 
1934  // parallel reduction
1935  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
1936  __syncthreads();
1937  if (threadIdx.x < stride) {
1938  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
1939  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
1940  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
1941  tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
1942  }
1943  }
1944 
1945  // write result of group to group_results
1946  if (threadIdx.x == 0) {
1947  group_results[blockIdx.x ] = tmp_buffer[0];
1948  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
1949  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
1950  group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
1951  }
1952 }
1953 
1954 // M = 8:
1955 template<typename NumericT>
1956 __global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex,
1957  const NumericT *y0, unsigned int start0, unsigned int stride0,
1958  const NumericT *y1, unsigned int start1, unsigned int stride1,
1959  const NumericT *y2, unsigned int start2, unsigned int stride2,
1960  const NumericT *y3, unsigned int start3, unsigned int stride3,
1961  const NumericT *y4, unsigned int start4, unsigned int stride4,
1962  const NumericT *y5, unsigned int start5, unsigned int stride5,
1963  const NumericT *y6, unsigned int start6, unsigned int stride6,
1964  const NumericT *y7, unsigned int start7, unsigned int stride7,
1965  NumericT *group_results)
1966 {
1967  __shared__ NumericT tmp_buffer[8*VIENNACL_MDOT_WORKGROUP_SIZE];
1968  unsigned int entries_per_thread = (sizex - 1) / (blockDim.x * gridDim.x) + 1;
1969  unsigned int vec_start_index = blockIdx.x * blockDim.x * entries_per_thread;
1970  unsigned int vec_stop_index = min((blockIdx.x + 1) * blockDim.x * entries_per_thread, sizex); // don't go beyond vec size
1971 
1972  NumericT entry_x = 0;
1973  NumericT group_sum0 = 0;
1974  NumericT group_sum1 = 0;
1975  NumericT group_sum2 = 0;
1976  NumericT group_sum3 = 0;
1977  NumericT group_sum4 = 0;
1978  NumericT group_sum5 = 0;
1979  NumericT group_sum6 = 0;
1980  NumericT group_sum7 = 0;
1981  for (unsigned int i = vec_start_index + threadIdx.x; i < vec_stop_index; i += blockDim.x) {
1982  entry_x = x[i * stridex + startx]; // load only once from global memory!
1983  group_sum0 += entry_x * y0[i * stride0 + start0];
1984  group_sum1 += entry_x * y1[i * stride1 + start1];
1985  group_sum2 += entry_x * y2[i * stride2 + start2];
1986  group_sum3 += entry_x * y3[i * stride3 + start3];
1987  group_sum4 += entry_x * y4[i * stride4 + start4];
1988  group_sum5 += entry_x * y5[i * stride5 + start5];
1989  group_sum6 += entry_x * y6[i * stride6 + start6];
1990  group_sum7 += entry_x * y7[i * stride7 + start7];
1991  }
1992  tmp_buffer[threadIdx.x] = group_sum0;
1993  tmp_buffer[threadIdx.x + blockDim.x] = group_sum1;
1994  tmp_buffer[threadIdx.x + 2 * blockDim.x] = group_sum2;
1995  tmp_buffer[threadIdx.x + 3 * blockDim.x] = group_sum3;
1996  tmp_buffer[threadIdx.x + 4 * blockDim.x] = group_sum4;
1997  tmp_buffer[threadIdx.x + 5 * blockDim.x] = group_sum5;
1998  tmp_buffer[threadIdx.x + 6 * blockDim.x] = group_sum6;
1999  tmp_buffer[threadIdx.x + 7 * blockDim.x] = group_sum7;
2000 
2001  // parallel reduction
2002  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2) {
2003  __syncthreads();
2004  if (threadIdx.x < stride) {
2005  tmp_buffer[threadIdx.x ] += tmp_buffer[threadIdx.x+stride ];
2006  tmp_buffer[threadIdx.x + blockDim.x] += tmp_buffer[threadIdx.x+stride + blockDim.x];
2007  tmp_buffer[threadIdx.x + 2 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 2 * blockDim.x];
2008  tmp_buffer[threadIdx.x + 3 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 3 * blockDim.x];
2009  tmp_buffer[threadIdx.x + 4 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 4 * blockDim.x];
2010  tmp_buffer[threadIdx.x + 5 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 5 * blockDim.x];
2011  tmp_buffer[threadIdx.x + 6 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 6 * blockDim.x];
2012  tmp_buffer[threadIdx.x + 7 * blockDim.x] += tmp_buffer[threadIdx.x+stride + 7 * blockDim.x];
2013  }
2014  }
2015 
2016  // write result of group to group_results
2017  if (threadIdx.x == 0) {
2018  group_results[blockIdx.x ] = tmp_buffer[0];
2019  group_results[blockIdx.x + gridDim.x] = tmp_buffer[ blockDim.x];
2020  group_results[blockIdx.x + 2 * gridDim.x] = tmp_buffer[2 * blockDim.x];
2021  group_results[blockIdx.x + 3 * gridDim.x] = tmp_buffer[3 * blockDim.x];
2022  group_results[blockIdx.x + 4 * gridDim.x] = tmp_buffer[4 * blockDim.x];
2023  group_results[blockIdx.x + 5 * gridDim.x] = tmp_buffer[5 * blockDim.x];
2024  group_results[blockIdx.x + 6 * gridDim.x] = tmp_buffer[6 * blockDim.x];
2025  group_results[blockIdx.x + 7 * gridDim.x] = tmp_buffer[7 * blockDim.x];
2026  }
2027 }
2028 
2029 // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
2030 template<typename NumericT>
2031 __global__ void vector_multi_sum_kernel(
2032  NumericT const * vec1,
2033  NumericT * result,
2034  unsigned int start_result,
2035  unsigned int inc_result)
2036 {
2037  __shared__ NumericT tmp_buffer[VIENNACL_MDOT_WORKGROUP_SIZE];
2038 
2039  tmp_buffer[threadIdx.x] = vec1[threadIdx.x + blockIdx.x * VIENNACL_MDOT_WORKGROUP_SIZE];
2040 
2041  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2042  {
2043  __syncthreads();
2044  if (threadIdx.x < stride)
2045  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x + stride];
2046  }
2047 
2048  if (threadIdx.x == 0)
2049  result[start_result + inc_result * blockIdx.x] = tmp_buffer[0];
2050 }
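A point worth noting: vector_multi_sum_kernel is launched with one block per inner product, and the indexing relies on VIENNACL_MDOT_WORKGROUP_SIZE and VIENNACL_MDOT_WORKGROUP_NUM both being 128, so that each thread of a block picks up exactly one partial result of 'its' dot product. A host-side sketch of what the two stages compute together (the names M, NUM and temp are illustrative only, not part of the library):

  // reference semantics of stage 1 + stage 2 for M simultaneous dot products:
  // temp[m * NUM + b] holds the partial sum written by block b for right-hand side m
  for (unsigned int m = 0; m < M; ++m)
  {
    NumericT sum = 0;
    for (unsigned int b = 0; b < NUM; ++b)          // NUM == VIENNACL_MDOT_WORKGROUP_NUM
      sum += temp[m * NUM + b];
    result[start_result + m * inc_result] = sum;    // same layout as vector_multi_sum_kernel
  }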
2051 
2052 template<typename NumericT>
2053 void inner_prod_impl(vector_base<NumericT> const & x,
2054  vector_tuple<NumericT> const & vec_tuple,
2055  vector_base<NumericT> & result)
2056 {
2057  typedef NumericT value_type;
2058 
2059  viennacl::vector<value_type> temp(8 * VIENNACL_MDOT_WORKGROUP_NUM);
2060 
2061  vcl_size_t current_index = 0;
2062  while (vec_tuple.const_size() > current_index)
2063  {
2064  switch (vec_tuple.const_size() - current_index)
2065  {
2066  case 7:
2067  case 6:
2068  case 5:
2069  case 4:
2070  {
2071  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2072  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2073  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
2074  vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
2075 
2076  inner_prod_4_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2077  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2078  static_cast<unsigned int>(viennacl::traits::start(x)),
2079  static_cast<unsigned int>(viennacl::traits::stride(x)),
2080  static_cast<unsigned int>(viennacl::traits::size(x)),
2081  detail::cuda_arg<value_type>(y0),
2082  static_cast<unsigned int>(viennacl::traits::start(y0)),
2083  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2084  detail::cuda_arg<value_type>(y1),
2085  static_cast<unsigned int>(viennacl::traits::start(y1)),
2086  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2087  detail::cuda_arg<value_type>(y2),
2088  static_cast<unsigned int>(viennacl::traits::start(y2)),
2089  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2090  detail::cuda_arg<value_type>(y3),
2091  static_cast<unsigned int>(viennacl::traits::start(y3)),
2092  static_cast<unsigned int>(viennacl::traits::stride(y3)),
2093  detail::cuda_arg<value_type>(temp)
2094  );
2095  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_4_kernel");
2096  vector_multi_sum_kernel<<<4, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2097  detail::cuda_arg<value_type>(result),
2098  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2099  static_cast<unsigned int>(viennacl::traits::stride(result))
2100  );
2101  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2102  }
2103  current_index += 4;
2104  break;
2105  case 3:
2106  {
2107  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2108  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2109  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
2110 
2111  inner_prod_3_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2112  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2113  static_cast<unsigned int>(viennacl::traits::start(x)),
2114  static_cast<unsigned int>(viennacl::traits::stride(x)),
2115  static_cast<unsigned int>(viennacl::traits::size(x)),
2116  detail::cuda_arg<value_type>(y0),
2117  static_cast<unsigned int>(viennacl::traits::start(y0)),
2118  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2119  detail::cuda_arg<value_type>(y1),
2120  static_cast<unsigned int>(viennacl::traits::start(y1)),
2121  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2122  detail::cuda_arg<value_type>(y2),
2123  static_cast<unsigned int>(viennacl::traits::start(y2)),
2124  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2125  detail::cuda_arg<value_type>(temp)
2126  );
2127  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_3_kernel");
2128  vector_multi_sum_kernel<<<3, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2129  detail::cuda_arg<value_type>(result),
2130  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2131  static_cast<unsigned int>(viennacl::traits::stride(result))
2132  );
2133  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2134  }
2135  current_index += 3;
2136  break;
2137  case 2:
2138  {
2139  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2140  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2141 
2142  inner_prod_2_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2143  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2144  static_cast<unsigned int>(viennacl::traits::start(x)),
2145  static_cast<unsigned int>(viennacl::traits::stride(x)),
2146  static_cast<unsigned int>(viennacl::traits::size(x)),
2147  detail::cuda_arg<value_type>(y0),
2148  static_cast<unsigned int>(viennacl::traits::start(y0)),
2149  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2150  detail::cuda_arg<value_type>(y1),
2151  static_cast<unsigned int>(viennacl::traits::start(y1)),
2152  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2153  detail::cuda_arg<value_type>(temp)
2154  );
2155  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_2_kernel");
2156  vector_multi_sum_kernel<<<2, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2157  detail::cuda_arg<value_type>(result),
2158  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2159  static_cast<unsigned int>(viennacl::traits::stride(result))
2160  );
2161  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2162  }
2163  current_index += 2;
2164  break;
2165  case 1:
2166  {
2167  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2168  inner_prod_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(x),
2169  static_cast<unsigned int>(viennacl::traits::start(x)),
2170  static_cast<unsigned int>(viennacl::traits::stride(x)),
2171  static_cast<unsigned int>(viennacl::traits::size(x)),
2172  detail::cuda_arg<value_type>(y0),
2173  static_cast<unsigned int>(viennacl::traits::start(y0)),
2174  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2175  static_cast<unsigned int>(viennacl::traits::size(y0)),
2176  detail::cuda_arg<value_type>(temp)
2177  );
2178  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_kernel");
2179 
2180  vector_multi_sum_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(temp),
2181  detail::cuda_arg<value_type>(result),
2182  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2183  static_cast<unsigned int>(viennacl::traits::stride(result))
2184  );
2185  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2186  }
2187  current_index += 1;
2188  break;
2189 
2190  default:
2191  {
2192  vector_base<NumericT> const & y0 = vec_tuple.const_at(current_index);
2193  vector_base<NumericT> const & y1 = vec_tuple.const_at(current_index + 1);
2194  vector_base<NumericT> const & y2 = vec_tuple.const_at(current_index + 2);
2195  vector_base<NumericT> const & y3 = vec_tuple.const_at(current_index + 3);
2196  vector_base<NumericT> const & y4 = vec_tuple.const_at(current_index + 4);
2197  vector_base<NumericT> const & y5 = vec_tuple.const_at(current_index + 5);
2198  vector_base<NumericT> const & y6 = vec_tuple.const_at(current_index + 6);
2199  vector_base<NumericT> const & y7 = vec_tuple.const_at(current_index + 7);
2200 
2201  inner_prod_8_kernel<<<VIENNACL_MDOT_WORKGROUP_NUM,
2202  VIENNACL_MDOT_WORKGROUP_SIZE>>>( detail::cuda_arg<value_type>(x),
2203  static_cast<unsigned int>(viennacl::traits::start(x)),
2204  static_cast<unsigned int>(viennacl::traits::stride(x)),
2205  static_cast<unsigned int>(viennacl::traits::size(x)),
2206  detail::cuda_arg<value_type>(y0),
2207  static_cast<unsigned int>(viennacl::traits::start(y0)),
2208  static_cast<unsigned int>(viennacl::traits::stride(y0)),
2209  detail::cuda_arg<value_type>(y1),
2210  static_cast<unsigned int>(viennacl::traits::start(y1)),
2211  static_cast<unsigned int>(viennacl::traits::stride(y1)),
2212  detail::cuda_arg<value_type>(y2),
2213  static_cast<unsigned int>(viennacl::traits::start(y2)),
2214  static_cast<unsigned int>(viennacl::traits::stride(y2)),
2215  detail::cuda_arg<value_type>(y3),
2216  static_cast<unsigned int>(viennacl::traits::start(y3)),
2217  static_cast<unsigned int>(viennacl::traits::stride(y3)),
2218  detail::cuda_arg<value_type>(y4),
2219  static_cast<unsigned int>(viennacl::traits::start(y4)),
2220  static_cast<unsigned int>(viennacl::traits::stride(y4)),
2221  detail::cuda_arg<value_type>(y5),
2222  static_cast<unsigned int>(viennacl::traits::start(y5)),
2223  static_cast<unsigned int>(viennacl::traits::stride(y5)),
2224  detail::cuda_arg<value_type>(y6),
2225  static_cast<unsigned int>(viennacl::traits::start(y6)),
2226  static_cast<unsigned int>(viennacl::traits::stride(y6)),
2227  detail::cuda_arg<value_type>(y7),
2228  static_cast<unsigned int>(viennacl::traits::start(y7)),
2229  static_cast<unsigned int>(viennacl::traits::stride(y7)),
2230  detail::cuda_arg<value_type>(temp)
2231  );
2232  VIENNACL_CUDA_LAST_ERROR_CHECK("inner_prod_8_kernel");
2233  vector_multi_sum_kernel<<<8, VIENNACL_MDOT_WORKGROUP_NUM>>>(detail::cuda_arg<value_type>(temp),
2234  detail::cuda_arg<value_type>(result),
2235  static_cast<unsigned int>(viennacl::traits::start(result) + viennacl::traits::stride(result) * current_index),
2236  static_cast<unsigned int>(viennacl::traits::stride(result))
2237  );
2238  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_multi_sum_kernel");
2239  }
2240  current_index += 8;
2241  break;
2242  }
2243  }
2244 }
2245 
2246 #undef VIENNACL_MDOT_WORKGROUP_NUM
2247 #undef VIENNACL_MDOT_WORKGROUP_SIZE
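For reference, a sketch of how the tuple overload above is typically reached from user code. It assumes the viennacl::tie() helper from viennacl/vector.hpp, which packs several vectors into a vector_tuple:

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/inner_prod.hpp"

  int main()
  {
    viennacl::vector<double> x(1000), y0(1000), y1(1000), y2(1000);
    viennacl::vector<double> results(3);

    // three dot products sharing the same x, evaluated in one pass over x:
    results = viennacl::linalg::inner_prod(x, viennacl::tie(y0, y1, y2));
    return 0;
  }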
2248 
2250 
2251 template<typename NumericT>
2252 __global__ void norm_kernel_floats(
2253  const NumericT * vec,
2254  unsigned int start1,
2255  unsigned int inc1,
2256  unsigned int size1,
2257  unsigned int norm_selector,
2258  NumericT * group_buffer)
2259 {
2260  __shared__ NumericT tmp_buffer[128];
2261 
2262  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
2263  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2264  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2265  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2266  group_stop = (group_stop > size1) ? size1 : group_stop;
2267 
2268  if (norm_selector == 1) //norm_1
2269  {
2270  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2271  tmp += fabs(vec[i*inc1 + start1]);
2272  }
2273  else if (norm_selector == 2) //norm_2
2274  {
2275  NumericT vec_entry = 0;
2276  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2277  {
2278  vec_entry = vec[i*inc1 + start1];
2279  tmp += vec_entry * vec_entry;
2280  }
2281  }
2282  else if (norm_selector == 0) //norm_inf
2283  {
2284  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2285  tmp = fmax(fabs(vec[i*inc1 + start1]), tmp);
2286  }
2287  else if (norm_selector == 3) //min
2288  {
2289  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2290  tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
2291  }
2292  else if (norm_selector == 4) //max
2293  {
2294  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2295  tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
2296  }
2297 
2298  tmp_buffer[threadIdx.x] = tmp;
2299 
2300  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
2301  {
2302  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2303  {
2304  __syncthreads();
2305  if (threadIdx.x < stride)
2306  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2307  }
2308  }
2309  else if (norm_selector == 3)
2310  {
2311  //min:
2312  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2313  {
2314  __syncthreads();
2315  if (threadIdx.x < stride)
2316  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2317  }
2318  }
2319  else if (norm_selector == 4)
2320  {
2321  //max:
2322  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2323  {
2324  __syncthreads();
2325  if (threadIdx.x < stride)
2326  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2327  }
2328  }
2329  else
2330  {
2331  //norm_inf:
2332  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2333  {
2334  __syncthreads();
2335  if (threadIdx.x < stride)
2336  tmp_buffer[threadIdx.x] = fmax(tmp_buffer[threadIdx.x], tmp_buffer[threadIdx.x+stride]);
2337  }
2338  }
2339 
2340  if (threadIdx.x == 0)
2341  group_buffer[blockIdx.x] = tmp_buffer[0];
2342 }
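The norm_selector argument encodes the requested reduction: 0 computes the supremum norm, 1 the 1-norm, 2 the squared contributions to the 2-norm, 3 the minimum and 4 the maximum. In formulas,

  ||x||_1 = sum_i |x_i|,   ||x||_2 = sqrt(sum_i x_i^2),   ||x||_inf = max_i |x_i|.

Note that for norm_selector == 2 the kernel only accumulates squares per work group; the final square root is applied by the caller (see norm_2_impl / norm_2_cpu further below).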
2343 
2344 template<typename NumericT>
2345 __global__ void norm_kernel_integers(
2346  const NumericT * vec,
2347  unsigned int start1,
2348  unsigned int inc1,
2349  unsigned int size1,
2350  unsigned int norm_selector,
2351  NumericT * group_buffer)
2352 {
2353  __shared__ NumericT tmp_buffer[128];
2354 
2355  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
2356  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2357  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2358  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2359  group_stop = (group_stop > size1) ? size1 : group_stop;
2360 
2361  if (norm_selector == 1) //norm_1
2362  {
2363  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2364  tmp += abs(vec[i*inc1 + start1]);
2365  }
2366  else if (norm_selector == 0) //norm_inf
2367  {
2368  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2369  tmp = (tmp > abs(vec[i*inc1 + start1])) ? tmp : abs(vec[i*inc1 + start1]);
2370  }
2371  else if (norm_selector == 3) //min
2372  {
2373  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2374  tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
2375  }
2376  else if (norm_selector == 4) //max
2377  {
2378  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2379  tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
2380  }
2381 
2382  tmp_buffer[threadIdx.x] = tmp;
2383 
2384  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
2385  {
2386  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2387  {
2388  __syncthreads();
2389  if (threadIdx.x < stride)
2390  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2391  }
2392  }
2393  else if (norm_selector == 3)
2394  {
2395  //min:
2396  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2397  {
2398  __syncthreads();
2399  if (threadIdx.x < stride)
2400  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2401  }
2402  }
2403  else if (norm_selector == 4)
2404  {
2405  //max:
2406  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2407  {
2408  __syncthreads();
2409  if (threadIdx.x < stride)
2410  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2411  }
2412  }
2413  else
2414  {
2415  //norm_inf:
2416  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2417  {
2418  __syncthreads();
2419  if (threadIdx.x < stride)
2420  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
2421  }
2422  }
2423 
2424  if (threadIdx.x == 0)
2425  group_buffer[blockIdx.x] = tmp_buffer[0];
2426 }
2427 
2428 template<typename NumericT>
2429 __global__ void norm_kernel_unsigned_integers(
2430  const NumericT * vec,
2431  unsigned int start1,
2432  unsigned int inc1,
2433  unsigned int size1,
2434  unsigned int norm_selector,
2435  NumericT * group_buffer)
2436 {
2437  __shared__ NumericT tmp_buffer[128];
2438 
2439  NumericT tmp = (norm_selector > 2) ? vec[start1] : 0;
2440  unsigned int work_per_thread = (size1 - 1) / (gridDim.x * blockDim.x) + 1;
2441  unsigned int group_start = blockIdx.x * work_per_thread * blockDim.x;
2442  unsigned int group_stop = (blockIdx.x + 1) * work_per_thread * blockDim.x;
2443  group_stop = (group_stop > size1) ? size1 : group_stop;
2444 
2445  if (norm_selector == 1) //norm_1
2446  {
2447  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2448  tmp += vec[i*inc1 + start1];
2449  }
2450  else if (norm_selector == 0) //norm_inf
2451  {
2452  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2453  tmp = (tmp > vec[i*inc1 + start1]) ? tmp : vec[i*inc1 + start1];
2454  }
2455  else if (norm_selector == 3) //min
2456  {
2457  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2458  tmp = (vec[i*inc1 + start1] < tmp) ? vec[i*inc1 + start1] : tmp;
2459  }
2460  else if (norm_selector == 4) //max
2461  {
2462  for (unsigned int i = group_start + threadIdx.x; i < group_stop; i += blockDim.x)
2463  tmp = (vec[i*inc1 + start1] > tmp) ? vec[i*inc1 + start1] : tmp;
2464  }
2465 
2466  tmp_buffer[threadIdx.x] = tmp;
2467 
2468  if (norm_selector == 1 || norm_selector == 2) //parallel reduction for norm_1 or norm_2:
2469  {
2470  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2471  {
2472  __syncthreads();
2473  if (threadIdx.x < stride)
2474  tmp_buffer[threadIdx.x] += tmp_buffer[threadIdx.x+stride];
2475  }
2476  }
2477  else if (norm_selector == 3)
2478  {
2479  //min:
2480  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2481  {
2482  __syncthreads();
2483  if (threadIdx.x < stride)
2484  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2485  }
2486  }
2487  else if (norm_selector == 4)
2488  {
2489  //max:
2490  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2491  {
2492  __syncthreads();
2493  if (threadIdx.x < stride)
2494  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x+stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x+stride] : tmp_buffer[threadIdx.x];
2495  }
2496  }
2497  else
2498  {
2499  //norm_inf:
2500  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2501  {
2502  __syncthreads();
2503  if (threadIdx.x < stride)
2504  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x] > tmp_buffer[threadIdx.x+stride]) ? tmp_buffer[threadIdx.x] : tmp_buffer[threadIdx.x+stride];
2505  }
2506  }
2507 
2508  if (threadIdx.x == 0)
2509  group_buffer[blockIdx.x] = tmp_buffer[0];
2510 }
2511 
2513 namespace detail
2514 {
2515  struct norm_kernel_launcher_integers
2516  {
2517  template<typename NumericT>
2518  static void apply(vector_base<NumericT> const & vec1,
2519  vector_base<NumericT> & temp,
2520  unsigned int option)
2521  {
2522  typedef NumericT value_type;
2523  norm_kernel_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2524  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2525  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2526  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2527  static_cast<unsigned int>(option),
2528  detail::cuda_arg<value_type>(temp)
2529  );
2530  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2531  }
2532  };
2533 
2534  struct norm_kernel_launcher_unsigned_integers
2535  {
2536  template<typename NumericT>
2537  static void apply(vector_base<NumericT> const & vec1,
2538  vector_base<NumericT> & temp,
2539  unsigned int option)
2540  {
2541  typedef NumericT value_type;
2542  norm_kernel_unsigned_integers<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2543  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2544  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2545  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2546  static_cast<unsigned int>(option),
2547  detail::cuda_arg<value_type>(temp)
2548  );
2549  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2550  }
2551  };
2552 
2553 
2554  struct norm_kernel_launcher_floats
2555  {
2556  template<typename NumericT>
2557  static void apply(vector_base<NumericT> const & vec1,
2558  vector_base<NumericT> & temp,
2559  unsigned int option)
2560  {
2561  typedef NumericT value_type;
2562  norm_kernel_floats<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2563  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2564  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2565  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2566  static_cast<unsigned int>(option),
2567  detail::cuda_arg<value_type>(temp)
2568  );
2569  VIENNACL_CUDA_LAST_ERROR_CHECK("norm_kernel");
2570  }
2571  };
2572 
2573  template<typename NumericT>
2574  struct norm_kernel_launcher : public norm_kernel_launcher_integers {};
2575 
2576  template<>
2577  struct norm_kernel_launcher<unsigned char> : public norm_kernel_launcher_unsigned_integers {};
2578 
2579  template<>
2580  struct norm_kernel_launcher<unsigned short> : public norm_kernel_launcher_unsigned_integers {};
2581 
2582  template<>
2583  struct norm_kernel_launcher<unsigned int> : public norm_kernel_launcher_unsigned_integers {};
2584 
2585  template<>
2586  struct norm_kernel_launcher<unsigned long> : public norm_kernel_launcher_unsigned_integers {};
2587 
2588  template<>
2589  struct norm_kernel_launcher<float> : public norm_kernel_launcher_floats {};
2590 
2591  template<>
2592  struct norm_kernel_launcher<double> : public norm_kernel_launcher_floats {};
2593 
2594 }
2603 template<typename NumericT>
2604 void norm_1_impl(vector_base<NumericT> const & vec1,
2605  scalar<NumericT> & result)
2606 {
2607  typedef NumericT value_type;
2608 
2609  vcl_size_t work_groups = 128;
2610  viennacl::vector<value_type> temp(work_groups);
2611 
2612  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
2613  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 1, result);
2614 }
2615 
2621 template<typename NumericT>
2622 void norm_1_cpu(vector_base<NumericT> const & vec1,
2623  NumericT & result)
2624 {
2625  typedef NumericT value_type;
2626 
2627  vcl_size_t work_groups = 128;
2628  viennacl::vector<value_type> temp(work_groups);
2629 
2630  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 1);
2631 
2632  // Now copy partial results from GPU back to CPU and run reduction there:
2633  std::vector<value_type> temp_cpu(work_groups);
2634  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2635 
2636  result = 0;
2637  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2638  result += *it;
2639 }
2640 
2642 
2648 template<typename NumericT>
2649 void norm_2_impl(vector_base<NumericT> const & vec1,
2650  scalar<NumericT> & result)
2651 {
2652  typedef NumericT value_type;
2653 
2654  vcl_size_t work_groups = 128;
2655  viennacl::vector<value_type> temp(work_groups);
2656 
2657  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
2658 
2659  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 2, result);
2660 }
2661 
2667 template<typename NumericT>
2668 void norm_2_cpu(vector_base<NumericT> const & vec1,
2669  NumericT & result)
2670 {
2671  typedef NumericT value_type;
2672 
2673  vcl_size_t work_groups = 128;
2674  viennacl::vector<value_type> temp(work_groups);
2675 
2676  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 2);
2677 
2678  std::vector<value_type> temp_cpu(work_groups);
2679  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2680 
2681  result = 0;
2682  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2683  result += *it;
2684  result = std::sqrt(result);
2685 }
2686 
2687 
2689 
2695 template<typename NumericT>
2696 void norm_inf_impl(vector_base<NumericT> const & vec1,
2697  scalar<NumericT> & result)
2698 {
2699  typedef NumericT value_type;
2700 
2701  vcl_size_t work_groups = 128;
2702  viennacl::vector<value_type> temp(work_groups);
2703 
2704  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
2705  detail::vector_sum_kernel_launcher<NumericT>::apply(temp, 0, result);
2706 }
2707 
2708 
2709 
2715 template<typename NumericT>
2716 void norm_inf_cpu(vector_base<NumericT> const & vec1,
2717  NumericT & result)
2718 {
2719  typedef NumericT value_type;
2720 
2721  vcl_size_t work_groups = 128;
2722  viennacl::vector<value_type> temp(work_groups);
2723 
2724  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 0);
2725 
2726  std::vector<value_type> temp_cpu(work_groups);
2727  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2728 
2729  result = 0;
2730  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2731  result = std::max(result, *it);
2732 }
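Again, these are backend routines; user code goes through the generic norm wrappers, which pick the _impl or _cpu variant depending on whether the target is a device scalar or a host value. A minimal sketch (header names as shipped with ViennaCL):

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/norm_1.hpp"
  #include "viennacl/linalg/norm_2.hpp"
  #include "viennacl/linalg/norm_inf.hpp"

  int main()
  {
    viennacl::vector<float> v(1000);
    float n1   = viennacl::linalg::norm_1(v);    // host result -> norm_1_cpu()
    float n2   = viennacl::linalg::norm_2(v);    // host result -> norm_2_cpu()
    float ninf = viennacl::linalg::norm_inf(v);  // host result -> norm_inf_cpu()
    (void)n1; (void)n2; (void)ninf;
    return 0;
  }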
2733 
2734 
2736 
2737 // second reduction stage for min() and max()
2738 template<typename NumericT>
2739 __global__ void vector_maxmin_kernel(
2740  const NumericT * vec1,
2741  unsigned int start1,
2742  unsigned int inc1,
2743  unsigned int size1,
2744  unsigned int option, //0: use max, 1: use min
2745  NumericT * result)
2746 {
2747  __shared__ NumericT tmp_buffer[128];
2748  NumericT thread_minmax = vec1[start1];
2749  for (unsigned int i = threadIdx.x; i<size1; i += blockDim.x)
2750  {
2751  if (option > 0) //min
2752  thread_minmax = (vec1[i*inc1+start1] < thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
2753  else
2754  thread_minmax = (vec1[i*inc1+start1] > thread_minmax) ? vec1[i*inc1+start1] : thread_minmax;
2755  }
2756 
2757  tmp_buffer[threadIdx.x] = thread_minmax;
2758 
2759  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2760  {
2761  __syncthreads();
2762  if (threadIdx.x < stride)
2763  {
2764  if (option > 0) //min
2765  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] < tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
2766  else
2767  tmp_buffer[threadIdx.x] = (tmp_buffer[threadIdx.x + stride] > tmp_buffer[threadIdx.x]) ? tmp_buffer[threadIdx.x + stride] : tmp_buffer[threadIdx.x];
2768  }
2769  }
2770 
2771  if (threadIdx.x == 0)
2772  *result = tmp_buffer[0];
2773 }
2774 
2775 
2781 template<typename NumericT>
2782 void max_impl(vector_base<NumericT> const & vec1,
2783  scalar<NumericT> & result)
2784 {
2785  typedef NumericT value_type;
2786 
2787  vcl_size_t work_groups = 128;
2788  viennacl::vector<value_type> temp(work_groups);
2789 
2790  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);
2791 
2792  vector_maxmin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2793  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2794  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2795  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2796  static_cast<unsigned int>(0),
2797  detail::cuda_arg<value_type>(result)
2798  );
2799  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
2800 }
2801 
2802 
2803 
2809 template<typename NumericT>
2810 void max_cpu(vector_base<NumericT> const & vec1,
2811  NumericT & result)
2812 {
2813  typedef NumericT value_type;
2814 
2815  vcl_size_t work_groups = 128;
2816  viennacl::vector<value_type> temp(work_groups);
2817 
2818  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 4);
2819 
2820  std::vector<value_type> temp_cpu(work_groups);
2821  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2822 
2823  result = temp[0];
2824  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2825  result = std::max(result, *it);
2826 }
2827 
2829 
2835 template<typename NumericT>
2836 void min_impl(vector_base<NumericT> const & vec1,
2837  scalar<NumericT> & result)
2838 {
2839  typedef NumericT value_type;
2840 
2841  vcl_size_t work_groups = 128;
2842  viennacl::vector<value_type> temp(work_groups);
2843 
2844  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);
2845 
2846  vector_maxmin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
2847  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2848  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2849  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2850  static_cast<unsigned int>(1),
2851  detail::cuda_arg<value_type>(result)
2852  );
2853  VIENNACL_CUDA_LAST_ERROR_CHECK("vector_maxmin_kernel");
2854 }
2855 
2856 
2857 
2863 template<typename NumericT>
2864 void min_cpu(vector_base<NumericT> const & vec1,
2865  NumericT & result)
2866 {
2867  typedef NumericT value_type;
2868 
2869  vcl_size_t work_groups = 128;
2870  viennacl::vector<value_type> temp(work_groups);
2871 
2872  detail::norm_kernel_launcher<NumericT>::apply(vec1, temp, 3);
2873 
2874  std::vector<value_type> temp_cpu(work_groups);
2875  viennacl::fast_copy(temp.begin(), temp.end(), temp_cpu.begin());
2876 
2877  result = temp[0];
2878  for (typename std::vector<value_type>::const_iterator it = temp_cpu.begin(); it != temp_cpu.end(); ++it)
2879  result = std::min(result, *it);
2880 }
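Both max and min follow the same two-stage reduction pattern as the norms. For completeness, a usage sketch via the high-level wrappers; it assumes the max()/min() overloads for ViennaCL vectors from viennacl/linalg/maxmin.hpp:

  #include "viennacl/vector.hpp"
  #include "viennacl/linalg/maxmin.hpp"

  int main()
  {
    viennacl::vector<float> v(1000);
    float vmax = viennacl::linalg::max(v);   // reaches max_impl() / max_cpu() above
    float vmin = viennacl::linalg::min(v);   // reaches min_impl() / min_cpu() above
    (void)vmax; (void)vmin;
    return 0;
  }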
2881 
2882 
2883 
2884 
2886 
2887 
2888 
2889 //index_norm_inf:
2890 
2891 // fixes the problem of not having (f)abs available in a consistent manner
2892 template<typename NumericT>
2893 __device__ NumericT cuda_abs(NumericT val) { return (val < 0) ? -val : val; }
2894 __device__ inline unsigned long cuda_abs(unsigned long val) { return val; }
2895 __device__ inline unsigned int cuda_abs(unsigned int val) { return val; }
2896 __device__ inline unsigned short cuda_abs(unsigned short val) { return val; }
2897 __device__ inline unsigned char cuda_abs(unsigned char val) { return val; }
2898 
2899 template<typename NumericT>
2900 __global__ void index_norm_inf_kernel(const NumericT * vec,
2901  unsigned int start1,
2902  unsigned int inc1,
2903  unsigned int size1,
2904  unsigned int * result)
2905 {
2906  __shared__ NumericT float_buffer[128];
2907  __shared__ unsigned int index_buffer[128];
2908 
2909  float_buffer[threadIdx.x] = 0;
2910  index_buffer[threadIdx.x] = 0;
2911 
2912  //step 1: fill buffer:
2913  NumericT cur_max = NumericT(0);
2914  NumericT tmp;
2915  for (unsigned int i = threadIdx.x; i < size1; i += blockDim.x)
2916  {
2917  tmp = vec[i*inc1+start1];
2918  tmp = cuda_abs(tmp);
2919  if (cur_max < tmp)
2920  {
2921  float_buffer[threadIdx.x] = tmp;
2922  index_buffer[threadIdx.x] = i;
2923  cur_max = tmp;
2924  }
2925  }
2926 
2927  //step 2: parallel reduction:
2928  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
2929  {
2930  __syncthreads();
2931  if (threadIdx.x < stride)
2932  {
2933  //find the first occurring index
2934  if (float_buffer[threadIdx.x] < float_buffer[threadIdx.x+stride])
2935  {
2936  index_buffer[threadIdx.x] = index_buffer[threadIdx.x+stride];
2937  float_buffer[threadIdx.x] = float_buffer[threadIdx.x+stride];
2938  }
2939  }
2940  }
2941 
2942  if (threadIdx.x == 0)
2943  *result = index_buffer[0];
2944 }
2945 
2946 //This function should return a CPU scalar, otherwise statements like
2947 // vcl_rhs[index_norm_inf(vcl_rhs)]
2948 // are ambiguous
2954 template<typename NumericT>
2955 vcl_size_t index_norm_inf(vector_base<NumericT> const & vec1)
2956 {
2957  typedef NumericT value_type;
2958 
2959  viennacl::backend::mem_handle h;
2960  viennacl::backend::memory_create(h, sizeof(unsigned int), viennacl::traits::context(vec1));
2961 
2962  index_norm_inf_kernel<<<1, 128>>>(detail::cuda_arg<value_type>(vec1),
2963  static_cast<unsigned int>(viennacl::traits::start(vec1)),
2964  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
2965  static_cast<unsigned int>(viennacl::traits::size(vec1)),
2966  //detail::cuda_arg<unsigned int>(h.cuda_handle())
2967  reinterpret_cast<unsigned int *>(h.cuda_handle().get())
2968  );
2969  VIENNACL_CUDA_LAST_ERROR_CHECK("index_norm_inf_kernel");
2970 
2971  unsigned int ret = 0;
2972  viennacl::backend::memory_read(h, 0, sizeof(unsigned int), &ret);
2973  return static_cast<vcl_size_t>(ret);
2974 }
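Because a plain host-side index is returned (see the comment above), expressions that immediately use the index as a subscript remain unambiguous. A small sketch; include the appropriate viennacl/linalg header that declares the index_norm_inf wrapper in your setup:

  viennacl::vector<float> v(1000);
  vcl_size_t idx   = viennacl::linalg::index_norm_inf(v);  // index of the entry largest in modulus
  float      value = v[idx];                               // directly usable as a subscript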
2975 
2977 
2978 template<typename NumericT>
2979 __global__ void plane_rotation_kernel(
2980  NumericT * vec1,
2981  unsigned int start1,
2982  unsigned int inc1,
2983  unsigned int size1,
2984  NumericT * vec2,
2985  unsigned int start2,
2986  unsigned int inc2,
2987  unsigned int size2,
2988  NumericT alpha,
2989  NumericT beta)
2990 {
2991  NumericT tmp1 = 0;
2992  NumericT tmp2 = 0;
2993 
2994  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size1; i += blockDim.x * gridDim.x)
2995  {
2996  tmp1 = vec1[i*inc1+start1];
2997  tmp2 = vec2[i*inc2+start2];
2998 
2999  vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2;
3000  vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1;
3001  }
3002 
3003 }
3004 
3014 template<typename NumericT>
3015 void plane_rotation(vector_base<NumericT> & vec1,
3016  vector_base<NumericT> & vec2,
3017  NumericT alpha, NumericT beta)
3018 {
3019  typedef NumericT value_type;
3020 
3021  value_type temporary_alpha = 0;
3022  if (viennacl::is_cpu_scalar<NumericT>::value)
3023  temporary_alpha = alpha;
3024 
3025  value_type temporary_beta = 0;
3026  if (viennacl::is_cpu_scalar<NumericT>::value)
3027  temporary_beta = beta;
3028 
3029  plane_rotation_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec1),
3030  static_cast<unsigned int>(viennacl::traits::start(vec1)),
3031  static_cast<unsigned int>(viennacl::traits::stride(vec1)),
3032  static_cast<unsigned int>(viennacl::traits::size(vec1)),
3033  detail::cuda_arg<value_type>(vec2),
3034  static_cast<unsigned int>(viennacl::traits::start(vec2)),
3035  static_cast<unsigned int>(viennacl::traits::stride(vec2)),
3036  static_cast<unsigned int>(viennacl::traits::size(vec2)),
3037  detail::cuda_arg<value_type>(detail::arg_reference(alpha, temporary_alpha)),
3038  detail::cuda_arg<value_type>(detail::arg_reference(beta, temporary_beta)) );
3039  VIENNACL_CUDA_LAST_ERROR_CHECK("plane_rotation_kernel");
3040 }
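In matrix form, the kernel applies the plane (Givens-type) rotation

  ( vec1_i )      (  alpha   beta ) ( vec1_i )
  (        )  <-  (               ) (        )
  ( vec2_i )      ( -beta   alpha ) ( vec2_i )

to every pair of entries, i.e. vec1 <- alpha*vec1 + beta*vec2 and vec2 <- alpha*vec2 - beta*vec1, with both right-hand sides evaluated at the old values (hence the tmp1/tmp2 copies inside the kernel).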
3041 
3042 } //namespace cuda
3043 } //namespace linalg
3044 } //namespace viennacl
3045 
3046 
3047 #endif
vcl_size_t const_size() const
Definition: vector.hpp:1129
__global__ void vec_element_abs_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
Definition: common.hpp:38
void vector_assign(vector_base< NumericT > &vec1, ScalarT1 const &alpha, bool up_to_internal_size=false)
Assign a constant value to a vector (-range/-slice)
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:226
__global__ void vector_sum_kernel_unsigned_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
void norm_2_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the l^2-norm of a vector - implementation.
__global__ void norm_kernel_floats(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
__global__ void inner_prod_4_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, NumericT *group_results)
__global__ void vec_element_asin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Generic size and resize functionality for different vector and matrix types.
__global__ void plane_rotation_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2, NumericT alpha, NumericT beta)
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
__global__ void inner_prod_3_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, NumericT *group_results)
Various little tools used here and there in ViennaCL.
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
void av(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarType1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
__global__ void vector_multi_sum_kernel(NumericT const *vec1, NumericT *result, unsigned int start_result, unsigned int inc_result)
__global__ void vector_maxmin_kernel(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_fabs_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
void max_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the maximum of a vector, both reduction stages run on the GPU.
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void norm_kernel_unsigned_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
Determines row and column increments for matrices and matrix proxies.
void memory_read(mem_handle const &src_buffer, vcl_size_t src_offset, vcl_size_t bytes_to_read, void *ptr, bool async=false)
Reads data from a buffer back to main RAM.
Definition: memory.hpp:261
void norm_1_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the l^1-norm of a vector.
T max(const T &lhs, const T &rhs)
Maximum.
Definition: util.hpp:59
#define VIENNACL_MDOT_WORKGROUP_SIZE
An expression template class that represents a binary operation that yields a vector.
Definition: forwards.h:238
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
__global__ void vec_element_sqrt_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void avbv_v_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
void max_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the maximum of a vector, first reduction stage on the GPU, second stage on the CPU...
__global__ void vec_element_tanh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
#define VIENNACL_MDOT_WORKGROUP_NUM
void vector_swap(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
Swaps the contents of two vectors, data is copied.
__global__ void index_norm_inf_kernel(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int *result)
__global__ void inner_prod_kernel(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2, NumericT *group_buffer)
__global__ void vec_element_tan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
Definition: forwards.h:447
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like)
Tuple class holding pointers to multiple vectors. Mainly used as a temporary object returned from vie...
Definition: forwards.h:268
void norm_inf_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the supremum-norm of a vector.
void inner_prod_cpu(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, NumericT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1...
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
__global__ void vec_element_atan_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_sum_kernel_floats(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
void avbv_v(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
std::size_t vcl_size_t
Definition: forwards.h:74
__global__ void vec_element_log_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_sum_kernel_integers(const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int option, NumericT *result)
__global__ void vec_element_cosh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
Definition: predicate.hpp:466
__global__ void av_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_acos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vec_element_ceil_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
void min_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the maximum of a vector, first reduction stage on the GPU, second stage on the CPU...
__global__ void norm_kernel_integers(const NumericT *vec, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int norm_selector, NumericT *group_buffer)
void norm_1_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^1-norm of a vector.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vcl_size_t index_norm_inf(vector_base< NumericT > const &vec1)
Computes the index of the first entry that is equal to the supremum-norm in modulus.
__global__ void element_op_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
__global__ void vec_element_exp_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void plane_rotation(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2, NumericT alpha, NumericT beta)
Computes a plane rotation of two vectors.
void min_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the maximum of a vector, both reduction stages run on the GPU.
viennacl::context context(T const &t)
Returns an ID for the currently active memory domain of an object.
Definition: context.hpp:40
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
void inner_prod_impl(vector_base< NumericT > const &vec1, vector_base< NumericT > const &vec2, ScalarT &result)
Computes the inner product of two vectors - implementation. Library users should call inner_prod(vec1...
__global__ void vector_swap_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT *vec2, unsigned int start2, unsigned int inc2)
void norm_inf_cpu(vector_base< NumericT > const &vec1, NumericT &result)
Computes the supremum-norm of a vector.
__global__ void avbv_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *fac2, unsigned int options2, const NumericT *vec2, unsigned int start2, unsigned int inc2, const NumericT *fac3, unsigned int options3, const NumericT *vec3, unsigned int start3, unsigned int inc3)
__global__ void inner_prod_2_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, NumericT *group_results)
__global__ void vec_element_sin_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
Definition: mem_handle.hpp:89
VectorType const & const_at(vcl_size_t i) const
Definition: vector.hpp:1132
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
Definition: forwards.h:129
__global__ void vec_element_cos_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
__global__ void vector_assign_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, unsigned int internal_size1, NumericT alpha)
void memory_create(mem_handle &handle, vcl_size_t size_in_bytes, viennacl::context const &ctx, const void *host_ptr=NULL)
Creates an array of the specified size. If a host pointer is provided as the last argument, the buffer is initialized with the data it points to.
Definition: memory.hpp:87
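A hedged sketch of this backend-agnostic allocation routine (namespace viennacl::backend); the helper name and the non-empty host buffer are illustrative assumptions only:

  #include <vector>
  #include "viennacl/backend/memory.hpp"

  void allocate_and_fill(viennacl::backend::mem_handle & handle,
                         std::vector<double> const & host_data,   // assumed non-empty
                         viennacl::context const & ctx)
  {
    // allocate sizeof(double)*N bytes in the memory domain selected by ctx and
    // initialize the new buffer from the provided host pointer
    viennacl::backend::memory_create(handle,
                                     sizeof(double) * host_data.size(),
                                     ctx,
                                     &(host_data[0]));
  }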
__global__ void vec_element_log10_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
T min(const T &lhs, const T &rhs)
Minimum.
Definition: util.hpp:45
size_type internal_size() const
Returns the internal length of the vector, which is given by size() plus the extra memory due to padding.
Definition: vector_def.hpp:120
iterator end()
Returns an iterator pointing to the end of the vector (STL like)
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multiplications).
Definition: predicate.hpp:436
__global__ void vec_element_floor_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
A tag class representing element-wise unary operations (like sin()) on vectors or matrices.
Definition: forwards.h:133
Implementation of the ViennaCL scalar class.
void norm_2_impl(vector_base< NumericT > const &vec1, scalar< NumericT > &result)
Computes the l^2-norm of a vector - implementation.
void avbv(vector_base< NumericT > &vec1, vector_base< NumericT > const &vec2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, vector_base< NumericT > const &vec3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
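avbv() is the routine to which the expression frontend lowers vector updates of the form x = a*y + b*z; the option flags encode sign flips and reciprocals so that variants such as x = y/a - z/b need no temporaries. A minimal user-level sketch:

  #include "viennacl/vector.hpp"

  void axpby(viennacl::vector<double> & x,
             viennacl::vector<double> const & y,
             viennacl::vector<double> const & z)
  {
    double a = 2.0, b = -0.5;
    x = a * y + b * z;   // dispatched to avbv() on the active backend
  }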
__global__ void inner_prod_8_kernel(const NumericT *x, unsigned int startx, unsigned int stridex, unsigned int sizex, const NumericT *y0, unsigned int start0, unsigned int stride0, const NumericT *y1, unsigned int start1, unsigned int stride1, const NumericT *y2, unsigned int start2, unsigned int stride2, const NumericT *y3, unsigned int start3, unsigned int stride3, const NumericT *y4, unsigned int start4, unsigned int stride4, const NumericT *y5, unsigned int start5, unsigned int stride5, const NumericT *y6, unsigned int start6, unsigned int stride6, const NumericT *y7, unsigned int start7, unsigned int stride7, NumericT *group_results)
__global__ void vec_element_sinh_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
Definition: common.hpp:137
__device__ NumericT cuda_abs(NumericT val)
Simple enable-if variant that uses the SFINAE pattern.
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91
__global__ void element_op_int_kernel(NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, NumericT const *vec2, unsigned int start2, unsigned int inc2, NumericT const *vec3, unsigned int start3, unsigned int inc3, unsigned int op_type)
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)
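A short sketch of the iterator-based fast_copy() overload listed above, which transfers a contiguous block between device and host; the destination must provide at least gpu_vec.size() entries:

  #include <vector>
  #include "viennacl/vector.hpp"

  void download(viennacl::vector<double> const & gpu_vec, std::vector<double> & host_vec)
  {
    host_vec.resize(gpu_vec.size());
    viennacl::fast_copy(gpu_vec.begin(), gpu_vec.end(), host_vec.begin());
  }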