ViennaCL - The Vienna Computing Library  1.6.2
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
iterative_operations.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_OPENCL_ITERATIVE_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include <cmath>
26 
27 #include "viennacl/forwards.h"
29 #include "viennacl/ocl/device.hpp"
30 #include "viennacl/ocl/handle.hpp"
31 #include "viennacl/ocl/kernel.hpp"
32 #include "viennacl/scalar.hpp"
33 #include "viennacl/tools/tools.hpp"
39 #include "viennacl/traits/size.hpp"
43 
44 namespace viennacl
45 {
46 namespace linalg
47 {
48 namespace opencl
49 {
50 
// -- pipelined_cg_vector_update -----------------------------------------------
// Fused vector update for the pipelined CG solver: a single kernel launch
// updates result, p and r from alpha, beta and Ap and accumulates partial
// inner products into inner_prod_buffer (exact update formulas live in the
// OpenCL kernel source, see iterative.hpp).
// NOTE(review): this listing is a Doxygen scrape - the signature head, the
// kernel lookup that declares 'k', and the vendor-check condition are elided.
51 template<typename NumericT>
53  NumericT alpha,
56  vector_base<NumericT> const & Ap,
57  NumericT beta,
58  vector_base<NumericT> & inner_prod_buffer)
59 {
// Enqueue on the OpenCL context owning 'result'.
60  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
62 
64  cl_uint vec_size = cl_uint(viennacl::traits::size(result));
65 
// Default launch configuration: 128 work items per group, 128 groups.
66  k.local_work_size(0, 128);
67  k.global_work_size(0, 128*128);
68 
// Elided condition - presumably the NVIDIA vendor check used elsewhere in
// this file (vendor_id() == viennacl::ocl::nvidia_id); TODO confirm.
70  {
71  k.local_work_size(0, 256);
72  k.global_work_size(0, 256*256);
73  }
74 
// One NumericT of local memory per work item for the in-group reduction.
75  viennacl::ocl::enqueue(k(result, alpha, p, r, Ap, beta, inner_prod_buffer, vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
76 }
77 
// -- pipelined_cg_prod (compressed_matrix / CSR) ------------------------------
// Fused Ap = A * p for pipelined CG, also writing partial inner products into
// inner_prod_buffer. The buffer is split into three equal chunks (one per
// accumulated inner product - confirm against the kernel source).
// A.blocks1() is the row-block count for the adaptive CSR SpMV.
// NOTE(review): Doxygen scrape - signature head, kernel lookup ('k') and the
// vendor-check condition are elided from this listing.
78 template<typename NumericT>
80  vector_base<NumericT> const & p,
82  vector_base<NumericT> & inner_prod_buffer)
83 {
84  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
86 
88 
89  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
90  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
91 
92  k.local_work_size(0, 128);
93  k.global_work_size(0, 128*128);
94 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
96  {
97  k.local_work_size(0, 256);
98  k.global_work_size(0, 256*256);
99  }
100 
// Args: CSR row/column/row-block/entry handles plus block count, then the
// vectors; two per-group scratch buffers and a fixed 1024-entry buffer.
101  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
102  p,
103  Ap,
104  vec_size,
105  inner_prod_buffer,
106  buffer_size_per_vector,
107  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
108  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
109  viennacl::ocl::local_mem(1024 * sizeof(NumericT))
110  ));
111 
112 }
113 
// -- pipelined_cg_prod (coordinate_matrix / COO) ------------------------------
// Fused Ap = A * p for pipelined CG on a COO matrix. handle12() is the packed
// (row, column) index array, handle3() the group start index array.
// NOTE(review): Doxygen scrape - signature head and kernel lookup are elided.
114 template<typename NumericT>
116  vector_base<NumericT> const & p,
118  vector_base<NumericT> & inner_prod_buffer)
119 {
120  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
122 
123  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
124  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
125 
// Zero the result first - presumably because the COO kernel accumulates into
// Ap rather than overwriting it; TODO confirm against kernel source.
126  Ap.clear();
127 
129  unsigned int thread_num = 256; //k.local_work_size(0);
130 
131  k.local_work_size(0, thread_num);
132 
133  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
134 
// Scratch: one cl_uint and one NumericT slot per work item for the COO
// segmented reduction, plus two buffers for the inner-product reduction.
135  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
136  p,
137  Ap,
138  vec_size,
139  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
140  viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
141  inner_prod_buffer,
142  buffer_size_per_vector,
143  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
144  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
145  ));
146 }
147 
// -- pipelined_cg_prod (ell_matrix / ELLPACK) ---------------------------------
// Fused Ap = A * p for pipelined CG on an ELL matrix. handle2() is the column
// index array, handle() the entries; maxnnz()/internal_maxnnz() give the
// (padded) per-row nonzero counts.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided from this listing.
148 template<typename NumericT>
150  vector_base<NumericT> const & p,
152  vector_base<NumericT> & inner_prod_buffer)
153 {
154  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
156 
157  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
158  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
159 
161 
162  unsigned int thread_num = 128;
163  unsigned int group_num = 256;
164 
165  k.local_work_size(0, thread_num);
166  k.global_work_size(0, thread_num * group_num);
167 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
169  {
170  k.local_work_size(0, 256);
171  k.global_work_size(0, 256*256);
172  }
173 
174  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
175  A.handle().opencl_handle(),
176  cl_uint(A.internal_size1()),
177  cl_uint(A.maxnnz()),
178  cl_uint(A.internal_maxnnz()),
179  viennacl::traits::opencl_handle(p),
180  viennacl::traits::opencl_handle(Ap),
181  vec_size,
182  inner_prod_buffer,
183  buffer_size_per_vector,
184  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
185  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
186  )
187  );
188 }
189 
// -- pipelined_cg_prod (sliced_ell_matrix) ------------------------------------
// Fused Ap = A * p for pipelined CG on a sliced-ELL matrix. The local work
// size is tied to the matrix layout: one work item per row within a block
// (thread_num = A.rows_per_block()).
// NOTE(review): Doxygen scrape - signature head and kernel lookup are elided.
190 template<typename NumericT>
192  vector_base<NumericT> const & p,
194  vector_base<NumericT> & inner_prod_buffer)
195 {
196  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
198 
199  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
200  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
201 
203 
204  unsigned int thread_num = static_cast<unsigned int>(A.rows_per_block());
205  unsigned int group_num = 256;
206 
207  k.local_work_size(0, thread_num);
208  k.global_work_size(0, thread_num * group_num);
209 
210  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
211  A.handle2().opencl_handle(),
212  A.handle3().opencl_handle(),
213  A.handle().opencl_handle(),
214  viennacl::traits::opencl_handle(p),
215  viennacl::traits::opencl_handle(Ap),
216  vec_size,
217  inner_prod_buffer,
218  buffer_size_per_vector,
219  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
220  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
221  )
222  );
223 }
224 
225 
// -- pipelined_cg_prod (hyb_matrix: ELL + CSR) --------------------------------
// Fused Ap = A * p for pipelined CG on a hybrid matrix: handle2()/handle()
// carry the ELL part (ell_nnz entries per row), handle3()/handle4()/handle5()
// the CSR remainder.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided from this listing.
226 template<typename NumericT>
228  vector_base<NumericT> const & p,
230  vector_base<NumericT> & inner_prod_buffer)
231 {
232  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
234 
235  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
236  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
237 
239 
240  unsigned int thread_num = 128;
241  unsigned int group_num = 128;
242 
243  k.local_work_size(0, thread_num);
244  k.global_work_size(0, thread_num * group_num);
245 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
247  {
248  k.local_work_size(0, 256);
249  k.global_work_size(0, 256*256);
250  }
251 
252  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
253  A.handle().opencl_handle(),
254  A.handle3().opencl_handle(),
255  A.handle4().opencl_handle(),
256  A.handle5().opencl_handle(),
257  cl_uint(A.internal_size1()),
258  cl_uint(A.ell_nnz()),
259  cl_uint(A.internal_ellnnz()),
260  viennacl::traits::opencl_handle(p),
261  viennacl::traits::opencl_handle(Ap),
262  vec_size,
263  inner_prod_buffer,
264  buffer_size_per_vector,
265  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
266  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
267  )
268  );
269 }
270 
271 
273 
// -- pipelined_bicgstab_update_s ----------------------------------------------
// Fused update of s from r and Ap for pipelined BiCGStab, with partial inner
// products written into the (chunk_size, chunk_offset)-addressed slice of
// inner_prod_buffer (exact formulas live in the OpenCL kernel source).
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided from this listing.
274 template<typename NumericT>
277  vector_base<NumericT> const & Ap,
278  vector_base<NumericT> & inner_prod_buffer,
279  vcl_size_t buffer_chunk_size,
280  vcl_size_t buffer_chunk_offset)
281 {
282  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
284 
286  cl_uint vec_size = cl_uint(viennacl::traits::size(s));
287 
288  k.local_work_size(0, 128);
289  k.global_work_size(0, 128*128);
290 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
292  {
293  k.local_work_size(0, 256);
294  k.global_work_size(0, 256*256);
295  }
296 
297  cl_uint chunk_size = cl_uint(buffer_chunk_size);
298  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
299  viennacl::ocl::enqueue(k(s, r, Ap,
300  inner_prod_buffer, chunk_size, chunk_offset, vec_size,
301  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
302  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))));
303 }
304 
// -- pipelined_bicgstab_vector_update -----------------------------------------
// Fused vector updates for pipelined BiCGStab: one launch updates result, p
// and residual from alpha, omega, beta, s, As, Ap and r0star, and accumulates
// partial inner products into inner_prod_buffer.
// NOTE(review): Doxygen scrape - the kernel lookup declaring 'k' and the
// vendor-check condition are elided (the signature survived intact here).
305 template<typename NumericT>
306 void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
307  vector_base<NumericT> & residual, vector_base<NumericT> const & As,
308  NumericT beta, vector_base<NumericT> const & Ap,
309  vector_base<NumericT> const & r0star,
310  vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
311 {
// Parameter kept for interface uniformity; silence unused-parameter warning.
312  (void)buffer_chunk_size;
313 
314  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(s).context());
316 
318  cl_uint vec_size = cl_uint(viennacl::traits::size(result));
319 
320  k.local_work_size(0, 128);
321  k.global_work_size(0, 128*128);
322 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
324  {
325  k.local_work_size(0, 256);
326  k.global_work_size(0, 256*256);
327  }
328 
329  viennacl::ocl::enqueue(k(result, alpha, p, omega, s,
330  residual, As,
331  beta, Ap,
332  r0star,
333  inner_prod_buffer,
334  vec_size, viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
335  )
336  );
337 }
338 
// -- pipelined_bicgstab_prod (compressed_matrix / CSR) ------------------------
// Fused Ap = A * p for pipelined BiCGStab, additionally reducing against
// r0star; partial inner products land in the (chunk_size, chunk_offset)
// slice of inner_prod_buffer. A.blocks1() is the adaptive-SpMV block count.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided from this listing.
339 template<typename NumericT>
341  vector_base<NumericT> const & p,
343  vector_base<NumericT> const & r0star,
344  vector_base<NumericT> & inner_prod_buffer,
345  vcl_size_t buffer_chunk_size,
346  vcl_size_t buffer_chunk_offset)
347 {
348  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
350 
352 
353  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
354  cl_uint chunk_size = cl_uint(buffer_chunk_size);
355  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
356 
357  k.local_work_size(0, 128);
358  k.global_work_size(0, 128*128);
359 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
361  {
362  k.local_work_size(0, 256);
363  k.global_work_size(0, 256*256);
364  }
365 
// Three local scratch buffers: one per inner-product reduction.
366  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
367  p,
368  Ap,
369  r0star,
370  vec_size,
371  inner_prod_buffer, chunk_size, chunk_offset,
372  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
373  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
374  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
375  ));
376 
377 }
378 
379 
// -- pipelined_bicgstab_prod (coordinate_matrix / COO) ------------------------
// Fused Ap = A * p (plus r0star reductions) for pipelined BiCGStab on a COO
// matrix. handle12() packs (row, column) indices; handle3() holds group
// start indices.
// NOTE(review): Doxygen scrape - signature head and kernel lookup are elided.
380 template<typename NumericT>
382  vector_base<NumericT> const & p,
384  vector_base<NumericT> const & r0star,
385  vector_base<NumericT> & inner_prod_buffer,
386  vcl_size_t buffer_chunk_size,
387  vcl_size_t buffer_chunk_offset)
388 {
389  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
391 
392  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
393  cl_uint chunk_size = cl_uint(buffer_chunk_size);
394  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
395 
// Zero the result first - presumably the COO kernel accumulates into Ap.
396  Ap.clear();
397 
399  unsigned int thread_num = 256; //k.local_work_size(0);
400 
401  k.local_work_size(0, thread_num);
402 
403  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
404 
405  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
406  p,
407  Ap,
408  r0star,
409  vec_size,
410  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
411  viennacl::ocl::local_mem(sizeof(NumericT)*thread_num),
412  inner_prod_buffer, chunk_size, chunk_offset,
413  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
414  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
415  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
416  ));
417 }
418 
// -- pipelined_bicgstab_prod (ell_matrix / ELLPACK) ---------------------------
// Fused Ap = A * p (plus r0star reductions) for pipelined BiCGStab on an ELL
// matrix; handle2() holds column indices, handle() the entries.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided from this listing.
419 template<typename NumericT>
421  vector_base<NumericT> const & p,
423  vector_base<NumericT> const & r0star,
424  vector_base<NumericT> & inner_prod_buffer,
425  vcl_size_t buffer_chunk_size,
426  vcl_size_t buffer_chunk_offset)
427 {
428  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
430 
431  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
432  cl_uint chunk_size = cl_uint(buffer_chunk_size);
433  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
434 
436 
437  unsigned int thread_num = 128;
438  unsigned int group_num = 128;
439 
440  k.local_work_size(0, thread_num);
441  k.global_work_size(0, thread_num * group_num);
442 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
444  {
445  k.local_work_size(0, 256);
446  k.global_work_size(0, 256*256);
447  }
448 
449  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
450  A.handle().opencl_handle(),
451  cl_uint(A.internal_size1()),
452  cl_uint(A.maxnnz()),
453  cl_uint(A.internal_maxnnz()),
454  viennacl::traits::opencl_handle(p),
455  viennacl::traits::opencl_handle(Ap),
456  r0star,
457  vec_size,
458  inner_prod_buffer, chunk_size, chunk_offset,
459  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
460  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
461  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
462  )
463  );
464 }
465 
// -- pipelined_bicgstab_prod (sliced_ell_matrix) ------------------------------
// Fused Ap = A * p (plus r0star reductions) for pipelined BiCGStab on a
// sliced-ELL matrix. Local size equals the matrix's rows_per_block(); group
// count is doubled on NVIDIA devices.
// NOTE(review): Doxygen scrape - signature head and kernel lookup are elided.
466 template<typename NumericT>
468  vector_base<NumericT> const & p,
470  vector_base<NumericT> const & r0star,
471  vector_base<NumericT> & inner_prod_buffer,
472  vcl_size_t buffer_chunk_size,
473  vcl_size_t buffer_chunk_offset)
474 {
475  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
477 
478  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
479  cl_uint chunk_size = cl_uint(buffer_chunk_size);
480  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
481 
483 
484  unsigned int thread_num = static_cast<unsigned int>(A.rows_per_block());
485  unsigned int group_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
486 
487  k.local_work_size(0, thread_num);
488  k.global_work_size(0, thread_num * group_num);
489 
490  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
491  A.handle2().opencl_handle(),
492  A.handle3().opencl_handle(),
493  A.handle().opencl_handle(),
494  viennacl::traits::opencl_handle(p),
495  viennacl::traits::opencl_handle(Ap),
496  r0star,
497  vec_size,
498  inner_prod_buffer, chunk_size, chunk_offset,
499  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
500  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
501  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
502  )
503  );
504 }
505 
506 
// -- pipelined_bicgstab_prod (hyb_matrix: ELL + CSR) --------------------------
// Fused Ap = A * p (plus r0star reductions) for pipelined BiCGStab on a
// hybrid matrix: handle2()/handle() carry the ELL part, handle3()/handle4()/
// handle5() the CSR remainder.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided from this listing.
507 template<typename NumericT>
509  vector_base<NumericT> const & p,
511  vector_base<NumericT> const & r0star,
512  vector_base<NumericT> & inner_prod_buffer,
513  vcl_size_t buffer_chunk_size,
514  vcl_size_t buffer_chunk_offset)
515 {
516  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
518 
519  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
520  cl_uint chunk_size = cl_uint(buffer_chunk_size);
521  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
522 
524 
525  unsigned int thread_num = 256;
526  unsigned int group_num = 128;
527 
528  k.local_work_size(0, thread_num);
529  k.global_work_size(0, thread_num * group_num);
530 
// Elided condition - presumably the NVIDIA vendor check; TODO confirm.
532  {
533  k.local_work_size(0, 256);
534  k.global_work_size(0, 256*256);
535  }
536 
537  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
538  A.handle().opencl_handle(),
539  A.handle3().opencl_handle(),
540  A.handle4().opencl_handle(),
541  A.handle5().opencl_handle(),
542  cl_uint(A.internal_size1()),
543  cl_uint(A.ell_nnz()),
544  cl_uint(A.internal_ellnnz()),
545  viennacl::traits::opencl_handle(p),
546  viennacl::traits::opencl_handle(Ap),
547  r0star,
548  vec_size,
549  inner_prod_buffer, chunk_size, chunk_offset,
550  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
551  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT)),
552  viennacl::ocl::local_mem(k.local_work_size() * sizeof(NumericT))
553  )
554  );
555 }
556 
558 
// -- pipelined_gmres_normalize_vk ---------------------------------------------
// Vector normalization step for pipelined GMRES: normalizes v_k, writes the
// norm into R_buffer at R_offset, and records <r, v_k> contributions in
// r_dot_vk_buffer. Offsets into the shared reduction buffer are given by
// chunk_size/chunk_offset.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the final
// enqueue argument (original line 595, likely a local_mem buffer) are elided.
566 template <typename T>
568  vector_base<T> const & residual,
569  vector_base<T> & R_buffer,
570  vcl_size_t offset_in_R,
571  vector_base<T> const & inner_prod_buffer,
572  vector_base<T> & r_dot_vk_buffer,
573  vcl_size_t buffer_chunk_size,
574  vcl_size_t buffer_chunk_offset)
575 {
576  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(v_k).context());
578 
580 
581  k.local_work_size(0, 128);
582  k.global_work_size(0, 128*128);
583 
// v_k may be a sub-vector of the Krylov basis, hence the explicit start offset.
584  cl_uint size_vk = cl_uint(v_k.size());
585  cl_uint vk_offset = cl_uint(viennacl::traits::start(v_k));
586  cl_uint R_offset = cl_uint(offset_in_R);
587  cl_uint chunk_size = cl_uint(buffer_chunk_size);
588  cl_uint chunk_offset = cl_uint(buffer_chunk_offset);
589  viennacl::ocl::enqueue(k(v_k, vk_offset,
590  residual,
591  R_buffer, R_offset,
592  inner_prod_buffer, chunk_size,
593  r_dot_vk_buffer, chunk_offset,
594  size_vk,
596  ));
597 }
598 
// -- pipelined_gmres_gram_schmidt_stage1 --------------------------------------
// First Gram-Schmidt stage for pipelined GMRES: computes the inner products
// <v_i, v_k> for i < param_k against the basis stored row-wise in
// device_krylov_basis (logical row length v_k_size, padded to
// v_k_internal_size) and writes partial results into vi_in_vk_buffer in
// chunks of buffer_chunk_size.
// NOTE(review): Doxygen scrape - the kernel lookup declaring 'k' is elided.
599 template <typename T>
600 void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
601  vcl_size_t v_k_size,
602  vcl_size_t v_k_internal_size,
603  vcl_size_t param_k,
604  vector_base<T> & vi_in_vk_buffer,
605  vcl_size_t buffer_chunk_size)
606 {
607  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
609 
611 
612  k.local_work_size(0, 128);
613  k.global_work_size(0, 128*128);
614 
615  cl_uint size_vk = cl_uint(v_k_size);
616  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
617  cl_uint ocl_k = cl_uint(param_k);
618  cl_uint chunk_size = cl_uint(buffer_chunk_size);
619  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
620  vi_in_vk_buffer, chunk_size
621  ));
622 }
623 
// -- pipelined_gmres_gram_schmidt_stage2 --------------------------------------
// Second Gram-Schmidt stage for pipelined GMRES: consumes the partial inner
// products from stage 1 (vi_in_vk_buffer), orthogonalizes v_k in
// device_krylov_basis, updates column k of R_buffer (leading dimension
// ocl_krylov_dim) and refreshes inner_prod_buffer.
// NOTE(review): Doxygen scrape - signature head and kernel lookup are elided.
624 template <typename T>
626  vcl_size_t v_k_size,
627  vcl_size_t v_k_internal_size,
628  vcl_size_t param_k,
629  vector_base<T> const & vi_in_vk_buffer,
630  vector_base<T> & R_buffer,
631  vcl_size_t krylov_dim,
632  vector_base<T> & inner_prod_buffer,
633  vcl_size_t buffer_chunk_size)
634 {
635  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(device_krylov_basis).context());
637 
639 
640  k.local_work_size(0, 128);
641  k.global_work_size(0, 128*128);
642 
643  cl_uint size_vk = cl_uint(v_k_size);
644  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
645  cl_uint ocl_k = cl_uint(param_k);
646  cl_uint chunk_size = cl_uint(buffer_chunk_size);
647  cl_uint ocl_krylov_dim = cl_uint(krylov_dim);
648  viennacl::ocl::enqueue(k(device_krylov_basis, size_vk, internal_size_vk, ocl_k,
649  vi_in_vk_buffer, chunk_size,
650  R_buffer, ocl_krylov_dim,
651  inner_prod_buffer,
// Seven scratch slots per work item - presumably one per concurrently
// reduced quantity in the kernel; confirm against the kernel source.
652  viennacl::ocl::local_mem(7 * k.local_work_size() * sizeof(T))
653  ));
654 }
655 
// -- pipelined_gmres_update_result --------------------------------------------
// Final GMRES solution update: combines the residual and the first param_k
// Krylov basis vectors (row length v_k_size, padded to v_k_internal_size)
// weighted by 'coefficients' into 'result'.
// NOTE(review): Doxygen scrape - signature head and kernel lookup are elided.
656 template <typename T>
658  vector_base<T> const & residual,
659  vector_base<T> const & krylov_basis,
660  vcl_size_t v_k_size,
661  vcl_size_t v_k_internal_size,
662  vector_base<T> const & coefficients,
663  vcl_size_t param_k)
664 {
665  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(result).context());
667 
669 
670  k.local_work_size(0, 128);
671  k.global_work_size(0, 128*128);
672 
673  cl_uint size_vk = cl_uint(v_k_size);
674  cl_uint internal_size_vk = cl_uint(v_k_internal_size);
675  cl_uint ocl_k = cl_uint(param_k);
676  viennacl::ocl::enqueue(k(result,
677  residual,
678  krylov_basis, size_vk, internal_size_vk,
679  coefficients, ocl_k
680  ));
681 }
682 
683 
// -- pipelined_gmres_prod (compressed_matrix / CSR) ---------------------------
// Fused Ap = A * p for pipelined GMRES. Unlike the CG/BiCGStab variants this
// passes explicit start offsets (start_p/start_Ap) because p and Ap are
// sub-vectors of the Krylov basis storage.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the
// vendor-check condition are elided. Also note the NVIDIA branch uses
// 256*128 (not 256*256 as in the CG variant) - possibly intentional tuning,
// possibly a typo; worth confirming upstream.
685 template <typename T>
686  vector_base<T> const & p,
687  vector_base<T> & Ap,
688  vector_base<T> & inner_prod_buffer)
689 {
690  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
692 
694 
695  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
696  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
697  cl_uint start_p = cl_uint(viennacl::traits::start(p));
698  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
699 
700  k.local_work_size(0, 128);
701  k.global_work_size(0, 128*128);
702 
704  {
705  k.local_work_size(0, 256);
706  k.global_work_size(0, 256*128);
707  }
708 
709  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(), A.handle2().opencl_handle(), A.handle3().opencl_handle(), A.handle().opencl_handle(), cl_uint(A.blocks1()),
710  p, start_p,
711  Ap, start_Ap,
712  vec_size,
713  inner_prod_buffer,
714  buffer_size_per_vector,
715  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
716  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
717  viennacl::ocl::local_mem(1024 * sizeof(T))
718  ));
719 
720 }
721 
// -- pipelined_gmres_prod (coordinate_matrix / COO) ---------------------------
// Fused Ap = A * p for pipelined GMRES on a COO matrix, with explicit
// start offsets since p/Ap are sub-vectors of the Krylov basis storage.
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the last
// enqueue argument (original line 755, likely a second local_mem buffer) are
// elided from this listing.
722 template <typename T>
724  vector_base<T> const & p,
725  vector_base<T> & Ap,
726  vector_base<T> & inner_prod_buffer)
727 {
728  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
730 
731  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
732  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
733  cl_uint start_p = cl_uint(viennacl::traits::start(p));
734  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
735 
// Both cleared up-front - presumably the COO kernel accumulates into them.
736  Ap.clear();
737  inner_prod_buffer.clear();
738 
740  unsigned int thread_num = 128; //k.local_work_size(0);
741 
742  k.local_work_size(0, thread_num);
743 
744  k.global_work_size(0, 64 * thread_num); //64 work groups are hard-coded for now. Gives reasonable performance in most cases
745 
746  viennacl::ocl::enqueue(k(A.handle12().opencl_handle(), A.handle().opencl_handle(), A.handle3().opencl_handle(),
747  p, start_p,
748  Ap, start_Ap,
749  vec_size,
750  viennacl::ocl::local_mem(sizeof(cl_uint)*thread_num),
751  viennacl::ocl::local_mem(sizeof(T)*thread_num),
752  inner_prod_buffer,
753  buffer_size_per_vector,
754  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
756  ));
757 }
758 
// -- pipelined_gmres_prod (ell_matrix / ELLPACK) ------------------------------
// Fused Ap = A * p for pipelined GMRES on an ELL matrix, with explicit
// start offsets for the Krylov-basis sub-vectors. Local size is doubled on
// NVIDIA devices (vendor check is explicit in this overload).
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the last
// enqueue argument (original line 792, likely a second local_mem buffer) are
// elided from this listing.
759 template <typename T>
761  vector_base<T> const & p,
762  vector_base<T> & Ap,
763  vector_base<T> & inner_prod_buffer)
764 {
765  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
767 
768  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
769  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
770  cl_uint start_p = cl_uint(viennacl::traits::start(p));
771  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
772 
774 
775  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
776  unsigned int group_num = 128;
777 
778  k.local_work_size(0, thread_num);
779  k.global_work_size(0, thread_num * group_num);
780 
781  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
782  A.handle().opencl_handle(),
783  cl_uint(A.internal_size1()),
784  cl_uint(A.maxnnz()),
785  cl_uint(A.internal_maxnnz()),
786  viennacl::traits::opencl_handle(p), start_p,
787  viennacl::traits::opencl_handle(Ap), start_Ap,
788  vec_size,
789  inner_prod_buffer,
790  buffer_size_per_vector,
791  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
793  )
794  );
795 }
796 
// -- pipelined_gmres_prod (sliced_ell_matrix) ---------------------------------
// Fused Ap = A * p for pipelined GMRES on a sliced-ELL matrix, with explicit
// start offsets for the Krylov-basis sub-vectors. Local size is tied to the
// matrix layout (thread_num = A.rows_per_block()).
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the last
// enqueue argument (original line 829, likely a second local_mem buffer) are
// elided from this listing.
797 template <typename T>
799  vector_base<T> const & p,
800  vector_base<T> & Ap,
801  vector_base<T> & inner_prod_buffer)
802 {
803  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
805 
806  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
807  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
808  cl_uint start_p = cl_uint(viennacl::traits::start(p));
809  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
810 
812 
813  unsigned int thread_num = static_cast<unsigned int>(A.rows_per_block());
814  unsigned int group_num = 128;
815 
816  k.local_work_size(0, thread_num);
817  k.global_work_size(0, thread_num * group_num);
818 
819  viennacl::ocl::enqueue(k(A.handle1().opencl_handle(),
820  A.handle2().opencl_handle(),
821  A.handle3().opencl_handle(),
822  A.handle().opencl_handle(),
823  viennacl::traits::opencl_handle(p), start_p,
824  viennacl::traits::opencl_handle(Ap), start_Ap,
825  vec_size,
826  inner_prod_buffer,
827  buffer_size_per_vector,
828  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
830  )
831  );
832 }
833 
834 
// -- pipelined_gmres_prod (hyb_matrix: ELL + CSR) -----------------------------
// Fused Ap = A * p for pipelined GMRES on a hybrid matrix: handle2()/handle()
// carry the ELL part, handle3()/handle4()/handle5() the CSR remainder.
// Explicit start offsets are passed for the Krylov-basis sub-vectors; local
// size is doubled on NVIDIA devices (explicit vendor check).
// NOTE(review): Doxygen scrape - signature head, kernel lookup and the last
// enqueue argument (original line 872, likely a second local_mem buffer) are
// elided from this listing.
835 template <typename T>
837  vector_base<T> const & p,
838  vector_base<T> & Ap,
839  vector_base<T> & inner_prod_buffer)
840 {
841  viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
843 
844  cl_uint vec_size = cl_uint(viennacl::traits::size(p));
845  cl_uint buffer_size_per_vector = cl_uint(inner_prod_buffer.size()) / cl_uint(3);
846  cl_uint start_p = cl_uint(viennacl::traits::start(p));
847  cl_uint start_Ap = cl_uint(viennacl::traits::start(Ap));
848 
850 
851  unsigned int thread_num = (ctx.current_device().vendor_id() == viennacl::ocl::nvidia_id) ? 256 : 128;
852  unsigned int group_num = 128;
853 
854  k.local_work_size(0, thread_num);
855  k.global_work_size(0, thread_num * group_num);
856 
857 
858  viennacl::ocl::enqueue(k(A.handle2().opencl_handle(),
859  A.handle().opencl_handle(),
860  A.handle3().opencl_handle(),
861  A.handle4().opencl_handle(),
862  A.handle5().opencl_handle(),
863  cl_uint(A.internal_size1()),
864  cl_uint(A.ell_nnz()),
865  cl_uint(A.internal_ellnnz()),
866  viennacl::traits::opencl_handle(p), start_p,
867  viennacl::traits::opencl_handle(Ap), start_Ap,
868  vec_size,
869  inner_prod_buffer,
870  buffer_size_per_vector,
871  viennacl::ocl::local_mem(k.local_work_size() * sizeof(T)),
873  )
874  );
875 }
876 
877 
878 } //namespace opencl
879 } //namespace linalg
880 } //namespace viennacl
881 
882 
883 #endif
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Sparse matrix class using a hybrid format composed of the ELL and CSR formats for storing the nonzeros.
Definition: forwards.h:405
viennacl::ocl::device const & current_device() const
Returns the current device.
Definition: context.hpp:111
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers.
Definition: iterative.hpp:1364
handle_type & handle2()
Definition: ell_matrix.hpp:103
Represents an OpenCL device within ViennaCL.
void pipelined_bicgstab_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Generic size and resize functionality for different vector and matrix types.
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
Represents an OpenCL kernel within ViennaCL.
Definition: kernel.hpp:58
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
Various little tools used here and there in ViennaCL.
static void init(viennacl::ocl::context &ctx)
Definition: iterative.hpp:1371
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
size_type local_work_size(int index=0) const
Returns the local work size at the respective dimension.
Definition: kernel.hpp:742
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
void pipelined_gmres_gram_schmidt_stage2(vector_base< T > &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t param_k, vector_base< T > const &vi_in_vk_buffer, vector_base< T > &R_buffer, vcl_size_t krylov_dim, vector_base< T > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
This file provides the forward declarations for the main types used within ViennaCL.
Determines row and column increments for matrices and matrix proxies.
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
cl_uint vendor_id() const
A unique device vendor identifier. An example of a unique device identifier could be the PCIe ID...
Definition: device.hpp:917
vcl_size_t rows_per_block() const
void pipelined_gmres_normalize_vk(vector_base< T > &v_k, vector_base< T > const &residual, vector_base< T > &R_buffer, vcl_size_t offset_in_R, vector_base< T > const &inner_prod_buffer, vector_base< T > &r_dot_vk_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
Performs a vector normalization needed for an efficient pipelined GMRES algorithm.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
Common implementations shared by OpenCL-based operations.
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
A class representing local (shared) OpenCL memory. Typically used as kernel argument.
Definition: local_mem.hpp:33
OpenCL kernel file for specialized iterative solver kernels.
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
viennacl::ocl::kernel & get_kernel(std::string const &program_name, std::string const &kernel_name)
Convenience function for retrieving the kernel of a program directly from the context.
Definition: context.hpp:607
Sparse matrix class using the sliced ELLPACK format with parameters C and sigma.
Definition: forwards.h:402
Implementation of a smart-pointer-like class for handling OpenCL handles.
void pipelined_cg_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, NumericT beta, vector_base< NumericT > &inner_prod_buffer)
result_of::size_type< T >::type start(T const &obj)
Definition: start.hpp:44
void pipelined_bicgstab_vector_update(vector_base< NumericT > &result, NumericT alpha, vector_base< NumericT > &p, NumericT omega, vector_base< NumericT > const &s, vector_base< NumericT > &residual, vector_base< NumericT > const &As, NumericT beta, vector_base< NumericT > const &Ap, vector_base< NumericT > const &r0star, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size)
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
std::size_t vcl_size_t
Definition: forwards.h:74
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
void pipelined_gmres_gram_schmidt_stage1(vector_base< T > const &device_krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vcl_size_t param_k, vector_base< T > &vi_in_vk_buffer, vcl_size_t buffer_chunk_size)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
handle_type & handle()
Definition: ell_matrix.hpp:100
void pipelined_bicgstab_update_s(vector_base< NumericT > &s, vector_base< NumericT > &r, vector_base< NumericT > const &Ap, vector_base< NumericT > &inner_prod_buffer, vcl_size_t buffer_chunk_size, vcl_size_t buffer_chunk_offset)
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:861
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
void enqueue(KernelType &k, viennacl::ocl::command_queue const &queue)
Enqueues a kernel in the provided queue.
Definition: enqueue.hpp:50
Representation of an OpenCL kernel in ViennaCL.
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
size_type global_work_size(int index=0) const
Returns the global work size at the respective dimension.
Definition: kernel.hpp:751
void pipelined_cg_prod(compressed_matrix< NumericT > const &A, vector_base< NumericT > const &p, vector_base< NumericT > &Ap, vector_base< NumericT > &inner_prod_buffer)
Forward declarations of the implicit_vector_base, vector_base class.
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
void pipelined_gmres_update_result(vector_base< T > &result, vector_base< T > const &residual, vector_base< T > const &krylov_basis, vcl_size_t v_k_size, vcl_size_t v_k_internal_size, vector_base< T > const &coefficients, vcl_size_t param_k)
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
void pipelined_gmres_prod(compressed_matrix< T > const &A, vector_base< T > const &p, vector_base< T > &Ap, vector_base< T > &inner_prod_buffer)
Simple enable-if variant that uses the SFINAE pattern.
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices and val is the entry value.