ViennaCL - The Vienna Computing Library  1.6.2
iterative_operations.hpp
1 #ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
25 #include "viennacl/forwards.h"
26 #include "viennacl/scalar.hpp"
27 #include "viennacl/vector.hpp"
28 #include "viennacl/tools/tools.hpp"
29 #include "viennacl/linalg/cuda/common.hpp"
30 
31 namespace viennacl
32 {
33 namespace linalg
34 {
35 namespace cuda
36 {
37 
38 //
39 // CG vector update:
40 //
41 
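// The kernel below fuses the three vector updates of one pipelined CG iteration,
//   result <- result + alpha * p,   r <- r - alpha * Ap,   p <- r + beta * p,
// with the first reduction stage of <r, r>: each block writes its partial sum of <r, r>
// to inner_prod_buffer[blockIdx.x]; the remaining reduction over the per-block results
// is completed outside this kernel.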
42 // cpu scalar
43 template<typename NumericT>
44 __global__ void pipelined_cg_vector_kernel(NumericT * result,
45  NumericT alpha,
46  NumericT * p,
47  NumericT * r,
48  NumericT const * Ap,
49  NumericT beta,
50  NumericT * inner_prod_buffer,
51  unsigned int size)
52 {
53  NumericT inner_prod_contrib = 0;
54  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
55  {
56  NumericT value_p = p[i];
57  NumericT value_r = r[i];
58 
59  result[i] += alpha * value_p;
60  value_r -= alpha * Ap[i];
61  value_p = value_r + beta * value_p;
62 
63  p[i] = value_p;
64  r[i] = value_r;
65  inner_prod_contrib += value_r * value_r;
66  }
67 
68  // parallel reduction in work group
69  __shared__ NumericT shared_array[256];
70  shared_array[threadIdx.x] = inner_prod_contrib;
71  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
72  {
73  __syncthreads();
74  if (threadIdx.x < stride)
75  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
76  }
77 
78  // write results to result array
79  if (threadIdx.x == 0)
80  inner_prod_buffer[blockIdx.x] = shared_array[0];
81 }
82 
83 
84 template<typename NumericT>
85 void pipelined_cg_vector_update(vector_base<NumericT> & result,
86  NumericT alpha,
87  vector_base<NumericT> & p,
88  vector_base<NumericT> & r,
89  vector_base<NumericT> const & Ap,
90  NumericT beta,
91  vector_base<NumericT> & inner_prod_buffer)
92 {
93  unsigned int size = result.size();
94  pipelined_cg_vector_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(result),
95  alpha,
96  detail::cuda_arg<NumericT>(p),
97  detail::cuda_arg<NumericT>(r),
98  detail::cuda_arg<NumericT>(Ap),
99  beta,
100  detail::cuda_arg<NumericT>(inner_prod_buffer),
101  size);
102  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
103 }
104 
105 
106 
107 
108 //
109 // Compressed matrix
110 //
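// The CSR kernel below follows the adaptive row_blocks decomposition of compressed_matrix:
// a block covering several rows runs in "CSR stream" mode (products staged in shared memory,
// one thread summing each row), while a block holding a single heavy row falls back to
// "CSR vector" mode (all threads of the block reduce that one row). Besides Ap = A * p,
// each block accumulates partial sums of <Ap, Ap> and <p, Ap> and stores them at offsets
// buffer_size and 2 * buffer_size of inner_prod_buffer.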
111 
112 
113 template<typename NumericT>
114 __global__ void pipelined_cg_csr_vec_mul_kernel(
115  const unsigned int * row_indices,
116  const unsigned int * column_indices,
117  const unsigned int * row_blocks,
118  const NumericT * elements,
119  unsigned int num_blocks,
120  const NumericT * p,
121  NumericT * Ap,
122  unsigned int size,
123  NumericT * inner_prod_buffer,
124  unsigned int buffer_size)
125 {
126  NumericT inner_prod_ApAp = 0;
127  NumericT inner_prod_pAp = 0;
128 
129  __shared__ NumericT shared_elements[1024];
130 
131  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
132  {
133  unsigned int row_start = row_blocks[block_id];
134  unsigned int row_stop = row_blocks[block_id + 1];
135  unsigned int element_start = row_indices[row_start];
136  unsigned int element_stop = row_indices[row_stop];
137  unsigned int rows_to_process = row_stop - row_start;
138 
139  if (rows_to_process > 1) // CSR stream with one thread per row
140  {
141  // load to shared buffer:
142  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
143  shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
144 
145  __syncthreads();
146 
147  // use one thread per row to sum:
148  for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
149  {
150  NumericT dot_prod = 0;
151  unsigned int thread_row_start = row_indices[row] - element_start;
152  unsigned int thread_row_stop = row_indices[row + 1] - element_start;
153  for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
154  dot_prod += shared_elements[i];
155  Ap[row] = dot_prod;
156  inner_prod_ApAp += dot_prod * dot_prod;
157  inner_prod_pAp += p[row] * dot_prod;
158  }
159  }
160  // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
161  else // CSR vector for a single row
162  {
163  // load and sum to shared buffer:
164  shared_elements[threadIdx.x] = 0;
165  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
166  shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
167 
168  // reduction to obtain final result
169  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
170  {
171  __syncthreads();
172  if (threadIdx.x < stride)
173  shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
174  }
175 
176  if (threadIdx.x == 0)
177  {
178  Ap[row_start] = shared_elements[0];
179  inner_prod_ApAp += shared_elements[0] * shared_elements[0];
180  inner_prod_pAp += p[row_start] * shared_elements[0];
181  }
182  }
183 
184  __syncthreads(); // avoid race conditions
185  }
186 
188  __shared__ NumericT shared_array_ApAp[256];
189  __shared__ NumericT shared_array_pAp[256];
190  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
191  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
192  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
193  {
194  __syncthreads();
195  if (threadIdx.x < stride)
196  {
197  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
198  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
199  }
200  }
201 
202  // write results to result array
203  if (threadIdx.x == 0) {
204  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
205  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
206  }
207 }
208 
209 
210 
211 
212 template<typename NumericT>
213 void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
214  vector_base<NumericT> const & p,
215  vector_base<NumericT> & Ap,
216  vector_base<NumericT> & inner_prod_buffer)
217 {
218  unsigned int size = p.size();
219  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
220 
221  pipelined_cg_csr_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
222  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
223  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
224  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
225  static_cast<unsigned int>(A.blocks1()),
226  detail::cuda_arg<NumericT>(p),
227  detail::cuda_arg<NumericT>(Ap),
228  size,
229  detail::cuda_arg<NumericT>(inner_prod_buffer),
230  buffer_size_per_vector);
231  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
232 }
233 
234 
235 //
236 // Coordinate Matrix
237 //
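// The COO kernel below processes one group of nonzeros per block (delimited by
// group_boundaries) and sums the products elements[i] * p[column_index] with a segmented
// parallel reduction keyed on the row index; a carry is handed over between chunks so that
// rows spanning several chunks are accumulated correctly. Partial sums of <Ap, Ap> and
// <p, Ap> are written to inner_prod_buffer as in the CSR case.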
238 
239 
240 template<typename NumericT>
241 __global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
242  const NumericT * elements,
243  const unsigned int * group_boundaries,
244  const NumericT * p,
245  NumericT * Ap,
246  unsigned int size,
247  NumericT * inner_prod_buffer,
248  unsigned int buffer_size)
249 {
250  NumericT inner_prod_ApAp = 0;
251  NumericT inner_prod_pAp = 0;
252  __shared__ unsigned int shared_rows[128];
253  __shared__ NumericT inter_results[128];
254 
255  uint2 tmp;
256  NumericT val;
257  unsigned int group_start = group_boundaries[blockIdx.x];
258  unsigned int group_end = group_boundaries[blockIdx.x + 1];
259  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
260 
261  unsigned int local_index = 0;
262 
263  for (unsigned int k = 0; k < k_end; ++k)
264  {
265  local_index = group_start + k * blockDim.x + threadIdx.x;
266 
267  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
268  val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
269 
270  //check for carry from previous loop run:
271  if (threadIdx.x == 0 && k > 0)
272  {
273  if (tmp.x == shared_rows[blockDim.x-1])
274  val += inter_results[blockDim.x-1];
275  else
276  {
277  NumericT Ap_entry = inter_results[blockDim.x-1];
278  Ap[shared_rows[blockDim.x-1]] = Ap_entry;
279  inner_prod_ApAp += Ap_entry * Ap_entry;
280  inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
281  }
282  }
283 
284  //segmented parallel reduction begin
285  __syncthreads();
286  shared_rows[threadIdx.x] = tmp.x;
287  inter_results[threadIdx.x] = val;
288  NumericT left = 0;
289  __syncthreads();
290 
291  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
292  {
293  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
294  __syncthreads();
295  inter_results[threadIdx.x] += left;
296  __syncthreads();
297  }
298  //segmented parallel reduction end
299 
300  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
301  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
302  {
303  NumericT Ap_entry = inter_results[threadIdx.x];
304  Ap[tmp.x] = Ap_entry;
305  inner_prod_ApAp += Ap_entry * Ap_entry;
306  inner_prod_pAp += Ap_entry * p[tmp.x];
307  }
308 
309  __syncthreads();
310  } //for k
311 
312  if (local_index + 1 == group_end)
313  {
314  NumericT Ap_entry = inter_results[threadIdx.x];
315  Ap[tmp.x] = Ap_entry;
316  inner_prod_ApAp += Ap_entry * Ap_entry;
317  inner_prod_pAp += Ap_entry * p[tmp.x];
318  }
319 
321  __shared__ NumericT shared_array_ApAp[256];
322  __shared__ NumericT shared_array_pAp[256];
323  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
324  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
325  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
326  {
327  __syncthreads();
328  if (threadIdx.x < stride)
329  {
330  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
331  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
332  }
333  }
334 
335  // write results to result array
336  if (threadIdx.x == 0) {
337  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
338  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
339  }
340 
341 }
342 
343 
344 template<typename NumericT>
345 void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
346  vector_base<NumericT> const & p,
347  vector_base<NumericT> & Ap,
348  vector_base<NumericT> & inner_prod_buffer)
349 {
350  unsigned int size = p.size();
351  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
352 
353  Ap.clear();
354 
355  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
356  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
357  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
358  detail::cuda_arg<NumericT>(p),
359  detail::cuda_arg<NumericT>(Ap),
360  size,
361  detail::cuda_arg<NumericT>(inner_prod_buffer),
362  buffer_size_per_vector);
363  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
364 }
365 
366 
367 
368 //
369 // ELL Matrix
370 //
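// ELL format: each row stores at most items_per_row entries, laid out column-major with
// stride internal_row_num. The kernel below walks the rows in a grid-stride loop and skips
// zero padding entries via the 'val ?' check before accumulating <Ap, Ap> and <p, Ap>.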
371 
372 template<typename NumericT>
373 __global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
374  const NumericT * elements,
375  unsigned int internal_row_num,
376  unsigned int items_per_row,
377  const NumericT * p,
378  NumericT * Ap,
379  unsigned int size,
380  NumericT * inner_prod_buffer,
381  unsigned int buffer_size)
382 {
383  NumericT inner_prod_ApAp = 0;
384  NumericT inner_prod_pAp = 0;
385  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
386  unsigned int glb_sz = gridDim.x * blockDim.x;
387 
388  for (unsigned int row = glb_id; row < size; row += glb_sz)
389  {
390  NumericT sum = 0;
391 
392  unsigned int offset = row;
393  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
394  {
395  NumericT val = elements[offset];
396  sum += val ? p[coords[offset]] * val : NumericT(0);
397  }
398 
399  Ap[row] = sum;
400  inner_prod_ApAp += sum * sum;
401  inner_prod_pAp += sum * p[row];
402  }
403 
405  __shared__ NumericT shared_array_ApAp[256];
406  __shared__ NumericT shared_array_pAp[256];
407  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
408  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
409  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
410  {
411  __syncthreads();
412  if (threadIdx.x < stride)
413  {
414  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
415  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
416  }
417  }
418 
419  // write results to result array
420  if (threadIdx.x == 0) {
421  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
422  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
423  }
424 }
425 
426 
427 template<typename NumericT>
428 void pipelined_cg_prod(ell_matrix<NumericT> const & A,
429  vector_base<NumericT> const & p,
430  vector_base<NumericT> & Ap,
431  vector_base<NumericT> & inner_prod_buffer)
432 {
433  unsigned int size = p.size();
434  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
435 
436  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
437  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
438  static_cast<unsigned int>(A.internal_size1()),
439  static_cast<unsigned int>(A.maxnnz()),
440  detail::cuda_arg<NumericT>(p),
441  detail::cuda_arg<NumericT>(Ap),
442  size,
443  detail::cuda_arg<NumericT>(inner_prod_buffer),
444  buffer_size_per_vector);
445  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
446 }
447 
448 
449 //
450 // SELL-C-\sigma Matrix
451 //
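// Sliced ELL: rows are grouped into slices of blockDim.x rows; columns_per_block holds the
// padded number of entries per row in each slice and block_start its offset into 'elements'.
// One CUDA block processes one slice per iteration of the outer loop below.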
452 
453 template<typename NumericT>
454 __global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
455  const unsigned int * column_indices,
456  const unsigned int * block_start,
457  const NumericT * elements,
458  const NumericT * p,
459  NumericT * Ap,
460  unsigned int size,
461  NumericT * inner_prod_buffer,
462  unsigned int buffer_size)
463 {
464  NumericT inner_prod_ApAp = 0;
465  NumericT inner_prod_pAp = 0;
466  unsigned int local_id = threadIdx.x;
467  unsigned int local_size = blockDim.x;
468 
469  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
470  {
471  unsigned int row = block_idx * local_size + local_id;
472  unsigned int offset = block_start[block_idx];
473  unsigned int num_columns = columns_per_block[block_idx];
474 
475  NumericT sum = 0;
476  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
477  {
478  unsigned int index = offset + item_id * local_size + local_id;
479  NumericT val = elements[index];
480 
481  sum += val ? (p[column_indices[index]] * val) : 0;
482  }
483 
484  if (row < size)
485  {
486  Ap[row] = sum;
487  inner_prod_ApAp += sum * sum;
488  inner_prod_pAp += sum * p[row];
489  }
490  }
491 
493  __shared__ NumericT shared_array_ApAp[256];
494  __shared__ NumericT shared_array_pAp[256];
495  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
496  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
497  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
498  {
499  __syncthreads();
500  if (threadIdx.x < stride)
501  {
502  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
503  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
504  }
505  }
506 
507  // write results to result array
508  if (threadIdx.x == 0) {
509  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
510  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
511  }
512 }
513 
514 template<typename NumericT>
515 void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
516  vector_base<NumericT> const & p,
517  vector_base<NumericT> & Ap,
518  vector_base<NumericT> & inner_prod_buffer)
519 {
520  unsigned int size = p.size();
521  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
522 
523  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, A.rows_per_block()>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
524  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
525  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
526  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
527  detail::cuda_arg<NumericT>(p),
528  detail::cuda_arg<NumericT>(Ap),
529  size,
530  detail::cuda_arg<NumericT>(inner_prod_buffer),
531  buffer_size_per_vector);
532  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
533 }
534 
535 
536 //
537 // Hybrid Matrix
538 //
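// Hybrid format: the regular part of each row is stored in ELL layout (ell_coords,
// ell_elements) and the overflow entries in CSR layout (csr_rows, csr_cols, csr_elements);
// the kernel below adds both contributions to Ap[row] before accumulating the inner products.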
539 
540 
541 template<typename NumericT>
542 __global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
543  const NumericT * ell_elements,
544  const unsigned int * csr_rows,
545  const unsigned int * csr_cols,
546  const NumericT * csr_elements,
547  unsigned int internal_row_num,
548  unsigned int items_per_row,
549  const NumericT * p,
550  NumericT * Ap,
551  unsigned int size,
552  NumericT * inner_prod_buffer,
553  unsigned int buffer_size)
554 {
555  NumericT inner_prod_ApAp = 0;
556  NumericT inner_prod_pAp = 0;
557  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
558  unsigned int glb_sz = gridDim.x * blockDim.x;
559 
560  for (unsigned int row = glb_id; row < size; row += glb_sz)
561  {
562  NumericT sum = 0;
563 
564  unsigned int offset = row;
565  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
566  {
567  NumericT val = ell_elements[offset];
568 
569  sum += val ? p[ell_coords[offset]] * val : NumericT(0);
570  }
571 
572  unsigned int col_begin = csr_rows[row];
573  unsigned int col_end = csr_rows[row + 1];
574 
575  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
576  {
577  sum += p[csr_cols[item_id]] * csr_elements[item_id];
578  }
579 
580  Ap[row] = sum;
581  inner_prod_ApAp += sum * sum;
582  inner_prod_pAp += sum * p[row];
583  }
584 
586  __shared__ NumericT shared_array_ApAp[256];
587  __shared__ NumericT shared_array_pAp[256];
588  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
589  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
590  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
591  {
592  __syncthreads();
593  if (threadIdx.x < stride)
594  {
595  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
596  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
597  }
598  }
599 
600  // write results to result array
601  if (threadIdx.x == 0) {
602  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
603  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
604  }
605 }
606 
607 
608 
609 template<typename NumericT>
610 void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
611  vector_base<NumericT> const & p,
612  vector_base<NumericT> & Ap,
613  vector_base<NumericT> & inner_prod_buffer)
614 {
615  unsigned int size = p.size();
616  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
617 
618  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
619  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
620  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
621  detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
622  detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
623  static_cast<unsigned int>(A.internal_size1()),
624  static_cast<unsigned int>(A.ell_nnz()),
625  detail::cuda_arg<NumericT>(p),
626  detail::cuda_arg<NumericT>(Ap),
627  size,
628  detail::cuda_arg<NumericT>(inner_prod_buffer),
629  buffer_size_per_vector);
630  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
631 }
632 
633 
634 
636 
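//
// Pipelined BiCGStab
//
// The kernel below first completes the reduction of the partial sums held in inner_prod_buffer
// (<r, r0*> in the first chunk, <Ap, r0*> in the chunk starting at 3 * chunk_size) to obtain
// alpha = <r, r0*> / <Ap, r0*>, then updates s = r - alpha * Ap and accumulates the first
// reduction stage of <s, s>, written to the chunk starting at chunk_offset.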
637 template<typename NumericT>
638 __global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
639  NumericT const * residual,
640  NumericT const * Ap,
641  unsigned int size,
642  NumericT * inner_prod_buffer,
643  unsigned int chunk_size,
644  unsigned int chunk_offset)
645 {
646  NumericT alpha = 0;
647 
648  // parallel reduction in work group to compute <r, r0> / <Ap, r0>
649  __shared__ NumericT shared_array[256];
650  __shared__ NumericT shared_array_Ap_in_r0[256];
651 
652  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
653  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
654  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
655  {
656  __syncthreads();
657  if (threadIdx.x < stride) {
658  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
659  shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
660  }
661  }
662 
663  // compute alpha from reduced values:
664  __syncthreads();
665  alpha = shared_array[0] / shared_array_Ap_in_r0[0];
666 
667  // run vector update and compute first stage of <s, s>
668  NumericT inner_prod_contrib = 0;
669  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
670  {
671  NumericT value_s = s[i];
672 
673  value_s = residual[i] - alpha * Ap[i];
674  inner_prod_contrib += value_s * value_s;
675 
676  s[i] = value_s;
677  }
678  __syncthreads();
679 
680  // parallel reduction in work group
681  shared_array[threadIdx.x] = inner_prod_contrib;
682  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
683  {
684  __syncthreads();
685  if (threadIdx.x < stride)
686  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
687  }
688 
689  // write results to inner_prod_buffer
690  if (threadIdx.x == 0)
691  inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
692 }
693 
694 template<typename NumericT>
695 void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
696  vector_base<NumericT> & r,
697  vector_base<NumericT> const & Ap,
698  vector_base<NumericT> & inner_prod_buffer,
699  vcl_size_t buffer_chunk_size,
700  vcl_size_t buffer_chunk_offset)
701 {
702  unsigned int size = static_cast<unsigned int>(s.size());
703  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
704  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
705 
706  pipelined_bicgstab_update_s_kernel<<<256, 256>>>(detail::cuda_arg<NumericT>(s),
707  detail::cuda_arg<NumericT>(r),
708  detail::cuda_arg<NumericT>(Ap),
709  size,
710  detail::cuda_arg<NumericT>(inner_prod_buffer),
711  chunk_size,
712  chunk_offset);
713  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
714 }
715 
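// The kernel below fuses the remaining BiCGStab vector updates of one iteration,
//   result <- result + alpha * p + omega * s,   r <- s - omega * As,
//   p <- r + beta * (p - omega * Ap),
// and accumulates the first reduction stage of <r, r0*> into inner_prod_buffer[blockIdx.x].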
716 template<typename NumericT>
717 __global__ void pipelined_bicgstab_vector_kernel(NumericT * result,
718  NumericT alpha,
719  NumericT * p,
720  NumericT omega,
721  NumericT const * s,
722  NumericT * residual,
723  NumericT const * As,
724  NumericT beta,
725  NumericT const * Ap,
726  NumericT const * r0star,
727  NumericT * inner_prod_buffer,
728  unsigned int size)
729 {
730  NumericT inner_prod_r_r0star = 0;
731  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
732  {
733  NumericT value_result = result[i];
734  NumericT value_p = p[i];
735  NumericT value_s = s[i];
736  NumericT value_residual = residual[i];
737  NumericT value_As = As[i];
738  NumericT value_Ap = Ap[i];
739  NumericT value_r0star = r0star[i];
740 
741  value_result += alpha * value_p + omega * value_s;
742  value_residual = value_s - omega * value_As;
743  value_p = value_residual + beta * (value_p - omega * value_Ap);
744 
745  result[i] = value_result;
746  residual[i] = value_residual;
747  p[i] = value_p;
748  inner_prod_r_r0star += value_residual * value_r0star;
749  }
750 
751  // parallel reduction in work group
752  __shared__ NumericT shared_array[256];
753  shared_array[threadIdx.x] = inner_prod_r_r0star;
754  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
755  {
756  __syncthreads();
757  if (threadIdx.x < stride)
758  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
759  }
760 
761  // write results to result array
762  if (threadIdx.x == 0)
763  inner_prod_buffer[blockIdx.x] = shared_array[0];
764 }
765 
766 
767 template<typename NumericT>
768 void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha, vector_base<NumericT> & p, NumericT omega, vector_base<NumericT> const & s,
769  vector_base<NumericT> & residual, vector_base<NumericT> const & As,
770  NumericT beta, vector_base<NumericT> const & Ap,
771  vector_base<NumericT> const & r0star,
772  vector_base<NumericT> & inner_prod_buffer, vcl_size_t buffer_chunk_size)
773 {
774  (void)buffer_chunk_size;
775  unsigned int size = static_cast<unsigned int>(result.size());
776 
777  pipelined_bicgstab_vector_kernel<<<256, 256>>>(detail::cuda_arg<NumericT>(result),
778  alpha,
779  detail::cuda_arg<NumericT>(p),
780  omega,
781  detail::cuda_arg<NumericT>(s),
782  detail::cuda_arg<NumericT>(residual),
783  detail::cuda_arg<NumericT>(As),
784  beta,
785  detail::cuda_arg<NumericT>(Ap),
786  detail::cuda_arg<NumericT>(r0star),
787  detail::cuda_arg<NumericT>(inner_prod_buffer),
788  size);
789  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
790 }
791 
792 
793 
794 //
795 // Compressed matrix
796 //
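// The BiCGStab matrix-vector kernels below mirror their CG counterparts; the only difference
// is a third partial reduction, <r0*, Ap>, whose per-block results are written to the chunk
// of inner_prod_buffer starting at buffer_offset.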
797 
798 
799 template<typename NumericT>
800 __global__ void pipelined_bicgstab_csr_vec_mul_kernel(
801  const unsigned int * row_indices,
802  const unsigned int * column_indices,
803  const unsigned int * row_blocks,
804  const NumericT * elements,
805  unsigned int num_blocks,
806  const NumericT * p,
807  NumericT * Ap,
808  const NumericT * r0star,
809  unsigned int size,
810  NumericT * inner_prod_buffer,
811  unsigned int buffer_size,
812  unsigned int buffer_offset)
813 {
814  NumericT inner_prod_ApAp = 0;
815  NumericT inner_prod_pAp = 0;
816  NumericT inner_prod_r0Ap = 0;
817 
818  __shared__ NumericT shared_elements[1024];
819 
820  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
821  {
822  unsigned int row_start = row_blocks[block_id];
823  unsigned int row_stop = row_blocks[block_id + 1];
824  unsigned int element_start = row_indices[row_start];
825  unsigned int element_stop = row_indices[row_stop];
826  unsigned int rows_to_process = row_stop - row_start;
827 
828  if (rows_to_process > 1) // CSR stream with one thread per row
829  {
830  // load to shared buffer:
831  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
832  shared_elements[i - element_start] = elements[i] * p[column_indices[i]];
833 
834  __syncthreads();
835 
836  // use one thread per row to sum:
837  for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
838  {
839  NumericT dot_prod = 0;
840  unsigned int thread_row_start = row_indices[row] - element_start;
841  unsigned int thread_row_stop = row_indices[row + 1] - element_start;
842  for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
843  dot_prod += shared_elements[i];
844  Ap[row] = dot_prod;
845  inner_prod_ApAp += dot_prod * dot_prod;
846  inner_prod_pAp += p[row] * dot_prod;
847  inner_prod_r0Ap += r0star[row] * dot_prod;
848  }
849  }
850  // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
851  else // CSR vector for a single row
852  {
853  // load and sum to shared buffer:
854  shared_elements[threadIdx.x] = 0;
855  for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
856  shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];
857 
858  // reduction to obtain final result
859  for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
860  {
861  __syncthreads();
862  if (threadIdx.x < stride)
863  shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
864  }
865 
866  if (threadIdx.x == 0)
867  {
868  Ap[row_start] = shared_elements[0];
869  inner_prod_ApAp += shared_elements[0] * shared_elements[0];
870  inner_prod_pAp += p[row_start] * shared_elements[0];
871  inner_prod_r0Ap += r0star[row_start] * shared_elements[0];
872  }
873  }
874 
875  __syncthreads(); // avoid race conditions
876  }
877 
879  __shared__ NumericT shared_array_ApAp[256];
880  __shared__ NumericT shared_array_pAp[256];
881  __shared__ NumericT shared_array_r0Ap[256];
882  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
883  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
884  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
885  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
886  {
887  __syncthreads();
888  if (threadIdx.x < stride)
889  {
890  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
891  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
892  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
893  }
894  }
895 
896  // write results to result array
897  if (threadIdx.x == 0) {
898  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
899  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
900  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
901  }
902 }
903 
904 
905 
906 
907 template<typename NumericT>
908 void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
909  vector_base<NumericT> const & p,
910  vector_base<NumericT> & Ap,
911  vector_base<NumericT> const & r0star,
912  vector_base<NumericT> & inner_prod_buffer,
913  vcl_size_t buffer_chunk_size,
914  vcl_size_t buffer_chunk_offset)
915 {
916  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
917  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
918  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
919 
920  pipelined_bicgstab_csr_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
921  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
922  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
923  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
924  static_cast<unsigned int>(A.blocks1()),
925  detail::cuda_arg<NumericT>(p),
926  detail::cuda_arg<NumericT>(Ap),
927  detail::cuda_arg<NumericT>(r0star),
928  vec_size,
929  detail::cuda_arg<NumericT>(inner_prod_buffer),
930  chunk_size,
931  chunk_offset);
932  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_kernel");
933 }
934 
935 
936 //
937 // Coordinate Matrix
938 //
939 
940 
941 template<typename NumericT>
942 __global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
943  const NumericT * elements,
944  const unsigned int * group_boundaries,
945  const NumericT * p,
946  NumericT * Ap,
947  const NumericT * r0star,
948  unsigned int size,
949  NumericT * inner_prod_buffer,
950  unsigned int buffer_size,
951  unsigned int buffer_offset)
952 {
953  NumericT inner_prod_ApAp = 0;
954  NumericT inner_prod_pAp = 0;
955  NumericT inner_prod_r0Ap = 0;
956  __shared__ unsigned int shared_rows[128];
957  __shared__ NumericT inter_results[128];
958 
959  uint2 tmp;
960  NumericT val;
961  unsigned int group_start = group_boundaries[blockIdx.x];
962  unsigned int group_end = group_boundaries[blockIdx.x + 1];
963  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0; // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x
964 
965  unsigned int local_index = 0;
966 
967  for (unsigned int k = 0; k < k_end; ++k)
968  {
969  local_index = group_start + k * blockDim.x + threadIdx.x;
970 
971  tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
972  val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;
973 
974  //check for carry from previous loop run:
975  if (threadIdx.x == 0 && k > 0)
976  {
977  if (tmp.x == shared_rows[blockDim.x-1])
978  val += inter_results[blockDim.x-1];
979  else
980  {
981  NumericT Ap_entry = inter_results[blockDim.x-1];
982  Ap[shared_rows[blockDim.x-1]] = Ap_entry;
983  inner_prod_ApAp += Ap_entry * Ap_entry;
984  inner_prod_pAp += Ap_entry * p[shared_rows[blockDim.x-1]];
985  inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
986  }
987  }
988 
989  //segmented parallel reduction begin
990  __syncthreads();
991  shared_rows[threadIdx.x] = tmp.x;
992  inter_results[threadIdx.x] = val;
993  NumericT left = 0;
994  __syncthreads();
995 
996  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
997  {
998  left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
999  __syncthreads();
1000  inter_results[threadIdx.x] += left;
1001  __syncthreads();
1002  }
1003  //segmented parallel reduction end
1004 
1005  if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1006  shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1007  {
1008  NumericT Ap_entry = inter_results[threadIdx.x];
1009  Ap[tmp.x] = Ap_entry;
1010  inner_prod_ApAp += Ap_entry * Ap_entry;
1011  inner_prod_pAp += Ap_entry * p[tmp.x];
1012  inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
1013  }
1014 
1015  __syncthreads();
1016  } //for k
1017 
1018  if (local_index + 1 == group_end)
1019  {
1020  NumericT Ap_entry = inter_results[threadIdx.x];
1021  Ap[tmp.x] = Ap_entry;
1022  inner_prod_ApAp += Ap_entry * Ap_entry;
1023  inner_prod_pAp += Ap_entry * p[tmp.x];
1024  inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
1025  }
1026 
1028  __shared__ NumericT shared_array_ApAp[256];
1029  __shared__ NumericT shared_array_pAp[256];
1030  __shared__ NumericT shared_array_r0Ap[256];
1031  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1032  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1033  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1034  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1035  {
1036  __syncthreads();
1037  if (threadIdx.x < stride)
1038  {
1039  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1040  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1041  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1042  }
1043  }
1044 
1045  // write results to result array
1046  if (threadIdx.x == 0) {
1047  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1048  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1049  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1050  }
1051 
1052 }
1053 
1054 
1055 template<typename NumericT>
1056 void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
1057  vector_base<NumericT> const & p,
1058  vector_base<NumericT> & Ap,
1059  vector_base<NumericT> const & r0star,
1060  vector_base<NumericT> & inner_prod_buffer,
1061  vcl_size_t buffer_chunk_size,
1062  vcl_size_t buffer_chunk_offset)
1063 {
1064  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1065  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1066  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1067 
1068  Ap.clear();
1069 
1070  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
1071  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1072  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1073  detail::cuda_arg<NumericT>(p),
1074  detail::cuda_arg<NumericT>(Ap),
1075  detail::cuda_arg<NumericT>(r0star),
1076  vec_size,
1077  detail::cuda_arg<NumericT>(inner_prod_buffer),
1078  chunk_size,
1079  chunk_offset);
1080  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
1081 }
1082 
1083 
1084 
1085 //
1086 // ELL Matrix
1087 //
1088 
1089 template<typename NumericT>
1090 __global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
1091  const NumericT * elements,
1092  unsigned int internal_row_num,
1093  unsigned int items_per_row,
1094  const NumericT * p,
1095  NumericT * Ap,
1096  const NumericT * r0star,
1097  unsigned int size,
1098  NumericT * inner_prod_buffer,
1099  unsigned int buffer_size,
1100  unsigned int buffer_offset)
1101 {
1102  NumericT inner_prod_ApAp = 0;
1103  NumericT inner_prod_pAp = 0;
1104  NumericT inner_prod_r0Ap = 0;
1105  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1106  unsigned int glb_sz = gridDim.x * blockDim.x;
1107 
1108  for (unsigned int row = glb_id; row < size; row += glb_sz)
1109  {
1110  NumericT sum = 0;
1111 
1112  unsigned int offset = row;
1113  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1114  {
1115  NumericT val = elements[offset];
1116  sum += val ? p[coords[offset]] * val : NumericT(0);
1117  }
1118 
1119  Ap[row] = sum;
1120  inner_prod_ApAp += sum * sum;
1121  inner_prod_pAp += sum * p[row];
1122  inner_prod_r0Ap += sum * r0star[row];
1123  }
1124 
1126  __shared__ NumericT shared_array_ApAp[256];
1127  __shared__ NumericT shared_array_pAp[256];
1128  __shared__ NumericT shared_array_r0Ap[256];
1129  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1130  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1131  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1132  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1133  {
1134  __syncthreads();
1135  if (threadIdx.x < stride)
1136  {
1137  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1138  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1139  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1140  }
1141  }
1142 
1143  // write results to result array
1144  if (threadIdx.x == 0) {
1145  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1146  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1147  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1148  }
1149 }
1150 
1151 
1152 template<typename NumericT>
1153 void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
1154  vector_base<NumericT> const & p,
1155  vector_base<NumericT> & Ap,
1156  vector_base<NumericT> const & r0star,
1157  vector_base<NumericT> & inner_prod_buffer,
1158  vcl_size_t buffer_chunk_size,
1159  vcl_size_t buffer_chunk_offset)
1160 {
1161  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1162  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1163  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1164 
1165  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1166  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1167  static_cast<unsigned int>(A.internal_size1()),
1168  static_cast<unsigned int>(A.maxnnz()),
1169  detail::cuda_arg<NumericT>(p),
1170  detail::cuda_arg<NumericT>(Ap),
1171  detail::cuda_arg<NumericT>(r0star),
1172  vec_size,
1173  detail::cuda_arg<NumericT>(inner_prod_buffer),
1174  chunk_size,
1175  chunk_offset);
1176  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
1177 }
1178 
1179 
1180 //
1181 // SELL-C-\sigma Matrix
1182 //
1183 
1184 template<typename NumericT>
1185 __global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
1186  const unsigned int * column_indices,
1187  const unsigned int * block_start,
1188  const NumericT * elements,
1189  const NumericT * p,
1190  NumericT * Ap,
1191  const NumericT * r0star,
1192  unsigned int size,
1193  NumericT * inner_prod_buffer,
1194  unsigned int buffer_size,
1195  unsigned int buffer_offset)
1196 {
1197  NumericT inner_prod_ApAp = 0;
1198  NumericT inner_prod_pAp = 0;
1199  NumericT inner_prod_r0Ap = 0;
1200  unsigned int local_id = threadIdx.x;
1201  unsigned int local_size = blockDim.x;
1202 
1203  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
1204  {
1205  unsigned int row = block_idx * local_size + local_id;
1206  unsigned int offset = block_start[block_idx];
1207  unsigned int num_columns = columns_per_block[block_idx];
1208 
1209  NumericT sum = 0;
1210  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
1211  {
1212  unsigned int index = offset + item_id * local_size + local_id;
1213  NumericT val = elements[index];
1214 
1215  sum += val ? (p[column_indices[index]] * val) : 0;
1216  }
1217 
1218  if (row < size)
1219  {
1220  Ap[row] = sum;
1221  inner_prod_ApAp += sum * sum;
1222  inner_prod_pAp += sum * p[row];
1223  inner_prod_r0Ap += sum * r0star[row];
1224  }
1225  }
1226 
1228  __shared__ NumericT shared_array_ApAp[256];
1229  __shared__ NumericT shared_array_pAp[256];
1230  __shared__ NumericT shared_array_r0Ap[256];
1231  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1232  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1233  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1234  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1235  {
1236  __syncthreads();
1237  if (threadIdx.x < stride)
1238  {
1239  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1240  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1241  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1242  }
1243  }
1244 
1245  // write results to result array
1246  if (threadIdx.x == 0) {
1247  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1248  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1249  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1250  }
1251 }
1252 
1253 template<typename NumericT>
1254 void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
1255  vector_base<NumericT> const & p,
1256  vector_base<NumericT> & Ap,
1257  vector_base<NumericT> const & r0star,
1258  vector_base<NumericT> & inner_prod_buffer,
1259  vcl_size_t buffer_chunk_size,
1260  vcl_size_t buffer_chunk_offset)
1261 {
1262  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1263  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1264  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1265 
1266  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, A.rows_per_block()>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
1267  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1268  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1269  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1270  detail::cuda_arg<NumericT>(p),
1271  detail::cuda_arg<NumericT>(Ap),
1272  detail::cuda_arg<NumericT>(r0star),
1273  vec_size,
1274  detail::cuda_arg<NumericT>(inner_prod_buffer),
1275  chunk_size,
1276  chunk_offset);
1277  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
1278 }
1279 
1280 
1281 //
1282 // Hybrid Matrix
1283 //
1284 
1285 
1286 template<typename NumericT>
1287 __global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
1288  const NumericT * ell_elements,
1289  const unsigned int * csr_rows,
1290  const unsigned int * csr_cols,
1291  const NumericT * csr_elements,
1292  unsigned int internal_row_num,
1293  unsigned int items_per_row,
1294  const NumericT * p,
1295  NumericT * Ap,
1296  const NumericT * r0star,
1297  unsigned int size,
1298  NumericT * inner_prod_buffer,
1299  unsigned int buffer_size,
1300  unsigned int buffer_offset)
1301 {
1302  NumericT inner_prod_ApAp = 0;
1303  NumericT inner_prod_pAp = 0;
1304  NumericT inner_prod_r0Ap = 0;
1305  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1306  unsigned int glb_sz = gridDim.x * blockDim.x;
1307 
1308  for (unsigned int row = glb_id; row < size; row += glb_sz)
1309  {
1310  NumericT sum = 0;
1311 
1312  unsigned int offset = row;
1313  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1314  {
1315  NumericT val = ell_elements[offset];
1316 
1317  sum += val ? p[ell_coords[offset]] * val : NumericT(0);
1318  }
1319 
1320  unsigned int col_begin = csr_rows[row];
1321  unsigned int col_end = csr_rows[row + 1];
1322 
1323  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
1324  {
1325  sum += p[csr_cols[item_id]] * csr_elements[item_id];
1326  }
1327 
1328  Ap[row] = sum;
1329  inner_prod_ApAp += sum * sum;
1330  inner_prod_pAp += sum * p[row];
1331  inner_prod_r0Ap += sum * r0star[row];
1332  }
1333 
1335  __shared__ NumericT shared_array_ApAp[256];
1336  __shared__ NumericT shared_array_pAp[256];
1337  __shared__ NumericT shared_array_r0Ap[256];
1338  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
1339  shared_array_pAp[threadIdx.x] = inner_prod_pAp;
1340  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
1341  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1342  {
1343  __syncthreads();
1344  if (threadIdx.x < stride)
1345  {
1346  shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
1347  shared_array_pAp[threadIdx.x] += shared_array_pAp[threadIdx.x + stride];
1348  shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
1349  }
1350  }
1351 
1352  // write results to result array
1353  if (threadIdx.x == 0) {
1354  inner_prod_buffer[ buffer_size + blockIdx.x] = shared_array_ApAp[0];
1355  inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
1356  inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
1357  }
1358 }
1359 
1360 
1361 
1362 template<typename NumericT>
1363 void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
1364  vector_base<NumericT> const & p,
1365  vector_base<NumericT> & Ap,
1366  vector_base<NumericT> const & r0star,
1367  vector_base<NumericT> & inner_prod_buffer,
1368  vcl_size_t buffer_chunk_size,
1369  vcl_size_t buffer_chunk_offset)
1370 {
1371  unsigned int vec_size = static_cast<unsigned int>(viennacl::traits::size(p));
1372  unsigned int chunk_size = static_cast<unsigned int>(buffer_chunk_size);
1373  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);
1374 
1375  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1376  detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
1377  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1378  detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
1379  detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
1380  static_cast<unsigned int>(A.internal_size1()),
1381  static_cast<unsigned int>(A.ell_nnz()),
1382  detail::cuda_arg<NumericT>(p),
1383  detail::cuda_arg<NumericT>(Ap),
1384  detail::cuda_arg<NumericT>(r0star),
1385  vec_size,
1386  detail::cuda_arg<NumericT>(inner_prod_buffer),
1387  chunk_size,
1388  chunk_offset);
1389  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
1390 }
1391 
1393 
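//
// Pipelined GMRES
//
// The kernel below completes the reduction of <v_k, v_k> from the second chunk of
// inner_prod_buffer, normalizes v_k by ||v_k||, stores ||v_k|| in R_buffer[R_offset], and
// computes the first reduction stage of <residual, v_k>, written to r_dot_vk_buffer at
// offset chunk_offset.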
1394 template <typename T>
1395 __global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
1396  unsigned int vk_offset,
1397  T const * residual,
1398  T * R_buffer,
1399  unsigned int R_offset,
1400  T const * inner_prod_buffer,
1401  unsigned int chunk_size,
1402  T * r_dot_vk_buffer,
1403  unsigned int chunk_offset,
1404  unsigned int size)
1405 {
1406  __shared__ T shared_array[128];
1407  T norm_vk = 0;
1408 
1409  // parallel reduction in work group to compute <vk, vk>
1410  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
1411  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1412  {
1413  __syncthreads();
1414  if (threadIdx.x < stride)
1415  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1416  }
1417 
1418  // compute alpha from reduced values:
1419  __syncthreads();
1420  norm_vk = sqrt(shared_array[0]);
1421 
1422  T inner_prod_contrib = 0;
1423  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
1424  T value_vk = vk[i + vk_offset] / norm_vk;
1425 
1426  inner_prod_contrib += residual[i] * value_vk;
1427 
1428  vk[i + vk_offset] = value_vk;
1429  }
1430  __syncthreads();
1431 
1432  // parallel reduction in work group
1433  shared_array[threadIdx.x] = inner_prod_contrib;
1434  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1435  {
1436  __syncthreads();
1437  if (threadIdx.x < stride)
1438  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1439  }
1440 
1441  // write results of first reduction stage:
1442  if (threadIdx.x == 0)
1443  r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
1444  // store norm:
1445  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
1446  R_buffer[R_offset] = norm_vk;
1447 }
1448 
1456 template <typename T>
1457 void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
1458  vector_base<T> const & residual,
1459  vector_base<T> & R_buffer,
1460  vcl_size_t offset_in_R,
1461  vector_base<T> const & inner_prod_buffer,
1462  vector_base<T> & r_dot_vk_buffer,
1463  vcl_size_t buffer_chunk_size,
1464  vcl_size_t buffer_chunk_offset)
1465 {
1466  unsigned int vk_offset = viennacl::traits::start(v_k);
1467  unsigned int R_offset = offset_in_R;
1468  unsigned int chunk_size = buffer_chunk_size;
1469  unsigned int chunk_offset = buffer_chunk_offset;
1470  unsigned int size = v_k.size();
1471 
1472  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(detail::cuda_arg<T>(v_k),
1473  vk_offset,
1474  detail::cuda_arg<T>(residual),
1475  detail::cuda_arg<T>(R_buffer),
1476  R_offset,
1477  detail::cuda_arg<T>(inner_prod_buffer),
1478  chunk_size,
1479  detail::cuda_arg<T>(r_dot_vk_buffer),
1480  chunk_offset,
1481  size);
1482  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
1483 }
1484 
1485 
1486 
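// Gram-Schmidt, stage 1: computes the inner products <v_i, v_k> of the current vector v_k
// against the previous Krylov basis vectors, handling up to seven basis vectors per sweep so
// that the partial sums fit into the 7*128 shared-memory buffer. One partial result per block
// and basis vector is written to vi_in_vk_buffer.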
1487 template <typename T>
1488 __global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
1489  unsigned int size,
1490  unsigned int internal_size,
1491  unsigned int k,
1492  T * vi_in_vk_buffer,
1493  unsigned int chunk_size)
1494 {
1495  __shared__ T shared_array[7*128];
1496  T value_vk = 0;
1497 
1498  unsigned int k_base = 0;
1499  while (k_base < k)
1500  {
1501  unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
1502 
1503  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1504  shared_array[threadIdx.x + j*chunk_size] = 0;
1505 
1506  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1507  {
1508  value_vk = krylov_basis[i + k * internal_size];
1509 
1510  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1511  shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
1512  }
1513 
1514  // parallel reduction in work group
1515  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1516  {
1517  __syncthreads();
1518  if (threadIdx.x < stride) {
1519  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1520  shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
1521  }
1522  }
1523 
1524  // write results to result array
1525  if (threadIdx.x == 0)
1526  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1527  vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];
1528 
1529  k_base += vecs_in_iteration;
1530  }
1531 
1532 }
1533 
1534 template <typename T>
1535 void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
1536  vcl_size_t v_k_size,
1537  vcl_size_t v_k_internal_size,
1538  vcl_size_t param_k,
1539  vector_base<T> & vi_in_vk_buffer,
1540  vcl_size_t buffer_chunk_size)
1541 {
1542  unsigned int chunk_size = buffer_chunk_size;
1543  unsigned int size = v_k_size;
1544  unsigned int internal_size = v_k_internal_size;
1545  unsigned int k = param_k;
1546 
1547  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
1548  size,
1549  internal_size,
1550  k,
1551  detail::cuda_arg<T>(vi_in_vk_buffer),
1552  chunk_size);
1553  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
1554 }
1555 
1556 
1557 
1558 
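// Gram-Schmidt, stage 2: finishes the reduction of the <v_i, v_k> partial sums, subtracts the
// projections from v_k, writes the projection coefficients into R_buffer, and starts the
// reduction of <v_k, v_k> (stored in the second chunk of inner_prod_buffer) for the
// subsequent normalization.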
1559 template <typename T>
1560 __global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
1561  unsigned int size,
1562  unsigned int internal_size,
1563  unsigned int k,
1564  T const * vi_in_vk_buffer,
1565  unsigned int chunk_size,
1566  T * R_buffer,
1567  unsigned int krylov_dim,
1568  T * inner_prod_buffer)
1569 {
1570  __shared__ T shared_array[7*128];
1571  T vk_dot_vk = 0;
1572  T value_vk = 0;
1573 
1574  unsigned int k_base = 0;
1575  while (k_base < k)
1576  {
1577  unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);
1578 
1579  // parallel reduction in work group for <v_i, v_k>
1580  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1581  shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
1582  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1583  {
1584  __syncthreads();
1585  if (threadIdx.x < stride) {
1586  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1587  shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
1588  }
1589  }
1590  __syncthreads();
1591 
1592  // v_k -= <v_i, v_k> v_i:
1593  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1594  {
1595  value_vk = krylov_basis[i + k * internal_size];
1596 
1597  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1598  value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
1599  vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
1600  krylov_basis[i + k * internal_size] = value_vk;
1601  }
1602 
1603  // write to R: (to avoid thread divergence, all threads write the same value)
1604  if (blockIdx.x == 0)
1605  for (unsigned int j=0; j<vecs_in_iteration; ++j)
1606  R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
1607  __syncthreads();
1608 
1609  k_base += vecs_in_iteration;
1610  }
1611 
1612  // parallel reduction in work group for <v_k, v_k>
1613  shared_array[threadIdx.x] = vk_dot_vk;
1614  for (unsigned int stride=blockDim.x/2; stride > 0; stride /= 2)
1615  {
1616  __syncthreads();
1617  if (threadIdx.x < stride)
1618  shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
1619  }
1620 
1621  // write results to result array
1622  if (threadIdx.x == 0)
1623  inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
1624 }
1625 
1626 template <typename T>
1627 void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
1628  vcl_size_t v_k_size,
1629  vcl_size_t v_k_internal_size,
1630  vcl_size_t param_k,
1631  vector_base<T> const & vi_in_vk_buffer,
1632  vector_base<T> & R_buffer,
1633  vcl_size_t krylov_dim,
1634  vector_base<T> & inner_prod_buffer,
1635  vcl_size_t buffer_chunk_size)
1636 {
1637  unsigned int chunk_size = buffer_chunk_size;
1638  unsigned int size = v_k_size;
1639  unsigned int internal_size = v_k_internal_size;
1640  unsigned int k = param_k;
1641  unsigned int krylov = krylov_dim;
1642 
1643  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
1644  size,
1645  internal_size,
1646  k,
1647  detail::cuda_arg<T>(vi_in_vk_buffer),
1648  chunk_size,
1649  detail::cuda_arg<T>(R_buffer),
1650  krylov,
1651  detail::cuda_arg<T>(inner_prod_buffer));
1652  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
1653 }
1654 
1655 
1656 
1657 
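// Solution update: result <- result + coefficients[0] * residual
//                          + sum_{j=1..k-1} coefficients[j] * krylov_basis[:, j-1],
// i.e. the residual plays the role of the first Krylov vector.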
1658 template <typename T>
1659 __global__ void pipelined_gmres_update_result_kernel(T * result,
1660  T const * residual,
1661  T const * krylov_basis,
1662  unsigned int size,
1663  unsigned int internal_size,
1664  T const * coefficients,
1665  unsigned int k)
1666 {
1667  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
1668  {
1669  T value_result = result[i] + coefficients[0] * residual[i];
1670 
1671  for (unsigned int j = 1; j < k; ++j)
1672  value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];
1673 
1674  result[i] = value_result;
1675  }
1676 }
1677 
1678 template <typename T>
1679 void pipelined_gmres_update_result(vector_base<T> & result,
1680  vector_base<T> const & residual,
1681  vector_base<T> const & krylov_basis,
1682  vcl_size_t v_k_size,
1683  vcl_size_t v_k_internal_size,
1684  vector_base<T> const & coefficients,
1685  vcl_size_t param_k)
1686 {
1687  unsigned int size = v_k_size;
1688  unsigned int internal_size = v_k_internal_size;
1689  unsigned int k = param_k;
1690 
1691  pipelined_gmres_update_result_kernel<<<128, 128>>>(detail::cuda_arg<T>(result),
1692  detail::cuda_arg<T>(residual),
1693  detail::cuda_arg<T>(krylov_basis),
1694  size,
1695  internal_size,
1696  detail::cuda_arg<T>(coefficients),
1697  k);
1698  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
1699 }
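// Minimal call sketch (hypothetical names): once the small least-squares problem has been
// solved on the host and a coefficient vector 'eta' of length k is available, the solution
// update is a single kernel launch. The fixed <<<128, 128>>> configuration handles arbitrary
// vector lengths because the kernel iterates with a grid-stride loop:
//
//   pipelined_gmres_update_result(x, r, krylov_basis, n, n_internal, eta, k);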
1700 
1701 
1702 
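// Note on the pipelined_gmres_prod() overloads that follow: they reuse the pipelined CG
// sparse matrix-vector kernels, which compute Ap = A * p and additionally accumulate
// per-block partial sums of <Ap, Ap> and <p, Ap> into inner_prod_buffer. This is why the
// buffer is interpreted as three chunks of buffer_size_per_vector = size() / 3 entries each.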
1703 template <typename T>
1704 void pipelined_gmres_prod(compressed_matrix<T> const & A,
1705  vector_base<T> const & p,
1706  vector_base<T> & Ap,
1707  vector_base<T> & inner_prod_buffer)
1708 {
1709  unsigned int size = p.size();
1710  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1711 
1712  pipelined_cg_csr_vec_mul_kernel<<<128, 256>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
1713  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1714  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1715  detail::cuda_arg<T>(A.handle().cuda_handle()),
1716  static_cast<unsigned int>(A.blocks1()),
1717  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1718  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1719  size,
1720  detail::cuda_arg<T>(inner_prod_buffer),
1721  buffer_size_per_vector);
1722  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
1723 }
1724 
1725 template <typename T>
1726 void pipelined_gmres_prod(coordinate_matrix<T> const & A,
1727  vector_base<T> const & p,
1728  vector_base<T> & Ap,
1729  vector_base<T> & inner_prod_buffer)
1730 {
1731  unsigned int size = p.size();
1732  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1733 
1734  Ap.clear();
1735 
1736  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
1737  detail::cuda_arg<T>(A.handle().cuda_handle()),
1738  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1739  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1740  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1741  size,
1742  detail::cuda_arg<T>(inner_prod_buffer),
1743  buffer_size_per_vector);
1744  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
1745 }
1746 
1747 template <typename T>
1748 void pipelined_gmres_prod(ell_matrix<T> const & A,
1749  vector_base<T> const & p,
1750  vector_base<T> & Ap,
1751  vector_base<T> & inner_prod_buffer)
1752 {
1753  unsigned int size = p.size();
1754  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1755 
1756  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1757  detail::cuda_arg<T>(A.handle().cuda_handle()),
1758  static_cast<unsigned int>(A.internal_size1()),
1759  static_cast<unsigned int>(A.maxnnz()),
1760  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1761  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1762  size,
1763  detail::cuda_arg<T>(inner_prod_buffer),
1764  buffer_size_per_vector);
1765  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
1766 }
1767 
1768 template <typename T>
1769 void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
1770  vector_base<T> const & p,
1771  vector_base<T> & Ap,
1772  vector_base<T> & inner_prod_buffer)
1773 {
1774  unsigned int size = p.size();
1775  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1776 
1777  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, A.rows_per_block()>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
1778  detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1779  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1780  detail::cuda_arg<T>(A.handle().cuda_handle()),
1781  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1782  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1783  size,
1784  detail::cuda_arg<T>(inner_prod_buffer),
1785  buffer_size_per_vector);
1786  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
1787 }
1788 
1789 
1790 template <typename T>
1791 void pipelined_gmres_prod(hyb_matrix<T> const & A,
1792  vector_base<T> const & p,
1793  vector_base<T> & Ap,
1794  vector_base<T> & inner_prod_buffer)
1795 {
1796  unsigned int size = p.size();
1797  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);
1798 
1799  pipelined_cg_hyb_vec_mul_kernel<<<128, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
1800  detail::cuda_arg<T>(A.handle().cuda_handle()),
1801  detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
1802  detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
1803  detail::cuda_arg<T>(A.handle5().cuda_handle()),
1804  static_cast<unsigned int>(A.internal_size1()),
1805  static_cast<unsigned int>(A.ell_nnz()),
1806  detail::cuda_arg<T>(p) + viennacl::traits::start(p),
1807  detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
1808  size,
1809  detail::cuda_arg<T>(inner_prod_buffer),
1810  buffer_size_per_vector);
1811  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
1812 }
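// Usage sketch (hypothetical types and names): the appropriate overload is chosen by the
// static type of the sparse matrix, so the calling solver code stays format-agnostic:
//
//   viennacl::compressed_matrix<double> A(n, n);   // or coordinate_/ell_/sliced_ell_/hyb_matrix
//   viennacl::vector<double> p(n), Ap(n), inner_prods(3 * chunk);
//   viennacl::linalg::cuda::pipelined_gmres_prod(A, p, Ap, inner_prods);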
1813 
1814 
1815 
1816 } // namespace cuda
1817 } // namespace linalg
1818 } // namespace viennacl
1819 
1820 
1821 #endif