#ifndef VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_ITERATIVE_OPERATIONS_HPP_
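// The include set and namespace opening below are reconstructed from the types this file uses
// (viennacl::vector_base, compressed_matrix, detail::cuda_arg, VIENNACL_CUDA_LAST_ERROR_CHECK).
// They are an assumption about the original header, not verbatim source.
#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{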
template<typename NumericT>
__global__ void pipelined_cg_vector_kernel(NumericT * result, NumericT alpha,
                                           NumericT * p, NumericT * r,
                                           NumericT const * Ap, NumericT beta,
                                           NumericT * inner_prod_buffer, unsigned int size)
{
  NumericT inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT value_p = p[i];
    NumericT value_r = r[i];

    result[i] += alpha * value_p;
    value_r   -= alpha * Ap[i];
    value_p    = value_r + beta * value_p;

    p[i] = value_p;
    r[i] = value_r;
    inner_prod_contrib += value_r * value_r;
  }

  // block-wise reduction of the partial sums of <r, r>
  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_contrib;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  // first thread of each block writes the block's partial result
  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = shared_array[0];
}

template<typename NumericT>
void pipelined_cg_vector_update(vector_base<NumericT> & result, NumericT alpha,
                                vector_base<NumericT> & p, vector_base<NumericT> & r,
                                vector_base<NumericT> const & Ap, NumericT beta,
                                vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(result.size());

  pipelined_cg_vector_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(result), alpha,
                                           detail::cuda_arg<NumericT>(p),
                                           detail::cuda_arg<NumericT>(r),
                                           detail::cuda_arg<NumericT>(Ap), beta,
                                           detail::cuda_arg<NumericT>(inner_prod_buffer),
                                           size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_vector_kernel");
}
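// Note (illustrative sketch, not part of the original header): each block only writes its
// partial sum of <r, r> to inner_prod_buffer, so the CG driver still has to reduce these
// partial results. A minimal host-side reduction could look like the following, where
// 'partial' and 'buffer_chunk' are hypothetical local names (buffer_chunk is the number of
// partial sums written per quantity, 128 blocks here; requires <numeric>):
//
//   std::vector<NumericT> partial(inner_prod_buffer.size());
//   viennacl::copy(inner_prod_buffer, partial);    // device -> host
//   NumericT r_dot_r = std::accumulate(partial.begin(),
//                                      partial.begin() + buffer_chunk, NumericT(0));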
template<typename NumericT>
__global__ void pipelined_cg_csr_vec_mul_kernel(const unsigned int * row_indices,
                                                const unsigned int * column_indices,
                                                const unsigned int * row_blocks,
                                                const NumericT * elements,
                                                unsigned int num_blocks,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;

  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start       = row_blocks[block_id];
    unsigned int row_stop        = row_blocks[block_id + 1];
    unsigned int element_start   = row_indices[row_start];
    unsigned int element_stop    = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream: one thread per row
    {
      // load the products for this row block into shared memory:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];

      __syncthreads();

      // sum each row with one thread:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];

        Ap[row] = dot_prod;
        inner_prod_ApAp += dot_prod * dot_prod;
        inner_prod_pAp  += p[row] * dot_prod;
      }
    }
    else  // CSR vector: the whole block works on a single row
    {
      // load and sum to shared memory:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];

      // reduction to obtain the row result:
      for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x + stride];
      }

      if (threadIdx.x == 0)
      {
        Ap[row_start] = shared_elements[0];
        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
        inner_prod_pAp  += p[row_start] * shared_elements[0];
      }
    }

    __syncthreads();
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}

template<typename NumericT>
void pipelined_cg_prod(compressed_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_csr_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                static_cast<unsigned int>(A.blocks1()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
}
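// Layout note (descriptive comment, not original source): the pipelined SpMV kernels in this
// file write two partial results per block, treating inner_prod_buffer as stacked sub-buffers
// of length buffer_size_per_vector:
//   inner_prod_buffer[    buffer_size + blockIdx.x]  ... partial sums of <Ap, Ap>
//   inner_prod_buffer[2 * buffer_size + blockIdx.x]  ... partial sums of <p, Ap>
// The first sub-buffer (offset 0) holds the partial sums of <r, r> produced by
// pipelined_cg_vector_kernel above, which is why the buffer size is divided by 3.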
template<typename NumericT>
__global__ void pipelined_cg_coo_vec_mul_kernel(const unsigned int * coords, // (row, column) index pairs
                                                const NumericT * elements,
                                                const unsigned int * group_boundaries,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    // check for carry from the previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
      {
        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
      }
    }

    // segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    // segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
    }

    __syncthreads();
  } // for k

  if (local_index + 1 == group_end)
  {
    NumericT Ap_entry = inter_results[threadIdx.x];
    Ap[tmp.x] = Ap_entry;
    inner_prod_ApAp += Ap_entry * Ap_entry;
    inner_prod_pAp  += Ap_entry * p[tmp.x];
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}

template<typename NumericT>
void pipelined_cg_prod(coordinate_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  Ap.clear();

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
                                               detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                               detail::cuda_arg<NumericT>(p),
                                               detail::cuda_arg<NumericT>(Ap),
                                               size,
                                               detail::cuda_arg<NumericT>(inner_prod_buffer),
                                               buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
}
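// Descriptive note: the COO kernel above uses a segmented parallel reduction. Each thread
// loads one (row, column, value) triplet, products sharing the same row index are combined
// in shared memory (shared_rows/inter_results), and a carry is handed over between
// consecutive chunks of blockDim.x nonzeros so that rows spanning a chunk boundary are
// accumulated correctly before Ap[row] is written.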
template<typename NumericT>
__global__ void pipelined_cg_ell_vec_mul_kernel(const unsigned int * coords,
                                                const NumericT * elements,
                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];
      sum += val ? p[coords[offset]] * val : NumericT(0);
    }

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}

template<typename NumericT>
void pipelined_cg_prod(ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.maxnnz()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
}
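// Descriptive note with a small addressing sketch (illustrative): the ELL format stores a
// fixed number of entries per row (items_per_row) in column-major order with zero padding,
// so the entry for (row, item_id) lives at
//
//   offset = row + item_id * internal_row_num;
//
// Padding entries are skipped via the 'val ? ... : 0' test in the kernel above.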
template<typename NumericT>
__global__ void pipelined_cg_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
                                                       const unsigned int * column_indices,
                                                       const unsigned int * block_start,
                                                       const NumericT * elements,
                                                       const NumericT * p,
                                                       NumericT * Ap,
                                                       unsigned int size,
                                                       NumericT * inner_prod_buffer,
                                                       unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  unsigned int local_id = threadIdx.x;
  unsigned int local_size = blockDim.x;

  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
  {
    unsigned int row         = block_idx * local_size + local_id;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * local_size + local_id;
      NumericT val = elements[index];

      sum += val ? (p[column_indices[index]] * val) : 0;
    }

    if (row < size)
    {
      Ap[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += sum * p[row];
    }
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}

template<typename NumericT>
void pipelined_cg_prod(sliced_ell_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<256, static_cast<unsigned int>(A.rows_per_block())>>>(
                                                detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
}
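// Descriptive note: the sliced ELL kernel processes one slice of blockDim.x rows per block
// iteration; block_start[] gives the element offset of each slice and columns_per_block[]
// its padded width, so threads in a block access elements[] with unit stride
// (offset + item_id * local_size + local_id). The thread block size therefore has to match
// the slice size used when the matrix was assembled.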
template<typename NumericT>
__global__ void pipelined_cg_hyb_vec_mul_kernel(const unsigned int * ell_coords,
                                                const NumericT * ell_elements,
                                                const unsigned int * csr_rows,
                                                const unsigned int * csr_cols,
                                                const NumericT * csr_elements,
                                                unsigned int internal_row_num,
                                                unsigned int items_per_row,
                                                const NumericT * p,
                                                NumericT * Ap,
                                                unsigned int size,
                                                NumericT * inner_prod_buffer,
                                                unsigned int buffer_size)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];
      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
    }

    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
  }
}

template<typename NumericT>
void pipelined_cg_prod(hyb_matrix<NumericT> const & A,
                       vector_base<NumericT> const & p,
                       vector_base<NumericT> & Ap,
                       vector_base<NumericT> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
                                                detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.ell_nnz()),
                                                detail::cuda_arg<NumericT>(p),
                                                detail::cuda_arg<NumericT>(Ap),
                                                size,
                                                detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
}
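// Descriptive note: the HYB kernel combines the ELL part (ell_coords/ell_elements with a
// fixed items_per_row) with a CSR remainder (csr_rows/csr_cols/csr_elements) holding the
// entries of rows whose nonzero count exceeds items_per_row; both contributions are
// accumulated into 'sum' before Ap[row] is written.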
template<typename NumericT>
__global__ void pipelined_bicgstab_update_s_kernel(NumericT * s,
                                                   NumericT const * residual,
                                                   NumericT const * Ap,
                                                   unsigned int size,
                                                   NumericT * inner_prod_buffer,
                                                   unsigned int chunk_size,
                                                   unsigned int chunk_offset)
{
  NumericT alpha = 0;

  // parallel reduction in work group to compute <r, r0*> / <Ap, r0*>
  __shared__ NumericT shared_array[256];
  __shared__ NumericT shared_array_Ap_in_r0[256];

  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x];
  shared_array_Ap_in_r0[threadIdx.x] = inner_prod_buffer[threadIdx.x + 3 * chunk_size];
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride) {
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
      shared_array_Ap_in_r0[threadIdx.x] += shared_array_Ap_in_r0[threadIdx.x + stride];
    }
  }

  // compute alpha from the reduced values:
  __syncthreads();
  alpha = shared_array[0] / shared_array_Ap_in_r0[0];

  // s = r - alpha * Ap, and first reduction stage for <s, s>:
  NumericT inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT value_s = s[i];

    value_s = residual[i] - alpha * Ap[i];
    inner_prod_contrib += value_s * value_s;

    s[i] = value_s;
  }
  __syncthreads();

  // block-wise reduction of the partial sums of <s, s>
  shared_array[threadIdx.x] = inner_prod_contrib;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x + chunk_offset] = shared_array[0];
}

template<typename NumericT>
void pipelined_bicgstab_update_s(vector_base<NumericT> & s,
                                 vector_base<NumericT> & r,
                                 vector_base<NumericT> const & Ap,
                                 vector_base<NumericT> & inner_prod_buffer,
                                 vcl_size_t buffer_chunk_size,
                                 vcl_size_t buffer_chunk_offset)
{
  unsigned int size = static_cast<unsigned int>(s.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_update_s_kernel<<<256, 256>>>(detail::cuda_arg<NumericT>(s),
                                                   detail::cuda_arg<NumericT>(r),
                                                   detail::cuda_arg<NumericT>(Ap),
                                                   size,
                                                   detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                   chunk_size,
                                                   chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_update_s_kernel");
}
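// Descriptive note: unlike the CG kernels, pipelined_bicgstab_update_s_kernel first finishes
// the reduction of the partial sums already staged in inner_prod_buffer (<r, r0*> in chunk 0,
// <Ap, r0*> in the chunk at offset 3*chunk_size) to obtain alpha = <r, r0*> / <Ap, r0*> on
// the device, then forms s = r - alpha * Ap and immediately stages the block-wise partial
// sums of <s, s> at chunk_offset.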
template<typename NumericT>
__global__ void pipelined_bicgstab_vector_kernel(NumericT * result, NumericT alpha,
                                                 NumericT * p, NumericT omega,
                                                 NumericT const * s,
                                                 NumericT * residual,
                                                 NumericT const * As, NumericT beta,
                                                 NumericT const * Ap,
                                                 NumericT const * r0star,
                                                 NumericT * inner_prod_buffer,
                                                 unsigned int size)
{
  NumericT inner_prod_r_r0star = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT value_result   = result[i];
    NumericT value_p        = p[i];
    NumericT value_s        = s[i];
    NumericT value_residual = residual[i];
    NumericT value_As       = As[i];
    NumericT value_Ap       = Ap[i];
    NumericT value_r0star   = r0star[i];

    value_result  += alpha * value_p + omega * value_s;
    value_residual = value_s - omega * value_As;
    value_p        = value_residual + beta * (value_p - omega * value_Ap);

    result[i]   = value_result;
    residual[i] = value_residual;
    p[i]        = value_p;
    inner_prod_r_r0star += value_residual * value_r0star;
  }

  // block-wise reduction of the partial sums of <r, r0*>
  __shared__ NumericT shared_array[256];
  shared_array[threadIdx.x] = inner_prod_r_r0star;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[blockIdx.x] = shared_array[0];
}

template<typename NumericT>
void pipelined_bicgstab_vector_update(vector_base<NumericT> & result, NumericT alpha,
                                      vector_base<NumericT> & p, NumericT omega,
                                      vector_base<NumericT> const & s,
                                      vector_base<NumericT> & residual,
                                      vector_base<NumericT> const & As, NumericT beta,
                                      vector_base<NumericT> const & Ap,
                                      vector_base<NumericT> const & r0star,
                                      vector_base<NumericT> & inner_prod_buffer,
                                      vcl_size_t buffer_chunk_size)
{
  (void)buffer_chunk_size;
  unsigned int size = static_cast<unsigned int>(result.size());

  pipelined_bicgstab_vector_kernel<<<256, 256>>>(detail::cuda_arg<NumericT>(result), alpha,
                                                 detail::cuda_arg<NumericT>(p), omega,
                                                 detail::cuda_arg<NumericT>(s),
                                                 detail::cuda_arg<NumericT>(residual),
                                                 detail::cuda_arg<NumericT>(As), beta,
                                                 detail::cuda_arg<NumericT>(Ap),
                                                 detail::cuda_arg<NumericT>(r0star),
                                                 detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                 size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_vector_kernel");
}
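// Descriptive note: the fused update above implements the tail of one BiCGStab iteration,
//   x <- x + alpha * p + omega * s
//   r <- s - omega * As
//   p <- r + beta * (p - omega * Ap)
// and simultaneously stages the block-wise partial sums of <r, r0*> needed for the next
// iteration in inner_prod_buffer.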
template<typename NumericT>
__global__ void pipelined_bicgstab_csr_vec_mul_kernel(const unsigned int * row_indices,
                                                      const unsigned int * column_indices,
                                                      const unsigned int * row_blocks,
                                                      const NumericT * elements,
                                                      unsigned int num_blocks,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;

  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start       = row_blocks[block_id];
    unsigned int row_stop        = row_blocks[block_id + 1];
    unsigned int element_start   = row_indices[row_start];
    unsigned int element_stop    = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream: one thread per row
    {
      // load the products for this row block into shared memory:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * p[column_indices[i]];

      __syncthreads();

      // sum each row with one thread:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];

        Ap[row] = dot_prod;
        inner_prod_ApAp += dot_prod * dot_prod;
        inner_prod_pAp  += p[row] * dot_prod;
        inner_prod_r0Ap += r0star[row] * dot_prod;
      }
    }
    else  // CSR vector: the whole block works on a single row
    {
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * p[column_indices[i]];

      // reduction to obtain the row result:
      for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x + stride];
      }

      if (threadIdx.x == 0)
      {
        Ap[row_start] = shared_elements[0];
        inner_prod_ApAp += shared_elements[0] * shared_elements[0];
        inner_prod_pAp  += p[row_start] * shared_elements[0];
        inner_prod_r0Ap += r0star[row_start] * shared_elements[0];
      }
    }

    __syncthreads();
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
  }
}

template<typename NumericT>
void pipelined_bicgstab_prod(compressed_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_csr_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      static_cast<unsigned int>(A.blocks1()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_csr_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_coo_vec_mul_kernel(const unsigned int * coords, // (row, column) index pairs
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0;

    // check for carry from the previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
      {
        NumericT Ap_entry = inter_results[blockDim.x-1];
        Ap[shared_rows[blockDim.x-1]] = Ap_entry;
        inner_prod_ApAp += Ap_entry * Ap_entry;
        inner_prod_pAp  += Ap_entry * p[shared_rows[blockDim.x-1]];
        inner_prod_r0Ap += r0star[shared_rows[blockDim.x-1]] * Ap_entry;
      }
    }

    // segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    // segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      NumericT Ap_entry = inter_results[threadIdx.x];
      Ap[tmp.x] = Ap_entry;
      inner_prod_ApAp += Ap_entry * Ap_entry;
      inner_prod_pAp  += Ap_entry * p[tmp.x];
      inner_prod_r0Ap += r0star[tmp.x] * Ap_entry;
    }

    __syncthreads();
  } // for k

  if (local_index + 1 == group_end)
  {
    NumericT Ap_entry = inter_results[threadIdx.x];
    Ap[tmp.x] = Ap_entry;
    inner_prod_ApAp += Ap_entry * Ap_entry;
    inner_prod_pAp  += Ap_entry * p[tmp.x];
    inner_prod_r0Ap += Ap_entry * r0star[tmp.x];
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
  }
}

template<typename NumericT>
void pipelined_bicgstab_prod(coordinate_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  Ap.clear();

  pipelined_bicgstab_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(p),
                                                     detail::cuda_arg<NumericT>(Ap),
                                                     detail::cuda_arg<NumericT>(r0star),
                                                     size,
                                                     detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                     chunk_size,
                                                     chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_coo_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_ell_vec_mul_kernel(const unsigned int * coords,
                                                      const NumericT * elements,
                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = elements[offset];
      sum += val ? p[coords[offset]] * val : NumericT(0);
    }

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
  }
}

template<typename NumericT>
void pipelined_bicgstab_prod(ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_ell_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      static_cast<unsigned int>(A.internal_size1()),
                                                      static_cast<unsigned int>(A.maxnnz()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_ell_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_sliced_ell_vec_mul_kernel(const unsigned int * columns_per_block,
                                                             const unsigned int * column_indices,
                                                             const unsigned int * block_start,
                                                             const NumericT * elements,
                                                             const NumericT * p,
                                                             NumericT * Ap,
                                                             const NumericT * r0star,
                                                             unsigned int size,
                                                             NumericT * inner_prod_buffer,
                                                             unsigned int buffer_size,
                                                             unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  unsigned int local_id = threadIdx.x;
  unsigned int local_size = blockDim.x;

  for (unsigned int block_idx = blockIdx.x; block_idx <= size / local_size; block_idx += gridDim.x)
  {
    unsigned int row         = block_idx * local_size + local_id;
    unsigned int offset      = block_start[block_idx];
    unsigned int num_columns = columns_per_block[block_idx];

    NumericT sum = 0;
    for (unsigned int item_id = 0; item_id < num_columns; item_id++)
    {
      unsigned int index = offset + item_id * local_size + local_id;
      NumericT val = elements[index];

      sum += val ? (p[column_indices[index]] * val) : 0;
    }

    if (row < size)
    {
      Ap[row] = sum;
      inner_prod_ApAp += sum * sum;
      inner_prod_pAp  += sum * p[row];
      inner_prod_r0Ap += sum * r0star[row];
    }
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
  }
}

template<typename NumericT>
void pipelined_bicgstab_prod(sliced_ell_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_sliced_ell_vec_mul_kernel<<<256, static_cast<unsigned int>(A.rows_per_block())>>>(
                                                      detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_sliced_ell_vec_mul_kernel");
}
template<typename NumericT>
__global__ void pipelined_bicgstab_hyb_vec_mul_kernel(const unsigned int * ell_coords,
                                                      const NumericT * ell_elements,
                                                      const unsigned int * csr_rows,
                                                      const unsigned int * csr_cols,
                                                      const NumericT * csr_elements,
                                                      unsigned int internal_row_num,
                                                      unsigned int items_per_row,
                                                      const NumericT * p,
                                                      NumericT * Ap,
                                                      const NumericT * r0star,
                                                      unsigned int size,
                                                      NumericT * inner_prod_buffer,
                                                      unsigned int buffer_size,
                                                      unsigned int buffer_offset)
{
  NumericT inner_prod_ApAp = 0;
  NumericT inner_prod_pAp  = 0;
  NumericT inner_prod_r0Ap = 0;
  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int row = glb_id; row < size; row += glb_sz)
  {
    NumericT sum = 0;

    unsigned int offset = row;
    for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
    {
      NumericT val = ell_elements[offset];
      sum += val ? p[ell_coords[offset]] * val : NumericT(0);
    }

    unsigned int col_begin = csr_rows[row];
    unsigned int col_end   = csr_rows[row + 1];

    for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
      sum += p[csr_cols[item_id]] * csr_elements[item_id];

    Ap[row] = sum;
    inner_prod_ApAp += sum * sum;
    inner_prod_pAp  += sum * p[row];
    inner_prod_r0Ap += sum * r0star[row];
  }

  // block-wise reduction of the partial inner products
  __shared__ NumericT shared_array_ApAp[256];
  __shared__ NumericT shared_array_pAp[256];
  __shared__ NumericT shared_array_r0Ap[256];
  shared_array_ApAp[threadIdx.x] = inner_prod_ApAp;
  shared_array_pAp[threadIdx.x]  = inner_prod_pAp;
  shared_array_r0Ap[threadIdx.x] = inner_prod_r0Ap;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
    {
      shared_array_ApAp[threadIdx.x] += shared_array_ApAp[threadIdx.x + stride];
      shared_array_pAp[threadIdx.x]  += shared_array_pAp[threadIdx.x + stride];
      shared_array_r0Ap[threadIdx.x] += shared_array_r0Ap[threadIdx.x + stride];
    }
  }

  if (threadIdx.x == 0) {
    inner_prod_buffer[  buffer_size + blockIdx.x] = shared_array_ApAp[0];
    inner_prod_buffer[2*buffer_size + blockIdx.x] = shared_array_pAp[0];
    inner_prod_buffer[buffer_offset + blockIdx.x] = shared_array_r0Ap[0];
  }
}

template<typename NumericT>
void pipelined_bicgstab_prod(hyb_matrix<NumericT> const & A,
                             vector_base<NumericT> const & p,
                             vector_base<NumericT> & Ap,
                             vector_base<NumericT> const & r0star,
                             vector_base<NumericT> & inner_prod_buffer,
                             vcl_size_t buffer_chunk_size,
                             vcl_size_t buffer_chunk_offset)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int chunk_size   = static_cast<unsigned int>(buffer_chunk_size);
  unsigned int chunk_offset = static_cast<unsigned int>(buffer_chunk_offset);

  pipelined_bicgstab_hyb_vec_mul_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
                                                      detail::cuda_arg<NumericT>(A.handle5().cuda_handle()),
                                                      static_cast<unsigned int>(A.internal_size1()),
                                                      static_cast<unsigned int>(A.ell_nnz()),
                                                      detail::cuda_arg<NumericT>(p),
                                                      detail::cuda_arg<NumericT>(Ap),
                                                      detail::cuda_arg<NumericT>(r0star),
                                                      size,
                                                      detail::cuda_arg<NumericT>(inner_prod_buffer),
                                                      chunk_size,
                                                      chunk_offset);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_bicgstab_hyb_vec_mul_kernel");
}
template <typename T>
__global__ void pipelined_gmres_normalize_vk_kernel(T * vk,
                                                    unsigned int vk_offset,
                                                    T const * residual,
                                                    T * R_buffer,
                                                    unsigned int R_offset,
                                                    T const * inner_prod_buffer,
                                                    unsigned int chunk_size,
                                                    T * r_dot_vk_buffer,
                                                    unsigned int chunk_offset,
                                                    unsigned int size)
{
  __shared__ T shared_array[128];
  T norm_vk = 0;

  // parallel reduction in work group to compute <vk, vk>
  shared_array[threadIdx.x] = inner_prod_buffer[threadIdx.x + chunk_size];
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  // compute the norm from the reduced value:
  __syncthreads();
  norm_vk = sqrt(shared_array[0]);

  // normalize vk and stage the partial sums of <r, vk>:
  T inner_prod_contrib = 0;
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x) {
    T value_vk = vk[i + vk_offset] / norm_vk;

    inner_prod_contrib += residual[i] * value_vk;

    vk[i + vk_offset] = value_vk;
  }
  __syncthreads();

  // block-wise reduction of the partial sums of <r, vk>
  shared_array[threadIdx.x] = inner_prod_contrib;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    r_dot_vk_buffer[blockIdx.x + chunk_offset] = shared_array[0];
  // store the norm:
  if (blockDim.x * blockIdx.x + threadIdx.x == 0)
    R_buffer[R_offset] = norm_vk;
}

/** Performs a vector normalization needed for an efficient pipelined GMRES algorithm. */
template <typename T>
void pipelined_gmres_normalize_vk(vector_base<T> & v_k,
                                  vector_base<T> const & residual,
                                  vector_base<T> & R_buffer,
                                  vcl_size_t offset_in_R,
                                  vector_base<T> const & inner_prod_buffer,
                                  vector_base<T> & r_dot_vk_buffer,
                                  vcl_size_t buffer_chunk_size,
                                  vcl_size_t buffer_chunk_offset)
{
  unsigned int vk_offset = viennacl::traits::start(v_k);
  unsigned int R_offset = offset_in_R;
  unsigned int chunk_size = buffer_chunk_size;
  unsigned int chunk_offset = buffer_chunk_offset;
  unsigned int size = v_k.size();

  pipelined_gmres_normalize_vk_kernel<<<128, 128>>>(detail::cuda_arg<T>(v_k),
                                                    vk_offset,
                                                    detail::cuda_arg<T>(residual),
                                                    detail::cuda_arg<T>(R_buffer),
                                                    R_offset,
                                                    detail::cuda_arg<T>(inner_prod_buffer),
                                                    chunk_size,
                                                    detail::cuda_arg<T>(r_dot_vk_buffer),
                                                    chunk_offset,
                                                    size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_normalize_vk_kernel");
}
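// Descriptive note: pipelined_gmres_normalize_vk_kernel first reduces the partial sums of
// <v_k, v_k> (second chunk of inner_prod_buffer) to obtain ||v_k||, writes this norm to
// R_buffer[R_offset], scales v_k by 1/||v_k||, and stages block-wise partial sums of
// <r, v_k> in r_dot_vk_buffer from which the host assembles the Gram-Schmidt coefficient.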
template <typename T>
__global__ void pipelined_gmres_gram_schmidt_stage1_kernel(T const * krylov_basis,
                                                           unsigned int size,
                                                           unsigned int internal_size,
                                                           unsigned int k,
                                                           T * vi_in_vk_buffer,
                                                           unsigned int chunk_size)
{
  __shared__ T shared_array[7*128];

  unsigned int k_base = 0;
  while (k_base < k)
  {
    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = 0;

    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    {
      T value_vk = krylov_basis[i + k * internal_size];

      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        shared_array[threadIdx.x + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size];
    }

    // block-wise reduction of the partial sums of <v_i, v_k>
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride) {
        for (unsigned int j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
      }
    }

    if (threadIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        vi_in_vk_buffer[blockIdx.x + (k_base + j) * chunk_size] = shared_array[j*chunk_size];

    k_base += vecs_in_iteration;
  }
}

template <typename T>
void pipelined_gmres_gram_schmidt_stage1(vector_base<T> const & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t param_k,
                                         vector_base<T> & vi_in_vk_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size = v_k_size;
  unsigned int internal_size = v_k_internal_size;
  unsigned int k = param_k;

  pipelined_gmres_gram_schmidt_stage1_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
                                                           size, internal_size, k,
                                                           detail::cuda_arg<T>(vi_in_vk_buffer),
                                                           chunk_size);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage1_kernel");
}
template <typename T>
__global__ void pipelined_gmres_gram_schmidt_stage2_kernel(T * krylov_basis,
                                                           unsigned int size,
                                                           unsigned int internal_size,
                                                           unsigned int k,
                                                           T const * vi_in_vk_buffer,
                                                           unsigned int chunk_size,
                                                           T * R_buffer,
                                                           unsigned int krylov_dim,
                                                           T * inner_prod_buffer)
{
  __shared__ T shared_array[7*128];
  T vk_dot_vk = 0;

  unsigned int k_base = 0;
  while (k_base < k)
  {
    unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base);

    // finish the reduction of the partial sums of <v_i, v_k>:
    for (unsigned int j=0; j<vecs_in_iteration; ++j)
      shared_array[threadIdx.x + j*chunk_size] = vi_in_vk_buffer[threadIdx.x + (k_base + j) * chunk_size];
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
    {
      __syncthreads();
      if (threadIdx.x < stride) {
        for (unsigned int j=0; j<vecs_in_iteration; ++j)
          shared_array[threadIdx.x + j*chunk_size] += shared_array[threadIdx.x + j*chunk_size + stride];
      }
    }
    __syncthreads();

    // v_k -= <v_i, v_k> v_i:
    for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    {
      T value_vk = krylov_basis[i + k * internal_size];

      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size];
      vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0;
      krylov_basis[i + k * internal_size] = value_vk;
    }

    // write the projection coefficients to R:
    if (blockIdx.x == 0)
      for (unsigned int j=0; j<vecs_in_iteration; ++j)
        R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size];
    __syncthreads();

    k_base += vecs_in_iteration;
  }

  // block-wise reduction of the partial sums of <v_k, v_k>
  shared_array[threadIdx.x] = vk_dot_vk;
  for (unsigned int stride = blockDim.x / 2; stride > 0; stride /= 2)
  {
    __syncthreads();
    if (threadIdx.x < stride)
      shared_array[threadIdx.x] += shared_array[threadIdx.x + stride];
  }

  if (threadIdx.x == 0)
    inner_prod_buffer[chunk_size+blockIdx.x] = shared_array[0];
}

template <typename T>
void pipelined_gmres_gram_schmidt_stage2(vector_base<T> & device_krylov_basis,
                                         vcl_size_t v_k_size,
                                         vcl_size_t v_k_internal_size,
                                         vcl_size_t param_k,
                                         vector_base<T> const & vi_in_vk_buffer,
                                         vector_base<T> & R_buffer,
                                         vcl_size_t krylov_dim,
                                         vector_base<T> & inner_prod_buffer,
                                         vcl_size_t buffer_chunk_size)
{
  unsigned int chunk_size = buffer_chunk_size;
  unsigned int size = v_k_size;
  unsigned int internal_size = v_k_internal_size;
  unsigned int k = param_k;
  unsigned int krylov = krylov_dim;

  pipelined_gmres_gram_schmidt_stage2_kernel<<<128, 128>>>(detail::cuda_arg<T>(device_krylov_basis),
                                                           size, internal_size, k,
                                                           detail::cuda_arg<T>(vi_in_vk_buffer),
                                                           chunk_size,
                                                           detail::cuda_arg<T>(R_buffer),
                                                           krylov,
                                                           detail::cuda_arg<T>(inner_prod_buffer));
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_gram_schmidt_stage2_kernel");
}
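// Descriptive note: stage 1 above computes block-wise partial sums of <v_i, v_k> for up to
// seven basis vectors at a time; stage 2 finishes these reductions, orthogonalizes v_k
// against v_0, ..., v_{k-1}, records the projection coefficients in column k of R_buffer,
// and stages the partial sums of <v_k, v_k> used by the subsequent normalization step.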
template <typename T>
__global__ void pipelined_gmres_update_result_kernel(T * result,
                                                     T const * residual,
                                                     T const * krylov_basis,
                                                     unsigned int size,
                                                     unsigned int internal_size,
                                                     T const * coefficients,
                                                     unsigned int k)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    T value_result = result[i] + coefficients[0] * residual[i];

    for (unsigned int j = 1; j < k; ++j)
      value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size];

    result[i] = value_result;
  }
}

template <typename T>
void pipelined_gmres_update_result(vector_base<T> & result,
                                   vector_base<T> const & residual,
                                   vector_base<T> const & krylov_basis,
                                   vcl_size_t v_k_size,
                                   vcl_size_t v_k_internal_size,
                                   vector_base<T> const & coefficients,
                                   vcl_size_t param_k)
{
  unsigned int size = v_k_size;
  unsigned int internal_size = v_k_internal_size;
  unsigned int k = param_k;

  pipelined_gmres_update_result_kernel<<<128, 128>>>(detail::cuda_arg<T>(result),
                                                     detail::cuda_arg<T>(residual),
                                                     detail::cuda_arg<T>(krylov_basis),
                                                     size, internal_size,
                                                     detail::cuda_arg<T>(coefficients),
                                                     k);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_gmres_update_result_kernel");
}
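// Descriptive note with the underlying formula: given the least-squares coefficients eta,
// the kernel above accumulates
//   x <- x + eta[0] * r + sum_{j=1..k-1} eta[j] * v_{j-1}
// where v_{j-1} denotes the Krylov basis vector stored at offset (j-1)*internal_size in the
// krylov_basis buffer.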
template <typename T>
void pipelined_gmres_prod(compressed_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_csr_vec_mul_kernel<<<128, 256>>>(detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                static_cast<unsigned int>(A.blocks1()),
                                                detail::cuda_arg<T>(p) + viennacl::traits::start(p),
                                                detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_csr_vec_mul_kernel");
}

template <typename T>
void pipelined_gmres_prod(coordinate_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  Ap.clear();

  pipelined_cg_coo_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(A.handle12().cuda_handle()),
                                               detail::cuda_arg<T>(A.handle().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                               detail::cuda_arg<T>(p) + viennacl::traits::start(p),
                                               detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
                                               size,
                                               detail::cuda_arg<T>(inner_prod_buffer),
                                               buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_coo_vec_mul_kernel");
}

template <typename T>
void pipelined_gmres_prod(ell_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_ell_vec_mul_kernel<<<128, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.maxnnz()),
                                                detail::cuda_arg<T>(p) + viennacl::traits::start(p),
                                                detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_ell_vec_mul_kernel");
}

template <typename T>
void pipelined_gmres_prod(sliced_ell_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_sliced_ell_vec_mul_kernel<<<128, static_cast<unsigned int>(A.rows_per_block())>>>(
                                                detail::cuda_arg<unsigned int>(A.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                detail::cuda_arg<T>(p) + viennacl::traits::start(p),
                                                detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_sliced_ell_vec_mul_kernel");
}

template <typename T>
void pipelined_gmres_prod(hyb_matrix<T> const & A,
                          vector_base<T> const & p,
                          vector_base<T> & Ap,
                          vector_base<T> & inner_prod_buffer)
{
  unsigned int size = static_cast<unsigned int>(p.size());
  unsigned int buffer_size_per_vector = static_cast<unsigned int>(inner_prod_buffer.size()) / static_cast<unsigned int>(3);

  pipelined_cg_hyb_vec_mul_kernel<<<128, 256>>>(detail::cuda_arg<unsigned int>(A.handle2().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle3().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(A.handle4().cuda_handle()),
                                                detail::cuda_arg<T>(A.handle5().cuda_handle()),
                                                static_cast<unsigned int>(A.internal_size1()),
                                                static_cast<unsigned int>(A.ell_nnz()),
                                                detail::cuda_arg<T>(p) + viennacl::traits::start(p),
                                                detail::cuda_arg<T>(Ap) + viennacl::traits::start(Ap),
                                                size,
                                                detail::cuda_arg<T>(inner_prod_buffer),
                                                buffer_size_per_vector);
  VIENNACL_CUDA_LAST_ERROR_CHECK("pipelined_cg_hyb_vec_mul_kernel");
}
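} // namespace cuda
} // namespace linalg
} // namespace viennacl

#endif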