1 #ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
// NOTE(review): excerpt only - the kernel's signature line, braces, the loop
// bound, the declaration of `value` and the selector logic around the four
// inner loops are not visible here. Presumably this is the body of
// csr_row_info_extractor_kernel (launched further below) - confirm.
// Visible behavior: a thread starts at row blockDim.x * blockIdx.x + threadIdx.x
// and advances by gridDim.x * blockDim.x. For a row, the entries
// elements[row_indices[row]] .. elements[row_indices[row+1] - 1] are visited.
46 template<
typename NumericT>
48 const unsigned int * row_indices,
49 const unsigned int * column_indices,
50 const NumericT * elements,
55 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
57 row += gridDim.x * blockDim.x)
60 unsigned int row_end = row_indices[
row+1];
// Variant: running maximum of |elements[i]| over the row.
65 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
66 value =
max(value, fabs(elements[i]));
// Variant: sum of |elements[i]| over the row.
70 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
71 value += fabs(elements[i]);
// Variant: sum of squares of the row's entries.
75 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
76 value += elements[i] * elements[i];
// Variant: scan the row for the entry whose column index equals the row index.
81 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
83 if (column_indices[i] ==
row)
// Host-side launch of csr_row_info_extractor_kernel with a fixed
// <<<128, 128>>> configuration. Arguments, in order: mat.handle1() and
// mat.handle2() as unsigned int device pointers, mat.handle() as a NumericT
// device pointer, `vec`, mat.size1(), and info_selector cast to unsigned int.
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
// NOTE(review): "AligmentV" looks like a misspelling of "AlignmentV", the
// spelling used by the other templates in this file - consider unifying.
99 template<
typename NumericT,
unsigned int AligmentV>
104 csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle1().cuda_handle()),
105 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
106 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
107 detail::cuda_arg<NumericT>(vec),
108 static_cast<unsigned int>(mat.
size1()),
109 static_cast<unsigned int>(info_selector)
// NOTE(review): excerpt only - the kernel name/signature line, the `x` and
// `result` pointer parameters, `inc_x`, the loop bound and the declaration of
// `dot_prod` are not visible here.
// Visible behavior: a thread starts at row blockDim.x * blockIdx.x + threadIdx.x
// and advances by gridDim.x * blockDim.x. For a row it accumulates
//   dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x]
// for i in [row_indices[row], row_indices[row+1]) and stores the sum at
// result[row * inc_result + start_result].
117 template<
typename NumericT>
119 const unsigned int * row_indices,
120 const unsigned int * column_indices,
121 const NumericT * elements,
123 unsigned int start_x,
126 unsigned int start_result,
127 unsigned int inc_result,
128 unsigned int size_result)
130 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
132 row += gridDim.x * blockDim.x)
135 unsigned int row_end = row_indices[
row+1];
136 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
137 dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
138 result[
row * inc_result + start_result] =
dot_prod;
// NOTE(review): excerpt only - the kernel name/signature line, the `x` and
// `result` pointer parameters, braces, any barriers and the loop header around
// the stride reduction are not visible here. Presumably this is
// compressed_matrix_vec_mul_adaptive_kernel (launched further below) - confirm.
// Visible behavior: thread blocks iterate over the entries of row_blocks
// (block_id starts at blockIdx.x, step gridDim.x). Each entry covers rows
// [row_blocks[block_id], row_blocks[block_id + 1]) and the matching element
// range taken from row_indices. Products elements[i] * x[...] are staged in a
// __shared__ buffer of 1024 NumericT values.
// NOTE(review): shared_elements[i - element_start] stays in bounds only if a
// multi-row block never covers more than 1024 elements; that guarantee would
// have to come from how row_blocks is built, which is not shown - verify.
145 template<
typename NumericT>
147 const unsigned int * row_indices,
148 const unsigned int * column_indices,
149 const unsigned int * row_blocks,
150 const NumericT * elements,
151 unsigned int num_blocks,
153 unsigned int start_x,
156 unsigned int start_result,
157 unsigned int inc_result,
158 unsigned int size_result)
160 __shared__ NumericT shared_elements[1024];
162 for (
unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
164 unsigned int row_start = row_blocks[block_id];
165 unsigned int row_stop = row_blocks[block_id + 1];
166 unsigned int element_start = row_indices[row_start];
167 unsigned int element_stop = row_indices[row_stop];
168 unsigned int rows_to_process = row_stop - row_start;
// More than one row in this block: stage every product in shared memory,
// then let each thread sum the staged products of one row at a time.
170 if (rows_to_process > 1)
173 for (
unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
174 shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];
179 for (
unsigned int row = row_start + threadIdx.x;
row < row_stop;
row += blockDim.x)
182 unsigned int thread_row_start = row_indices[
row] - element_start;
183 unsigned int thread_row_stop = row_indices[
row + 1] - element_start;
184 for (
unsigned int i = thread_row_start; i < thread_row_stop; ++i)
185 dot_prod += shared_elements[i];
186 result[
row * inc_result + start_result] =
dot_prod;
// Remaining case (the `else` line itself is not visible): every thread builds a
// partial sum in its own shared slot, the slots are combined pairwise with a
// `stride` offset, and thread 0 writes shared_elements[0] for row_start.
193 shared_elements[threadIdx.x] = 0;
194 for (
unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
195 shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];
202 shared_elements[threadIdx.x] += shared_elements[threadIdx.x+
stride];
205 if (threadIdx.x == 0)
206 result[row_start * inc_result + start_result] = shared_elements[0];
// Host-side launch of compressed_matrix_vec_mul_adaptive_kernel with a fixed
// <<<256, 256>>> configuration. Arguments, in order: mat.handle1(),
// mat.handle2(), mat.handle3() (unsigned int device pointers), mat.handle()
// (NumericT device pointer), mat.blocks1(), then `vec` with its start() and
// stride(), and `result` with its start(), stride() and size().
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
224 template<
class NumericT,
unsigned int AlignmentV>
229 compressed_matrix_vec_mul_adaptive_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(mat.
handle1().cuda_handle()),
230 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
231 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
232 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
233 static_cast<unsigned int>(mat.
blocks1()),
234 detail::cuda_arg<NumericT>(vec),
235 static_cast<unsigned int>(vec.
start()),
236 static_cast<unsigned int>(vec.
stride()),
237 detail::cuda_arg<NumericT>(result),
238 static_cast<unsigned int>(result.
start()),
239 static_cast<unsigned int>(result.
stride()),
240 static_cast<unsigned int>(result.
size())
// Device-side helpers that turn a logical (i, j) position of a strided dense
// matrix into a linear offset.
// NOTE(review): the enclosing struct headers are not visible in this excerpt;
// only one template line and two static apply() bodies are.
249 template<
typename LayoutT>
// First form: the strided row index (row_start + i * row_inc) is scaled by
// internal_cols and the strided column index is added.
252 static __device__
unsigned int apply(
unsigned int i,
unsigned int j,
253 unsigned int row_start,
unsigned int row_inc,
254 unsigned int col_start,
unsigned int col_inc,
255 unsigned int internal_rows,
unsigned int internal_cols)
257 return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
// Second form: the strided column index (col_start + j * col_inc) is scaled by
// internal_rows and the strided row index is added. Presumably this belongs to
// a specialization for a different layout whose header is not shown - confirm.
265 static __device__
unsigned int apply(
unsigned int i,
unsigned int j,
266 unsigned int row_start,
unsigned int row_inc,
267 unsigned int col_start,
unsigned int col_inc,
268 unsigned int internal_rows,
unsigned int internal_cols)
270 return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
// NOTE(review): excerpt only - the kernel name/signature line, the `result`
// pointer parameter, braces, and the declaration/update of `r` are not visible.
// Visible behavior: thread blocks walk result rows (row starts at blockIdx.x,
// step gridDim.x) and the threads of a block walk result columns (col starts
// at threadIdx.x, step blockDim.x). For each (row, col) the sparse entries
// k in [sp_mat_row_indices[row], sp_mat_row_indices[row+1]) are visited, reading
// x = sp_mat_elements[k] and y = d_mat at logical position (j, col) with
// j = sp_mat_col_indices[k]. The value r is stored at logical position
// (row, col) of result. DMatIndexT / ResultIndexT supply the offset mapping.
276 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
278 const unsigned int * sp_mat_row_indices,
279 const unsigned int * sp_mat_col_indices,
280 const NumericT * sp_mat_elements,
281 const NumericT * d_mat,
282 unsigned int d_mat_row_start,
283 unsigned int d_mat_col_start,
284 unsigned int d_mat_row_inc,
285 unsigned int d_mat_col_inc,
286 unsigned int d_mat_row_size,
287 unsigned int d_mat_col_size,
288 unsigned int d_mat_internal_rows,
289 unsigned int d_mat_internal_cols,
291 unsigned int result_row_start,
292 unsigned int result_col_start,
293 unsigned int result_row_inc,
294 unsigned int result_col_inc,
295 unsigned int result_row_size,
296 unsigned int result_col_size,
297 unsigned int result_internal_rows,
298 unsigned int result_internal_cols)
300 for (
unsigned int row = blockIdx.x;
row < result_row_size;
row += gridDim.x)
302 unsigned int row_start = sp_mat_row_indices[
row];
303 unsigned int row_end = sp_mat_row_indices[
row+1];
305 for (
unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
309 for (
unsigned int k = row_start; k < row_end; k++)
311 unsigned int j = sp_mat_col_indices[k];
312 NumericT x = sp_mat_elements[k];
313 NumericT y = d_mat[ DMatIndexT::apply(j, col,
314 d_mat_row_start, d_mat_row_inc,
315 d_mat_col_start, d_mat_col_inc,
316 d_mat_internal_rows, d_mat_internal_cols) ];
321 result[ResultIndexT::apply(
row, col,
322 result_row_start, result_row_inc,
323 result_col_start, result_col_inc,
324 result_internal_rows, result_internal_cols)] = r;
// NOTE(review): excerpt only - the function signature, any branching between
// the launches, and the kernel name / launch-configuration lines are not
// visible. What is visible are four launch argument lists, each starting with
// sp_mat.handle1(), sp_mat.handle2() (unsigned int device pointers) and
// sp_mat.handle() (NumericT device pointer), followed by `d_mat` and `result`.
// Presumably one launch per combination of operand storage layouts - confirm.
338 template<
typename NumericT,
unsigned int AlignmentV>
346 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
347 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
348 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
350 detail::cuda_arg<NumericT>(d_mat),
356 detail::cuda_arg<NumericT>(result),
367 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
368 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
369 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
371 detail::cuda_arg<NumericT>(d_mat),
377 detail::cuda_arg<NumericT>(result),
388 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
389 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
390 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
392 detail::cuda_arg<NumericT>(d_mat),
398 detail::cuda_arg<NumericT>(result),
409 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
410 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
411 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
413 detail::cuda_arg<NumericT>(d_mat),
419 detail::cuda_arg<NumericT>(result),
// NOTE(review): excerpt only - the kernel name/signature line, the `result`
// pointer parameter, braces, and the declaration/update of `r` are not visible.
// Visible behavior: thread blocks walk result rows (row starts at blockIdx.x,
// step gridDim.x) and the threads of a block walk result columns (col starts
// at threadIdx.x, step blockDim.x). For each (row, col) the sparse entries
// k in [sp_mat_row_indices[row], sp_mat_row_indices[row+1]) are visited, reading
// x = sp_mat_elements[k] and y = d_mat at logical position (col, j) with
// j = sp_mat_col_indices[k] - i.e. the dense operand is indexed with the result
// column as its row. The value r is stored at position (row, col) of result.
430 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
432 const unsigned int * sp_mat_row_indices,
433 const unsigned int * sp_mat_col_indices,
434 const NumericT * sp_mat_elements,
435 const NumericT * d_mat,
436 unsigned int d_mat_row_start,
437 unsigned int d_mat_col_start,
438 unsigned int d_mat_row_inc,
439 unsigned int d_mat_col_inc,
440 unsigned int d_mat_row_size,
441 unsigned int d_mat_col_size,
442 unsigned int d_mat_internal_rows,
443 unsigned int d_mat_internal_cols,
445 unsigned int result_row_start,
446 unsigned int result_col_start,
447 unsigned int result_row_inc,
448 unsigned int result_col_inc,
449 unsigned int result_row_size,
450 unsigned int result_col_size,
451 unsigned int result_internal_rows,
452 unsigned int result_internal_cols)
454 for (
unsigned int row = blockIdx.x;
row < result_row_size;
row += gridDim.x)
456 unsigned int row_start = sp_mat_row_indices[
row];
457 unsigned int row_end = sp_mat_row_indices[
row+1];
459 for (
unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
463 for (
unsigned int k = row_start; k < row_end; k++)
465 unsigned int j = sp_mat_col_indices[k];
466 NumericT x = sp_mat_elements[k];
467 NumericT y = d_mat[ DMatIndexT::apply(col, j,
468 d_mat_row_start, d_mat_row_inc,
469 d_mat_col_start, d_mat_col_inc,
470 d_mat_internal_rows, d_mat_internal_cols) ];
475 result [ ResultIndexT::apply(
row, col,
476 result_row_start, result_row_inc,
477 result_col_start, result_col_inc,
478 result_internal_rows, result_internal_cols) ] = r;
// NOTE(review): excerpt only - the function signature and the kernel name /
// launch-configuration lines are not visible.
// Visible behavior: a dispatch on d_mat.lhs().row_major() and
// result.row_major(). Three conditions are shown; the fourth launch presumably
// sits under a final `else` that is not visible. Every branch passes
// sp_mat.handle1(), sp_mat.handle2(), sp_mat.handle(), d_mat.lhs() and result.
493 template<
typename NumericT,
unsigned int AlignmentV>
501 if (d_mat.lhs().row_major() && result.
row_major())
504 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
505 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
506 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
508 detail::cuda_arg<NumericT>(d_mat.lhs()),
514 detail::cuda_arg<NumericT>(result),
522 else if (d_mat.lhs().row_major() && !result.
row_major())
525 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
526 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
527 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
529 detail::cuda_arg<NumericT>(d_mat.lhs()),
535 detail::cuda_arg<NumericT>(result),
543 else if (!d_mat.lhs().row_major() && result.
row_major())
546 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
547 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
548 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
550 detail::cuda_arg<NumericT>(d_mat.lhs()),
556 detail::cuda_arg<NumericT>(result),
567 (detail::cuda_arg<unsigned int>(sp_mat.
handle1().cuda_handle()),
568 detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
569 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
571 detail::cuda_arg<NumericT>(d_mat.lhs()),
577 detail::cuda_arg<NumericT>(result),
// NOTE(review): excerpt only - the kernel name/signature line, the output
// parameter, the loop bound, the body of the `if` and the final store are not
// visible. Presumably this is compressed_matrix_diagonal_kernel (launched
// further below) - confirm.
// Visible behavior: a thread starts at row blockDim.x * blockIdx.x + threadIdx.x
// and advances by gridDim.x * blockDim.x. For each row, `diag` is initialised
// to NumericT(0) and the row's entries are scanned for col_index == row.
592 template<
typename NumericT>
594 const unsigned int * row_indices,
595 const unsigned int * column_indices,
596 const NumericT * elements,
600 for (
unsigned int row = blockDim.x * blockIdx.x + threadIdx.x;
602 row += gridDim.x * blockDim.x)
604 NumericT
diag = NumericT(0);
605 unsigned int row_end = row_indices[
row+1];
606 for (
unsigned int i = row_indices[
row]; i < row_end; ++i)
608 unsigned int col_index = column_indices[i];
609 if (col_index ==
row)
// Host-side launch of csr_unit_lu_forward_kernel using a single thread block
// of 128 threads (<<<1, 128>>>). Arguments, in order: mat.handle1(),
// mat.handle2() (unsigned int device pointers), mat.handle() and `vec`
// (NumericT device pointers), and mat.size1().
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
625 template<
typename SparseMatrixT,
typename NumericT>
631 csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
632 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
633 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
634 detail::cuda_arg<NumericT>(vec),
635 static_cast<unsigned int>(mat.size1())
// Host-side launch of csr_lu_forward_kernel using a single thread block of
// 128 threads (<<<1, 128>>>). Arguments, in order: mat.handle1(),
// mat.handle2() (unsigned int device pointers), mat.handle() and `vec`
// (NumericT device pointers), and mat.size1().
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
646 template<
typename SparseMatrixT,
typename NumericT>
652 csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
653 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
654 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
655 detail::cuda_arg<NumericT>(vec),
656 static_cast<unsigned int>(mat.size1())
// Host-side launch of csr_unit_lu_backward_kernel using a single thread block
// of 128 threads (<<<1, 128>>>). Arguments, in order: mat.handle1(),
// mat.handle2() (unsigned int device pointers), mat.handle() and `vec`
// (NumericT device pointers), and mat.size1().
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
668 template<
typename SparseMatrixT,
typename NumericT>
674 csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
675 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
676 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
677 detail::cuda_arg<NumericT>(vec),
678 static_cast<unsigned int>(mat.size1())
// Host-side launch of csr_lu_backward_kernel using a single thread block of
// 128 threads (<<<1, 128>>>). Arguments, in order: mat.handle1(),
// mat.handle2() (unsigned int device pointers), mat.handle() and `vec`
// (NumericT device pointers), and mat.size1().
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
689 template<
typename SparseMatrixT,
typename NumericT>
695 csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
696 detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
697 detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
698 detail::cuda_arg<NumericT>(vec),
699 static_cast<unsigned int>(mat.size1())
// Host-side launch of csr_trans_unit_lu_forward_kernel using a single thread
// block of 128 threads (<<<1, 128>>>). All matrix data is taken from
// mat.lhs(): handle1(), handle2() (unsigned int device pointers), handle()
// (NumericT device pointer) and size1(); `vec` is passed as a NumericT pointer.
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
713 template<
typename SparseMatrixT,
typename NumericT>
719 csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
720 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
721 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
722 detail::cuda_arg<NumericT>(vec),
723 static_cast<unsigned int>(mat.
lhs().size1())
// Two consecutive single-block launches (<<<1, 128>>> each):
//  1. compressed_matrix_diagonal_kernel on the handles of mat.lhs(), with
//     `diagonal` as the NumericT pointer argument;
//  2. csr_trans_lu_forward_kernel on the same handles, passing `diagonal`
//     and `vec`.
// NOTE(review): the enclosing function's signature and the declaration of
// `diagonal` are not visible in this excerpt.
// NOTE(review): the first launch passes mat.size1() while the second passes
// mat.lhs().size1(). These agree only if both report the same row count
// (e.g. a square matrix) - confirm this is intended.
734 template<
typename SparseMatrixT,
typename NumericT>
742 compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
743 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
744 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
745 detail::cuda_arg<NumericT>(diagonal),
746 static_cast<unsigned int>(mat.
size1())
749 csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
750 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
751 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
752 detail::cuda_arg<NumericT>(diagonal),
753 detail::cuda_arg<NumericT>(vec),
754 static_cast<unsigned int>(mat.
lhs().size1())
// Host-side launch of csr_trans_unit_lu_backward_kernel using a single thread
// block of 128 threads (<<<1, 128>>>). All matrix data is taken from
// mat.lhs(): handle1(), handle2() (unsigned int device pointers), handle()
// (NumericT device pointer) and size1(); `vec` is passed as a NumericT pointer.
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
765 template<
typename SparseMatrixT,
typename NumericT>
771 csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
772 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
773 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
774 detail::cuda_arg<NumericT>(vec),
775 static_cast<unsigned int>(mat.
lhs().size1())
// Two consecutive single-block launches (<<<1, 128>>> each):
//  1. compressed_matrix_diagonal_kernel on the handles of mat.lhs(), with
//     `diagonal` as the NumericT pointer argument;
//  2. csr_trans_lu_backward_kernel on the same handles, passing `diagonal`
//     and `vec`.
// NOTE(review): the enclosing function's signature and the declaration of
// `diagonal` are not visible in this excerpt.
// NOTE(review): the first launch passes mat.size1() while the second passes
// mat.lhs().size1(). These agree only if both report the same row count
// (e.g. a square matrix) - confirm this is intended.
786 template<
typename SparseMatrixT,
typename NumericT>
794 compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
795 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
796 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
797 detail::cuda_arg<NumericT>(diagonal),
798 static_cast<unsigned int>(mat.
size1())
801 csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.
lhs().handle1().cuda_handle()),
802 detail::cuda_arg<unsigned int>(mat.
lhs().handle2().cuda_handle()),
803 detail::cuda_arg<NumericT>(mat.
lhs().handle().cuda_handle()),
804 detail::cuda_arg<NumericT>(diagonal),
805 detail::cuda_arg<NumericT>(vec),
806 static_cast<unsigned int>(mat.
lhs().size1())
// Host-side launch of csr_block_trans_unit_lu_forward with `num_blocks` thread
// blocks of 128 threads. Arguments, in order: the three handles of L.lhs(),
// block_indices as an unsigned int device pointer, `vec`, and L.lhs().size1().
// NOTE(review): the enclosing function's signature and the origin of
// `num_blocks` are not visible in this excerpt.
816 template<
typename NumericT,
unsigned int AlignmentV>
825 csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
826 detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
827 detail::cuda_arg<NumericT>(L.lhs().handle().cuda_handle()),
828 detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
829 detail::cuda_arg<NumericT>(vec),
830 static_cast<unsigned int>(L.lhs().size1())
// Host-side launch of csr_block_trans_lu_backward with `num_blocks` thread
// blocks of 128 threads. Arguments, in order: the three handles of U.lhs(),
// U_diagonal.handle() as a NumericT device pointer, block_indices as an
// unsigned int device pointer, `vec`, and U.lhs().size1().
// NOTE(review): the enclosing function's signature and the origin of
// `num_blocks` are not visible in this excerpt.
835 template<
typename NumericT,
unsigned int AlignmentV>
844 csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
845 detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
846 detail::cuda_arg<NumericT>(U.lhs().handle().cuda_handle()),
847 detail::cuda_arg<NumericT>(U_diagonal.
handle().cuda_handle()),
848 detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
849 detail::cuda_arg<NumericT>(vec),
850 static_cast<unsigned int>(U.lhs().size1())
// NOTE(review): excerpt only - the kernel name/signature line, the `x` and
// `result` pointer parameters, `inc_x`, both loop bounds and the declaration
// of `dot_prod` are not visible. Presumably this is
// compressed_compressed_matrix_vec_mul_kernel (launched further below) - confirm.
// Visible behavior, two passes that both start at
// blockDim.x * blockIdx.x + threadIdx.x and step by gridDim.x * blockDim.x:
//  1. write 0 to result[i * inc_result + start_result];
//  2. for index i, accumulate elements[j] * x[column_indices[j] * inc_x + start_x]
//     for j in [row_jumper[i], row_jumper[i+1]) and store the sum at
//     result[row_indices[i] * inc_result + start_result].
// NOTE(review): no barrier between the two passes is visible here; whether
// one is needed depends on the omitted loop bounds - verify against the full file.
862 template<
typename NumericT>
864 const unsigned int * row_jumper,
865 const unsigned int * row_indices,
866 const unsigned int * column_indices,
867 const NumericT * elements,
868 unsigned int nonzero_rows,
870 unsigned int start_x,
873 unsigned int start_result,
874 unsigned int inc_result,
875 unsigned int size_result)
877 for (
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
879 i += gridDim.x * blockDim.x)
881 result[i * inc_result + start_result] = 0;
884 for (
unsigned int i = blockDim.x * blockIdx.x + threadIdx.x;
886 i += gridDim.x * blockDim.x)
889 unsigned int row_end = row_jumper[i+1];
890 for (
unsigned int j = row_jumper[i]; j < row_end; ++j)
891 dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
892 result[row_indices[i] * inc_result + start_result] =
dot_prod;
// Host-side launch of compressed_compressed_matrix_vec_mul_kernel with a fixed
// <<<128, 128>>> configuration. Arguments, in order: mat.handle1(),
// mat.handle3(), mat.handle2() (unsigned int device pointers - note that
// handle3 is passed before handle2), mat.handle() (NumericT device pointer),
// mat.nnz1(), then `vec` with its start() and stride(), and `result` with its
// start(), stride() and size().
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
905 template<
typename NumericT>
910 compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle1().cuda_handle()),
911 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
912 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
913 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
914 static_cast<unsigned int>(mat.
nnz1()),
915 detail::cuda_arg<NumericT>(vec),
916 static_cast<unsigned int>(vec.
start()),
917 static_cast<unsigned int>(vec.
stride()),
918 detail::cuda_arg<NumericT>(result),
919 static_cast<unsigned int>(result.
start()),
920 static_cast<unsigned int>(result.
stride()),
921 static_cast<unsigned int>(result.
size())
// NOTE(review): excerpt only - the kernel name/signature line, the `coords`,
// `result` and `option` parameters, the declarations of `tmp` and `val`,
// braces, the switch/case labels that select between the alternative
// statements below, the stride loop header and any barriers are not visible.
// Presumably this is coo_row_info_extractor (launched further below) - confirm.
// Visible behavior: a thread block handles the entry range
// [group_boundaries[blockIdx.x], group_boundaries[blockIdx.x + 1]) in k_end
// chunks of blockDim.x entries. An entry's index pair is read as a uint2 from
// `coords`; tmp.x is used as the row id, and for option 3 only entries with
// tmp.x == tmp.y contribute. Row ids and per-thread values go into two
// __shared__ arrays and are combined between threads holding the same row id.
// For option 2 the value written to `result` is the sqrt of the accumulated one.
// NOTE(review): both shared arrays have 128 entries and are indexed with
// threadIdx.x, so this code requires blockDim.x <= 128.
934 template<
typename NumericT>
936 const NumericT * elements,
937 const unsigned int * group_boundaries,
941 __shared__
unsigned int shared_rows[128];
942 __shared__ NumericT inter_results[128];
946 unsigned int last_index = blockDim.x - 1;
947 unsigned int group_start = group_boundaries[blockIdx.x];
948 unsigned int group_end = group_boundaries[blockIdx.x + 1];
// Number of blockDim.x-sized chunks needed to cover the range (0 if empty).
949 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
951 unsigned int local_index = 0;
953 for (
unsigned int k = 0; k < k_end; ++k)
955 local_index = group_start + k * blockDim.x + threadIdx.x;
// Threads past the end of the range get a (0, 0) index pair and a value of 0.
957 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
958 val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;
// From the second chunk on, thread 0 looks at what the last thread left in
// shared memory: same row id -> fold it into val; otherwise write it out.
961 if (threadIdx.x == 0 && k > 0)
963 if (tmp.x == shared_rows[last_index])
969 val =
max(val, fabs(inter_results[last_index]));
973 val = fabs(val) + inter_results[last_index];
977 val = sqrt(val * val + inter_results[last_index]);
991 result[shared_rows[last_index]] = inter_results[last_index];
995 result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
1004 shared_rows[threadIdx.x] = tmp.x;
1009 inter_results[threadIdx.x] = val;
1012 inter_results[threadIdx.x] = fabs(val);
1015 inter_results[threadIdx.x] = val * val;
// `left` is the value held `stride` threads to the left if it has the same
// row id, otherwise 0.
1023 NumericT left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1029 inter_results[threadIdx.x] =
max(inter_results[threadIdx.x], left);
1033 inter_results[threadIdx.x] += left;
1037 inter_results[threadIdx.x] += left;
// A thread whose right-hand neighbour holds a different row id writes its
// (non-zero) value out.
1047 if (threadIdx.x != last_index &&
1048 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
1049 inter_results[threadIdx.x] != 0)
1051 result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
// The thread holding the very last entry of the range writes its value out.
1057 if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
1058 result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
// Host-side launch of coo_row_info_extractor with a fixed <<<64, 128>>>
// configuration. Arguments, in order: mat.handle12() (unsigned int device
// pointer), mat.handle() (NumericT device pointer), mat.handle3() (unsigned
// int device pointer), `vec`, and info_selector cast to unsigned int.
// NOTE(review): the enclosing function's signature is not visible in this excerpt.
1061 template<
typename NumericT,
unsigned int AlignmentV>
1066 coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle12().cuda_handle()),
1067 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
1068 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
1069 detail::cuda_arg<NumericT>(vec),
1070 static_cast<unsigned int>(info_selector)
// NOTE(review): excerpt only - the kernel name/signature line, the `coords`,
// `x` and `result` parameters, `inc_x`, the declarations of `tmp`, `val` and
// `left`, braces, the `else` line, the stride loop header and any barriers are
// not visible. Presumably this is coordinate_matrix_vec_mul_kernel (launched
// further below) - confirm.
// Visible behavior: a thread block handles the entry range
// [group_boundaries[blockIdx.x], group_boundaries[blockIdx.x + 1]) in k_end
// chunks of blockDim.x entries. For each entry the index pair is read as a
// uint2 from `coords` and val = elements[local_index] * x[tmp.y * inc_x + start_x].
// Values of threads holding the same row id (tmp.x) are combined through two
// __shared__ arrays and written to result[row * inc_result + start_result].
// NOTE(review): both shared arrays have 128 entries and are indexed with
// threadIdx.x, so this code requires blockDim.x <= 128.
1078 template<
typename NumericT>
1080 const NumericT * elements,
1081 const unsigned int * group_boundaries,
1083 unsigned int start_x,
1086 unsigned int start_result,
1087 unsigned int inc_result
1090 __shared__
unsigned int shared_rows[128];
1091 __shared__ NumericT inter_results[128];
1095 unsigned int group_start = group_boundaries[blockIdx.x];
1096 unsigned int group_end = group_boundaries[blockIdx.x + 1];
// Number of blockDim.x-sized chunks needed to cover the range (0 if empty).
1097 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
1099 unsigned int local_index = 0;
1101 for (
unsigned int k = 0; k < k_end; ++k)
1103 local_index = group_start + k * blockDim.x + threadIdx.x;
// Threads past the end of the range get a (0, 0) index pair and a value of 0.
1105 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1106 val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;
// From the second chunk on, thread 0 looks at what the last thread left in
// shared memory: same row id -> add it to val; otherwise write it out.
1109 if (threadIdx.x == 0 && k > 0)
1111 if (tmp.x == shared_rows[blockDim.x-1])
1112 val += inter_results[blockDim.x-1];
1114 result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
1119 shared_rows[threadIdx.x] = tmp.x;
1120 inter_results[threadIdx.x] = val;
// `left` is the value held `stride` threads to the left if it has the same
// row id, otherwise 0.
1126 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1128 inter_results[threadIdx.x] += left;
// A thread whose right-hand neighbour holds a different row id writes its sum.
1133 if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1134 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1136 result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
// The thread holding the very last entry of the range writes its sum.
1142 if (local_index + 1 == group_end)
1143 result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
// Host-side launch of coordinate_matrix_vec_mul_kernel with a fixed
// <<<64, 128>>> configuration. Arguments, in order: mat.handle12() (unsigned
// int device pointer), mat.handle() (NumericT device pointer), mat.handle3()
// (unsigned int device pointer), then `vec` with its start() and stride(), and
// `result` with its start() and stride().
// NOTE(review): the enclosing function's signature and any statements before
// the launch are not visible in this excerpt.
1155 template<
typename NumericT,
unsigned int AlignmentV>
1162 coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle12().cuda_handle()),
1163 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
1164 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
1165 detail::cuda_arg<NumericT>(vec),
1166 static_cast<unsigned int>(vec.
start()),
1167 static_cast<unsigned int>(vec.
stride()),
1168 detail::cuda_arg<NumericT>(result),
1169 static_cast<unsigned int>(result.
start()),
1170 static_cast<unsigned int>(result.
stride())
// NOTE(review): excerpt only - the kernel name/signature line, the `coords`
// and `result` parameters, the declarations of `tmp`, `val` and `left`, braces,
// the `else` line, the stride loop header and any barriers are not visible.
// Visible behavior: for every result column (outer loop over result_col) a
// thread block handles the entry range
// [group_boundaries[blockIdx.x], group_boundaries[blockIdx.x + 1]) in k_end
// chunks of blockDim.x entries. For each entry the index pair is read as a
// uint2 from `coords` and val = elements[local_index] * d_mat at logical
// position (tmp.y, result_col). Values of threads holding the same row id
// (tmp.x) are combined through two __shared__ arrays and written to logical
// position (row id, result_col) of `result`.
// NOTE(review): both shared arrays have 128 entries and are indexed with
// threadIdx.x, so this code requires blockDim.x <= 128.
1178 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
1180 const NumericT * elements,
1181 const unsigned int * group_boundaries,
1182 const NumericT * d_mat,
1183 unsigned int d_mat_row_start,
1184 unsigned int d_mat_col_start,
1185 unsigned int d_mat_row_inc,
1186 unsigned int d_mat_col_inc,
1187 unsigned int d_mat_row_size,
1188 unsigned int d_mat_col_size,
1189 unsigned int d_mat_internal_rows,
1190 unsigned int d_mat_internal_cols,
1192 unsigned int result_row_start,
1193 unsigned int result_col_start,
1194 unsigned int result_row_inc,
1195 unsigned int result_col_inc,
1196 unsigned int result_row_size,
1197 unsigned int result_col_size,
1198 unsigned int result_internal_rows,
1199 unsigned int result_internal_cols)
1201 __shared__
unsigned int shared_rows[128];
1202 __shared__ NumericT inter_results[128];
1206 unsigned int group_start = group_boundaries[blockIdx.x];
1207 unsigned int group_end = group_boundaries[blockIdx.x + 1];
// Number of blockDim.x-sized chunks needed to cover the range (0 if empty).
1208 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
1210 unsigned int local_index = 0;
1212 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1214 for (
unsigned int k = 0; k < k_end; ++k)
1216 local_index = group_start + k * blockDim.x + threadIdx.x;
// Threads past the end of the range get a (0, 0) index pair and a value of 0.
1218 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1219 val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
1220 d_mat_row_start, d_mat_row_inc,
1221 d_mat_col_start, d_mat_col_inc,
1222 d_mat_internal_rows, d_mat_internal_cols) ] : 0;
// From the second chunk on, thread 0 looks at what the last thread left in
// shared memory: same row id -> add it to val; otherwise write it out.
1225 if (threadIdx.x == 0 && k > 0)
1227 if (tmp.x == shared_rows[blockDim.x-1])
1228 val += inter_results[blockDim.x-1];
1230 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1231 result_row_start, result_row_inc,
1232 result_col_start, result_col_inc,
1233 result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
1238 shared_rows[threadIdx.x] = tmp.x;
1239 inter_results[threadIdx.x] = val;
// `left` is the value held `stride` threads to the left if it has the same
// row id, otherwise 0.
1245 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1247 inter_results[threadIdx.x] += left;
// A thread whose right-hand neighbour holds a different row id writes its sum.
1252 if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1253 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1255 result[ResultIndexT::apply(tmp.x, result_col,
1256 result_row_start, result_row_inc,
1257 result_col_start, result_col_inc,
1258 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
// The thread holding the very last entry of the range writes its sum.
1264 if (local_index + 1 == group_end)
1265 result[ResultIndexT::apply(tmp.x, result_col,
1266 result_row_start, result_row_inc,
1267 result_col_start, result_col_inc,
1268 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
// NOTE(review): excerpt only - the function signature, any branching between
// the launches, and the kernel name / launch-configuration lines are not
// visible. What is visible are four launch argument lists, each starting with
// sp_mat.handle12() (unsigned int device pointer), sp_mat.handle() (NumericT
// device pointer) and sp_mat.handle3() (unsigned int device pointer), followed
// by `d_mat` and `result`. Presumably one launch per combination of operand
// storage layouts - confirm.
1281 template<
typename NumericT,
unsigned int AlignmentV>
1289 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1290 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1291 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1293 detail::cuda_arg<NumericT>(d_mat),
1299 detail::cuda_arg<NumericT>(result),
1310 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1311 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1312 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1314 detail::cuda_arg<NumericT>(d_mat),
1320 detail::cuda_arg<NumericT>(result),
1331 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1332 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1333 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1335 detail::cuda_arg<NumericT>(d_mat),
1341 detail::cuda_arg<NumericT>(result),
1352 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1353 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1354 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1356 detail::cuda_arg<NumericT>(d_mat),
1362 detail::cuda_arg<NumericT>(result),
// NOTE(review): excerpt only - the kernel name/signature line, the `coords`
// and `result` parameters, the declarations of `tmp`, `val` and `left`, braces,
// the `else` line, the stride loop header and any barriers are not visible.
// Visible behavior: for every result column (outer loop over result_col) a
// thread block handles the entry range
// [group_boundaries[blockIdx.x], group_boundaries[blockIdx.x + 1]) in k_end
// chunks of blockDim.x entries. For each entry the index pair is read as a
// uint2 from `coords` and val = elements[local_index] * d_mat at logical
// position (result_col, tmp.y) - i.e. the dense operand is indexed with the
// result column as its row. Values of threads holding the same row id (tmp.x)
// are combined through two __shared__ arrays and written to logical position
// (row id, result_col) of `result`.
// NOTE(review): both shared arrays have 128 entries and are indexed with
// threadIdx.x, so this code requires blockDim.x <= 128.
1373 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
1375 const NumericT * elements,
1376 const unsigned int * group_boundaries,
1377 const NumericT * d_mat,
1378 unsigned int d_mat_row_start,
1379 unsigned int d_mat_col_start,
1380 unsigned int d_mat_row_inc,
1381 unsigned int d_mat_col_inc,
1382 unsigned int d_mat_row_size,
1383 unsigned int d_mat_col_size,
1384 unsigned int d_mat_internal_rows,
1385 unsigned int d_mat_internal_cols,
1387 unsigned int result_row_start,
1388 unsigned int result_col_start,
1389 unsigned int result_row_inc,
1390 unsigned int result_col_inc,
1391 unsigned int result_row_size,
1392 unsigned int result_col_size,
1393 unsigned int result_internal_rows,
1394 unsigned int result_internal_cols)
1396 __shared__
unsigned int shared_rows[128];
1397 __shared__ NumericT inter_results[128];
1401 unsigned int group_start = group_boundaries[blockIdx.x];
1402 unsigned int group_end = group_boundaries[blockIdx.x + 1];
// Number of blockDim.x-sized chunks needed to cover the range (0 if empty).
1403 unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;
1405 unsigned int local_index = 0;
1407 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
1409 for (
unsigned int k = 0; k < k_end; ++k)
1411 local_index = group_start + k * blockDim.x + threadIdx.x;
// Threads past the end of the range get a (0, 0) index pair and a value of 0.
1413 tmp = (local_index < group_end) ? ((
const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
1414 val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
1415 d_mat_row_start, d_mat_row_inc,
1416 d_mat_col_start, d_mat_col_inc,
1417 d_mat_internal_rows, d_mat_internal_cols)] : 0;
// From the second chunk on, thread 0 looks at what the last thread left in
// shared memory: same row id -> add it to val; otherwise write it out.
1420 if (threadIdx.x == 0 && k > 0)
1422 if (tmp.x == shared_rows[blockDim.x-1])
1423 val += inter_results[blockDim.x-1];
1425 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
1426 result_row_start, result_row_inc,
1427 result_col_start, result_col_inc,
1428 result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
1433 shared_rows[threadIdx.x] = tmp.x;
1434 inter_results[threadIdx.x] = val;
// `left` is the value held `stride` threads to the left if it has the same
// row id, otherwise 0.
1440 left = (threadIdx.x >=
stride && tmp.x == shared_rows[threadIdx.x -
stride]) ? inter_results[threadIdx.x -
stride] : 0;
1442 inter_results[threadIdx.x] += left;
// A thread whose right-hand neighbour holds a different row id writes its sum.
1447 if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
1448 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
1450 result[ ResultIndexT::apply(tmp.x, result_col,
1451 result_row_start, result_row_inc,
1452 result_col_start, result_col_inc,
1453 result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
// The thread holding the very last entry of the range writes its sum.
1459 if (local_index + 1 == group_end)
1460 result[ ResultIndexT::apply(tmp.x, result_col,
1461 result_row_start, result_row_inc,
1462 result_col_start, result_col_inc,
1463 result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
// NOTE(review): excerpt only - the function signature and the kernel name /
// launch-configuration lines are not visible.
// Visible behavior: a dispatch on d_mat.lhs().row_major() and
// result.row_major(). Three conditions are shown; the fourth launch presumably
// sits under a final `else` that is not visible. Every branch passes
// sp_mat.handle12(), sp_mat.handle(), sp_mat.handle3(), d_mat.lhs() and result.
1475 template<
typename NumericT,
unsigned int AlignmentV>
1482 if (d_mat.lhs().row_major() && result.
row_major())
1485 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1486 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1487 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1489 detail::cuda_arg<NumericT>(d_mat.lhs()),
1495 detail::cuda_arg<NumericT>(result),
1503 else if (d_mat.lhs().row_major() && !result.
row_major())
1506 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1507 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1508 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1510 detail::cuda_arg<NumericT>(d_mat.lhs()),
1516 detail::cuda_arg<NumericT>(result),
1524 else if (!d_mat.lhs().row_major() && result.
row_major())
1527 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1528 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1529 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1531 detail::cuda_arg<NumericT>(d_mat.lhs()),
1537 detail::cuda_arg<NumericT>(result),
1548 (detail::cuda_arg<unsigned int>(sp_mat.
handle12().cuda_handle()),
1549 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1550 detail::cuda_arg<unsigned int>(sp_mat.
handle3().cuda_handle()),
1552 detail::cuda_arg<NumericT>(d_mat.lhs()),
1558 detail::cuda_arg<NumericT>(result),
// NOTE(review): excerpt only - the kernel name/signature line, the `coords`,
// `x` and `result` parameters, `inc_x`, the declaration of `sum` and braces
// are not visible. Presumably this is ell_matrix_vec_mul_kernel (launched
// further below) - confirm.
// Visible behavior: a thread starts at row glb_id = blockDim.x * blockIdx.x +
// threadIdx.x and advances by glb_sz = gridDim.x * blockDim.x while
// row_id < row_num. For a row, items_per_row entries are read starting at
// offset = row_id and advancing by internal_row_num per entry. Entries whose
// value equals NumericT(0) are skipped; the others add
// x[col * inc_x + start_x] * val to `sum`, which is stored at
// result[row_id * inc_result + start_result].
// NOTE(review): `col` is a signed int while the index arrays elsewhere in this
// file are unsigned int; if `coords` is unsigned too (its declaration is not
// visible) this is an implicit narrowing conversion - confirm.
1573 template<
typename NumericT>
1575 const NumericT * elements,
1577 unsigned int start_x,
1580 unsigned int start_result,
1581 unsigned int inc_result,
1582 unsigned int row_num,
1583 unsigned int col_num,
1584 unsigned int internal_row_num,
1585 unsigned int items_per_row,
1586 unsigned int aligned_items_per_row
1589 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1590 unsigned int glb_sz = gridDim.x * blockDim.x;
1592 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1596 unsigned int offset = row_id;
1597 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1599 NumericT val = elements[offset];
1601 if (val != NumericT(0))
1603 int col = coords[offset];
1604 sum += x[col * inc_x + start_x] * val;
1608 result[row_id * inc_result + start_result] =
sum;
// Host-side launch of ell_matrix_vec_mul_kernel with a fixed <<<256, 128>>>
// configuration. Visible arguments, in order: mat.handle2() (unsigned int
// device pointer), mat.handle() (NumericT device pointer), `vec` with its
// start() and stride(), `result` with its start() and stride(), mat.size1(),
// mat.size2() and mat.maxnnz().
// NOTE(review): the enclosing function's signature and some argument lines
// (between size2() and maxnnz(), and after maxnnz()) are not visible here.
1621 template<
typename NumericT,
unsigned int AlignmentV>
1626 ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
1627 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
1628 detail::cuda_arg<NumericT>(vec),
1629 static_cast<unsigned int>(vec.
start()),
1630 static_cast<unsigned int>(vec.
stride()),
1631 detail::cuda_arg<NumericT>(result),
1632 static_cast<unsigned int>(result.
start()),
1633 static_cast<unsigned int>(result.
stride()),
1634 static_cast<unsigned int>(mat.
size1()),
1635 static_cast<unsigned int>(mat.
size2()),
1637 static_cast<unsigned int>(mat.
maxnnz()),
// NOTE(review): excerpt only - the kernel name/signature line, the
// `sp_mat_coords` and `result` parameters, braces and the update of `r` are
// not visible.
// Visible behavior: the (row, col) pairs of the result are flattened into one
// index rc, with row = rc % sp_mat_row_num and col = rc / sp_mat_row_num. A
// thread starts at glb_id = blockDim.x * blockIdx.x + threadIdx.x and advances
// by glb_sz = gridDim.x * blockDim.x. For each pair, sp_mat_items_per_row
// entries are read starting at offset = row and advancing by
// sp_mat_internal_row_num. Entries whose value equals 0 are skipped; for the
// others y = d_mat at logical position (j, col) with j = sp_mat_coords[offset]
// is read. The value r (initialised to 0) is stored at position (row, col)
// of result.
// NOTE(review): the loop bound sp_mat_row_num * d_mat_col_size is computed in
// unsigned int and would wrap around for very large operands - confirm that
// the supported sizes keep the product below 2^32.
1643 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
1645 const NumericT * sp_mat_elements,
1646 unsigned int sp_mat_row_num,
1647 unsigned int sp_mat_col_num,
1648 unsigned int sp_mat_internal_row_num,
1649 unsigned int sp_mat_items_per_row,
1650 unsigned int sp_mat_aligned_items_per_row,
1651 const NumericT * d_mat,
1652 unsigned int d_mat_row_start,
1653 unsigned int d_mat_col_start,
1654 unsigned int d_mat_row_inc,
1655 unsigned int d_mat_col_inc,
1656 unsigned int d_mat_row_size,
1657 unsigned int d_mat_col_size,
1658 unsigned int d_mat_internal_rows,
1659 unsigned int d_mat_internal_cols,
1661 unsigned int result_row_start,
1662 unsigned int result_col_start,
1663 unsigned int result_row_inc,
1664 unsigned int result_col_inc,
1665 unsigned int result_row_size,
1666 unsigned int result_col_size,
1667 unsigned int result_internal_rows,
1668 unsigned int result_internal_cols)
1670 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1671 unsigned int glb_sz = gridDim.x * blockDim.x;
1673 for (
unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1675 unsigned int row = rc % sp_mat_row_num;
1676 unsigned int col = rc / sp_mat_row_num;
1678 unsigned int offset =
row;
1679 NumericT r = (NumericT)0;
1681 for (
unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1683 unsigned int j = sp_mat_coords[offset];
1684 NumericT x =
static_cast<NumericT
>(sp_mat_elements[offset]);
1686 if (x != (NumericT)0)
1688 NumericT y = d_mat[ DMatIndexT::apply(j, col,
1689 d_mat_row_start, d_mat_row_inc,
1690 d_mat_col_start, d_mat_col_inc,
1691 d_mat_internal_rows, d_mat_internal_cols) ];
1696 result [ ResultIndexT::apply(row, col,
1697 result_row_start, result_row_inc,
1698 result_col_start, result_col_inc,
1699 result_internal_rows, result_internal_cols) ] = r;
// Host-side launcher for ELL sparse matrix times dense matrix.
// Four argument lists are visible below, each passing the same sparse-matrix
// handles and sizes followed by d_mat and result.
// NOTE(review): the function signature, the conditions selecting between the four
// launches, the kernel names / launch configurations and the remaining dense-matrix
// arguments are not visible in this listing; presumably the four variants cover the
// row-/column-major combinations of d_mat and result - confirm against the full file.
1713 template<
typename NumericT,
unsigned int AlignmentV>
// First launch variant.
1721 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1722 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1723 static_cast<unsigned int>(sp_mat.
size1()),
1724 static_cast<unsigned int>(sp_mat.
size2()),
1726 static_cast<unsigned int>(sp_mat.
maxnnz()),
1728 detail::cuda_arg<NumericT>(d_mat),
1734 detail::cuda_arg<NumericT>(result),
// Second launch variant.
1745 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1746 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1747 static_cast<unsigned int>(sp_mat.
size1()),
1748 static_cast<unsigned int>(sp_mat.
size2()),
1750 static_cast<unsigned int>(sp_mat.
maxnnz()),
1752 detail::cuda_arg<NumericT>(d_mat),
1758 detail::cuda_arg<NumericT>(result),
// Third launch variant.
1769 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1770 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1771 static_cast<unsigned int>(sp_mat.
size1()),
1772 static_cast<unsigned int>(sp_mat.
size2()),
1774 static_cast<unsigned int>(sp_mat.
maxnnz()),
1776 detail::cuda_arg<NumericT>(d_mat),
1782 detail::cuda_arg<NumericT>(result),
// Fourth launch variant.
1793 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1794 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1795 static_cast<unsigned int>(sp_mat.
size1()),
1796 static_cast<unsigned int>(sp_mat.
size2()),
1798 static_cast<unsigned int>(sp_mat.
maxnnz()),
1800 detail::cuda_arg<NumericT>(d_mat),
1806 detail::cuda_arg<NumericT>(result),
// Kernel computing result = sparse (ELL storage) * transposed dense matrix.
// The dense operand is read with swapped indices (col, j) and the number of output
// columns is taken from d_mat_row_size; d_mat itself is stored untransposed.
// DMatIndexT / ResultIndexT supply a static apply() mapping (row, col) plus
// start/inc/internal-size values to a linear index.
// NOTE(review): the kernel name line, the declarations of sp_mat_coords and result,
// and all braces are not visible in this listing.
1816 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT >
1818 const NumericT * sp_mat_elements,
1819 unsigned int sp_mat_row_num,
1820 unsigned int sp_mat_col_num,
1821 unsigned int sp_mat_internal_row_num,
1822 unsigned int sp_mat_items_per_row,
1823 unsigned int sp_mat_aligned_items_per_row,
1824 const NumericT * d_mat,
1825 unsigned int d_mat_row_start,
1826 unsigned int d_mat_col_start,
1827 unsigned int d_mat_row_inc,
1828 unsigned int d_mat_col_inc,
1829 unsigned int d_mat_row_size,
1830 unsigned int d_mat_col_size,
1831 unsigned int d_mat_internal_rows,
1832 unsigned int d_mat_internal_cols,
1834 unsigned int result_row_start,
1835 unsigned int result_col_start,
1836 unsigned int result_row_inc,
1837 unsigned int result_col_inc,
1838 unsigned int result_row_size,
1839 unsigned int result_col_size,
1840 unsigned int result_internal_rows,
1841 unsigned int result_internal_cols)
// Flat global thread index and total number of launched threads.
1843 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1844 unsigned int glb_sz = gridDim.x * blockDim.x;
// Grid-stride loop over sp_mat_row_num * d_mat_row_size flattened output entries.
1846 for (
unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
// Consecutive rc values address consecutive rows within the same output column.
1848 unsigned int row = rc % sp_mat_row_num;
1849 unsigned int col = rc / sp_mat_row_num;
// Entries of one sparse row start at index `row` and are sp_mat_internal_row_num apart.
1851 unsigned int offset =
row;
1852 NumericT r = (NumericT)0;
1854 for (
unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
// j is the index stored alongside the sparse value x.
1856 unsigned int j = sp_mat_coords[offset];
1857 NumericT x =
static_cast<NumericT
>(sp_mat_elements[offset]);
// Entries equal to zero are skipped, so d_mat is not read for them.
1859 if (x != (NumericT)0)
// Transposed access: the dense operand is read at (col, j) instead of (j, col).
1861 NumericT y = d_mat[ DMatIndexT::apply(col, j,
1862 d_mat_row_start, d_mat_row_inc,
1863 d_mat_col_start, d_mat_col_inc,
1864 d_mat_internal_rows, d_mat_internal_cols) ];
// r is written to (row, col) of result.
// NOTE(review): the statement accumulating x and y into r is not visible in this listing.
1869 result [ ResultIndexT::apply(row, col,
1870 result_row_start, result_row_inc,
1871 result_col_start, result_col_inc,
1872 result_internal_rows, result_internal_cols) ] = r;
// Host-side launcher for ELL sparse matrix times a transposed dense matrix.
// d_mat is an expression object; the untransposed operand is obtained via d_mat.lhs().
// The launch variant is chosen from the row_major() flags of d_mat.lhs() and result.
// NOTE(review): the function signature, kernel names / launch configurations and the
// remaining dense-matrix arguments are not visible in this listing.
1886 template<
typename NumericT,
unsigned int AlignmentV>
// Case 1: dense operand row-major, result row-major.
1893 if (d_mat.lhs().row_major() && result.
row_major())
1896 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1897 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1898 static_cast<unsigned int>(sp_mat.
size1()),
1899 static_cast<unsigned int>(sp_mat.
size2()),
1901 static_cast<unsigned int>(sp_mat.
maxnnz()),
1904 detail::cuda_arg<NumericT>(d_mat.lhs()),
1910 detail::cuda_arg<NumericT>(result),
// Case 2: dense operand row-major, result not row-major.
1918 else if (d_mat.lhs().row_major() && !result.
row_major())
1921 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1922 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1923 static_cast<unsigned int>(sp_mat.
size1()),
1924 static_cast<unsigned int>(sp_mat.
size2()),
1926 static_cast<unsigned int>(sp_mat.
maxnnz()),
1929 detail::cuda_arg<NumericT>(d_mat.lhs()),
1935 detail::cuda_arg<NumericT>(result),
// Case 3: dense operand not row-major, result row-major.
1943 else if (!d_mat.lhs().row_major() && result.
row_major())
1946 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1947 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1948 static_cast<unsigned int>(sp_mat.
size1()),
1949 static_cast<unsigned int>(sp_mat.
size2()),
1951 static_cast<unsigned int>(sp_mat.
maxnnz()),
1954 detail::cuda_arg<NumericT>(d_mat.lhs()),
1960 detail::cuda_arg<NumericT>(result),
// Case 4: presumably the remaining combination (its `else` line is not visible here).
1971 (detail::cuda_arg<unsigned int>(sp_mat.
handle2().cuda_handle()),
1972 detail::cuda_arg<NumericT>(sp_mat.
handle().cuda_handle()),
1973 static_cast<unsigned int>(sp_mat.
size1()),
1974 static_cast<unsigned int>(sp_mat.
size2()),
1976 static_cast<unsigned int>(sp_mat.
maxnnz()),
1979 detail::cuda_arg<NumericT>(d_mat.lhs()),
1985 detail::cuda_arg<NumericT>(result),
// Kernel for a sparse matrix-vector product where rows are grouped into blocks of
// blockDim.x rows; each thread block processes whole row blocks and each thread one
// row of the current block.
// NOTE(review): the kernel name line, the declarations of columns_per_block, x,
// inc_x and result, the declaration of sum, and all braces are not visible in this listing.
1999 template<
typename NumericT>
2001 const unsigned int * column_indices,
2002 const unsigned int * block_start,
2003 const NumericT * elements,
2005 unsigned int start_x,
2007 unsigned int size_x,
2009 unsigned int start_result,
2010 unsigned int inc_result,
2011 unsigned int size_result)
2013 unsigned int local_id = threadIdx.x;
2014 unsigned int local_size = blockDim.x;
// The number of rows is taken from the size of the result vector.
2015 unsigned int num_rows = size_result;
// Row blocks are distributed over thread blocks with stride gridDim.x. The `<=` bound
// includes a final block index num_rows / local_size (a partially filled last block
// when num_rows is not a multiple of local_size).
2017 for (
unsigned int block_idx = blockIdx.x; block_idx <= num_rows / local_size; block_idx += gridDim.x)
2019 unsigned int row = block_idx * local_size + local_id;
// Start of this block's data and the number of stored columns for this block.
2020 unsigned int offset = block_start[block_idx];
2021 unsigned int num_columns = columns_per_block[block_idx];
2024 for (
unsigned int item_id = 0; item_id < num_columns; item_id++)
// Within a block, the entries for one item_id of all rows are adjacent, so
// neighbouring threads read neighbouring elements.
2026 unsigned int index = offset + item_id * local_size + local_id;
2027 NumericT val = elements[index];
// A zero value contributes 0 and x is not read for it.
2029 sum += val ? (x[column_indices[index] * inc_x + start_x] * val) : 0;
// NOTE(review): no `row < num_rows` guard is visible before this store in the listing;
// confirm against the full file, since the loop bound above admits rows past num_rows.
2033 result[row * inc_result + start_result] =
sum;
// Host-side launcher for the sliced-ELL sparse matrix times vector product.
// NOTE(review): the function signature, the kernel launch line (name and launch
// configuration) and at least one leading argument are not visible in this listing.
2045 template<
typename NumericT,
typename IndexT>
// mat.handle2() and mat.handle3() are passed as unsigned int data, mat.handle() as NumericT data.
2051 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2052 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2053 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
// Input vector with start offset, stride and size.
2054 detail::cuda_arg<NumericT>(vec),
2055 static_cast<unsigned int>(vec.
start()),
2056 static_cast<unsigned int>(vec.
stride()),
2057 static_cast<unsigned int>(vec.
size()),
// Output vector with start offset, stride and size.
2058 detail::cuda_arg<NumericT>(result),
2059 static_cast<unsigned int>(result.
start()),
2060 static_cast<unsigned int>(result.
stride()),
2061 static_cast<unsigned int>(result.
size())
// Kernel for a sparse matrix-vector product where each row is stored in two parts:
// a fixed-width part (ell_coords / ell_elements) and a CSR part
// (csr_rows / csr_cols / csr_elements). Both parts are summed into the same row result.
// NOTE(review): the kernel name line, the declarations of ell_coords, x, inc_x and
// result, the declaration of sum, and all braces are not visible in this listing.
2072 template<
typename NumericT>
2074 const NumericT * ell_elements,
2075 const unsigned int * csr_rows,
2076 const unsigned int * csr_cols,
2077 const NumericT * csr_elements,
2079 unsigned int start_x,
2082 unsigned int start_result,
2083 unsigned int inc_result,
2084 unsigned int row_num,
2085 unsigned int internal_row_num,
2086 unsigned int items_per_row,
2087 unsigned int aligned_items_per_row
// Flat global thread index and total number of launched threads.
2090 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2091 unsigned int glb_sz = gridDim.x * blockDim.x;
// Grid-stride loop: one row per iteration.
2093 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
// Fixed-width part: entries of a row start at index row_id and are internal_row_num apart.
2097 unsigned int offset = row_id;
2098 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2100 NumericT val = ell_elements[offset];
// Zero entries are skipped; neither ell_coords nor x is read for them.
2103 if (val != NumericT(0))
2105 int col = ell_coords[offset];
2106 sum += (x[col * inc_x + start_x] * val);
// CSR part: entries of this row occupy [csr_rows[row_id], csr_rows[row_id + 1]).
2110 unsigned int col_begin = csr_rows[row_id];
2111 unsigned int col_end = csr_rows[row_id + 1];
2113 for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
2114 sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
// Store the combined sum at the strided position of this row in result.
2116 result[row_id * inc_result + start_result] =
sum;
// Host-side launcher for the hybrid (ELL + CSR) sparse matrix times vector product.
// NOTE(review): the function signature, braces and some argument lines are not
// visible in this listing; the comments below describe only the visible launch.
2130 template<
typename NumericT,
unsigned int AlignmentV>
// Fixed launch configuration: 256 blocks of 128 threads.
// mat.handle2() / mat.handle() are passed first (unsigned int / NumericT data),
// followed by mat.handle3(), mat.handle4() (unsigned int) and mat.handle5() (NumericT).
2135 hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2136 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2137 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2138 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2139 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
// Input vector with its start offset and stride.
2140 detail::cuda_arg<NumericT>(vec),
2141 static_cast<unsigned int>(vec.
start()),
2142 static_cast<unsigned int>(vec.
stride()),
// Output vector with its start offset and stride.
2143 detail::cuda_arg<NumericT>(result),
2144 static_cast<unsigned int>(result.
start()),
2145 static_cast<unsigned int>(result.
stride()),
// Row count and mat.ell_nnz(), narrowed to unsigned int.
// NOTE(review): further size arguments appear to be missing from this listing - confirm against the full file.
2146 static_cast<unsigned int>(mat.
size1()),
2148 static_cast<unsigned int>(mat.
ell_nnz()),
// Kernel computing result = sparse (hybrid ELL + CSR storage) * dense matrix.
// Every thread iterates over all result columns; within a column the rows are
// distributed over threads by a grid-stride loop.
// DMatIndexT / ResultIndexT supply a static apply() mapping (row, col) plus
// start/inc/internal-size values to a linear index.
// NOTE(review): the kernel name line, the declarations of ell_coords and result,
// the declaration of sum, and all braces are not visible in this listing.
2156 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
2158 const NumericT * ell_elements,
2159 const unsigned int * csr_rows,
2160 const unsigned int * csr_cols,
2161 const NumericT * csr_elements,
2162 unsigned int row_num,
2163 unsigned int internal_row_num,
2164 unsigned int items_per_row,
2165 unsigned int aligned_items_per_row,
2166 const NumericT * d_mat,
2167 unsigned int d_mat_row_start,
2168 unsigned int d_mat_col_start,
2169 unsigned int d_mat_row_inc,
2170 unsigned int d_mat_col_inc,
2171 unsigned int d_mat_row_size,
2172 unsigned int d_mat_col_size,
2173 unsigned int d_mat_internal_rows,
2174 unsigned int d_mat_internal_cols,
2176 unsigned int result_row_start,
2177 unsigned int result_col_start,
2178 unsigned int result_row_inc,
2179 unsigned int result_col_inc,
2180 unsigned int result_row_size,
2181 unsigned int result_col_size,
2182 unsigned int result_internal_rows,
2183 unsigned int result_internal_cols)
// Flat global thread index and total number of launched threads.
2185 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2186 unsigned int glb_sz = gridDim.x * blockDim.x;
// Outer loop: every thread visits every column of the result.
2188 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
// Inner grid-stride loop: one row per iteration.
2190 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
// Fixed-width part: entries of a row start at index row_id and are internal_row_num apart.
2194 unsigned int offset = row_id;
2195 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2197 NumericT val = ell_elements[offset];
// Dense operand is read at (ell_coords[offset], result_col).
// NOTE(review): whether a zero check on val guards this line cannot be told from this listing.
2201 sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2202 d_mat_row_start, d_mat_row_inc,
2203 d_mat_col_start, d_mat_col_inc,
2204 d_mat_internal_rows, d_mat_internal_cols)] * val;
// CSR part: entries of this row occupy [csr_rows[row_id], csr_rows[row_id + 1]).
2208 unsigned int col_begin = csr_rows[row_id];
2209 unsigned int col_end = csr_rows[row_id + 1];
2211 for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
// Dense operand is read at (csr_cols[item_id], result_col).
2213 sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2214 d_mat_row_start, d_mat_row_inc,
2215 d_mat_col_start, d_mat_col_inc,
2216 d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
// Store the combined sum at (row_id, result_col) of result.
2219 result[ResultIndexT::apply(row_id, result_col,
2220 result_row_start, result_row_inc,
2221 result_col_start, result_col_inc,
2222 result_internal_rows, result_internal_cols)] =
sum;
// Host-side launcher for hybrid (ELL + CSR) sparse matrix times dense matrix.
// Four argument lists are visible below, each passing the same five matrix handles,
// mat.size1() and mat.ell_nnz(), followed by d_mat and result.
// NOTE(review): the function signature, the conditions selecting between the four
// launches, the kernel names / launch configurations and the remaining dense-matrix
// arguments are not visible in this listing; presumably the four variants cover the
// row-/column-major combinations of d_mat and result - confirm against the full file.
2237 template<
typename NumericT,
unsigned int AlignmentV>
// First launch variant.
2245 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2246 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2247 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2248 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2249 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2250 static_cast<unsigned int>(mat.
size1()),
2252 static_cast<unsigned int>(mat.
ell_nnz()),
2255 detail::cuda_arg<NumericT>(d_mat),
2261 detail::cuda_arg<NumericT>(result),
// Second launch variant.
2272 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2273 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2274 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2275 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2276 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2277 static_cast<unsigned int>(mat.
size1()),
2279 static_cast<unsigned int>(mat.
ell_nnz()),
2282 detail::cuda_arg<NumericT>(d_mat),
2288 detail::cuda_arg<NumericT>(result),
// Third launch variant.
2299 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2300 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2301 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2302 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2303 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2304 static_cast<unsigned int>(mat.
size1()),
2306 static_cast<unsigned int>(mat.
ell_nnz()),
2309 detail::cuda_arg<NumericT>(d_mat),
2315 detail::cuda_arg<NumericT>(result),
// Fourth launch variant.
2326 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2327 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2328 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2329 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2330 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2331 static_cast<unsigned int>(mat.
size1()),
2333 static_cast<unsigned int>(mat.
ell_nnz()),
2336 detail::cuda_arg<NumericT>(d_mat),
2342 detail::cuda_arg<NumericT>(result),
// Kernel computing result = sparse (hybrid ELL + CSR storage) * transposed dense matrix.
// The dense operand is read with swapped indices (result_col, sparse column index);
// d_mat itself is stored untransposed.
// Every thread iterates over all result columns; within a column the rows are
// distributed over threads by a grid-stride loop.
// NOTE(review): the kernel name line, the declarations of ell_coords and result,
// the declaration of sum, and all braces are not visible in this listing.
2354 template<
typename DMatIndexT,
typename ResultIndexT,
typename NumericT>
2356 const NumericT * ell_elements,
2357 const unsigned int * csr_rows,
2358 const unsigned int * csr_cols,
2359 const NumericT * csr_elements,
2360 unsigned int row_num,
2361 unsigned int internal_row_num,
2362 unsigned int items_per_row,
2363 unsigned int aligned_items_per_row,
2364 const NumericT * d_mat,
2365 unsigned int d_mat_row_start,
2366 unsigned int d_mat_col_start,
2367 unsigned int d_mat_row_inc,
2368 unsigned int d_mat_col_inc,
2369 unsigned int d_mat_row_size,
2370 unsigned int d_mat_col_size,
2371 unsigned int d_mat_internal_rows,
2372 unsigned int d_mat_internal_cols,
2374 unsigned int result_row_start,
2375 unsigned int result_col_start,
2376 unsigned int result_row_inc,
2377 unsigned int result_col_inc,
2378 unsigned int result_row_size,
2379 unsigned int result_col_size,
2380 unsigned int result_internal_rows,
2381 unsigned int result_internal_cols)
// Flat global thread index and total number of launched threads.
2383 unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2384 unsigned int glb_sz = gridDim.x * blockDim.x;
// Outer loop: every thread visits every column of the result.
2386 for (
unsigned int result_col = 0; result_col < result_col_size; ++result_col)
// Inner grid-stride loop: one row per iteration.
2388 for (
unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
// Fixed-width part: entries of a row start at index row_id and are internal_row_num apart.
2392 unsigned int offset = row_id;
2393 for (
unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2395 NumericT val = ell_elements[offset];
// Transposed access: dense operand is read at (result_col, ell_coords[offset]).
// NOTE(review): whether a zero check on val guards this line cannot be told from this listing.
2399 sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2400 d_mat_row_start, d_mat_row_inc,
2401 d_mat_col_start, d_mat_col_inc,
2402 d_mat_internal_rows, d_mat_internal_cols)] * val;
// CSR part: entries of this row occupy [csr_rows[row_id], csr_rows[row_id + 1]).
2406 unsigned int col_begin = csr_rows[row_id];
2407 unsigned int col_end = csr_rows[row_id + 1];
2409 for (
unsigned int item_id = col_begin; item_id < col_end; item_id++)
// Transposed access: dense operand is read at (result_col, csr_cols[item_id]).
2411 sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2412 d_mat_row_start, d_mat_row_inc,
2413 d_mat_col_start, d_mat_col_inc,
2414 d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
// Store the combined sum at (row_id, result_col) of result.
2417 result[ResultIndexT::apply(row_id, result_col,
2418 result_row_start, result_row_inc,
2419 result_col_start, result_col_inc,
2420 result_internal_rows, result_internal_cols)] =
sum;
// Host-side launcher for hybrid (ELL + CSR) sparse matrix times a transposed dense matrix.
// d_mat is an expression object; the untransposed operand is obtained via d_mat.lhs().
// The launch variant is chosen from the row_major() flags of d_mat.lhs() and result.
// NOTE(review): the function signature, kernel names / launch configurations and the
// remaining dense-matrix arguments are not visible in this listing.
2435 template<
typename NumericT,
unsigned int AlignmentV>
// Case 1: dense operand row-major, result row-major.
2442 if (d_mat.lhs().row_major() && result.
row_major())
2445 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2446 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2447 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2448 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2449 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2450 static_cast<unsigned int>(mat.
size1()),
2452 static_cast<unsigned int>(mat.
ell_nnz()),
2455 detail::cuda_arg<NumericT>(d_mat.lhs()),
2461 detail::cuda_arg<NumericT>(result),
// Case 2: dense operand row-major, result not row-major.
2469 else if (d_mat.lhs().row_major() && !result.
row_major())
2472 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2473 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2474 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2475 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2476 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2477 static_cast<unsigned int>(mat.
size1()),
2479 static_cast<unsigned int>(mat.
ell_nnz()),
2482 detail::cuda_arg<NumericT>(d_mat.lhs()),
2488 detail::cuda_arg<NumericT>(result),
// Case 3: dense operand not row-major, result row-major.
2496 else if (!d_mat.lhs().row_major() && result.
row_major())
2499 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2500 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2501 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2502 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2503 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2504 static_cast<unsigned int>(mat.
size1()),
2506 static_cast<unsigned int>(mat.
ell_nnz()),
2509 detail::cuda_arg<NumericT>(d_mat.lhs()),
2515 detail::cuda_arg<NumericT>(result),
// Case 4: presumably the remaining combination (its `else` line is not visible here).
2526 detail::cuda_arg<unsigned int>(mat.
handle2().cuda_handle()),
2527 detail::cuda_arg<NumericT>(mat.
handle().cuda_handle()),
2528 detail::cuda_arg<unsigned int>(mat.
handle3().cuda_handle()),
2529 detail::cuda_arg<unsigned int>(mat.
handle4().cuda_handle()),
2530 detail::cuda_arg<NumericT>(mat.
handle5().cuda_handle()),
2531 static_cast<unsigned int>(mat.
size1()),
2533 static_cast<unsigned int>(mat.
ell_nnz()),
2536 detail::cuda_arg<NumericT>(d_mat.lhs()),
2542 detail::cuda_arg<NumericT>(result),
vcl_size_t internal_ellnnz() const
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Simple enable-if variant that uses the SFINAE pattern.
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
const handle_type & handle3() const
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
const handle_type & handle() const
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
A tag class representing a lower triangular matrix.
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size1() const
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Expression template class for representing a tree of expressions which ultimately result in a matrix...
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
const handle_type & handle4() const
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
vcl_size_t size1() const
Returns the size of the result vector.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
const handle_type & handle2() const
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
Sparse matrix class using the ELLPACK format for storing the nonzeros.
A tag class representing an upper triangular matrix.
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
Sparse matrix class using the sliced ELLPACK format with parameters C, $\sigma$.
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the OpenCL handle to the row index array.
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows c...
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
const handle_type & handle3() const
Returns the OpenCL handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
const handle_type & handle3() const
Returns the OpenCL handle to the row block array.
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
void clear()
Resets all entries to zero. Does not change the size of the vector.
Common routines for CUDA execution.
const handle_type & handle() const
Returns the OpenCL handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
NumericT max(std::vector< NumericT > const &v1)
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
vcl_size_t ell_nnz() const
A tag class representing a lower triangular matrix with unit diagonal.
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM...
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
A tag for column-major storage of a dense matrix.
const handle_type & handle5() const
LHS & lhs() const
Get left hand side operand.
size_type start() const
Returns the offset within the buffer.
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
vcl_size_t internal_maxnnz() const
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
A tag class representing an upper triangular matrix with unit diagonal.
A sparse square matrix, where entries are stored as triplets (i,j, val), where i and j are the row an...
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)