1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_HPP_
56 template<
typename NumericT,
typename SizeT,
typename DistanceT>
60 trans_kernel<<<128,128>>>(detail::cuda_arg<NumericT>(proxy.lhs()),
61 static_cast<unsigned int>(proxy.lhs().start1()), static_cast<unsigned int>(proxy.lhs().start2()),
62 static_cast<unsigned int>(proxy.lhs().internal_size1()), static_cast<unsigned int>(proxy.lhs().internal_size2()),
63 static_cast<unsigned int>(proxy.lhs().size1()), static_cast<unsigned int>(proxy.lhs().size2()),
64 static_cast<unsigned int>(proxy.lhs().stride1()), static_cast<unsigned int>(proxy.lhs().stride2()),
66 detail::cuda_arg<NumericT>(temp_trans),
67 static_cast<unsigned int>(temp_trans.
start1()), static_cast<unsigned int>(temp_trans.
start2()),
69 static_cast<unsigned int>(temp_trans.
stride1()), static_cast<unsigned int>(temp_trans.
stride2()),
70 static_cast<bool>(proxy.lhs().row_major()));
75 template<
typename NumericT,
typename ScalarT>
79 assert(mat1.
row_major() == mat2.
row_major() && bool(
"Addition/subtraction on mixed matrix layouts not supported yet!"));
81 typedef NumericT value_type;
85 value_type temporary_alpha = 0;
87 temporary_alpha = alpha;
91 am_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
99 detail::cuda_arg<value_type>(mat2),
108 am_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
116 detail::cuda_arg<value_type>(mat2),
126 template<
typename NumericT,
typename ScalarT1,
typename ScalarT2>
133 typedef NumericT value_type;
135 unsigned int options_alpha =
detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
137 value_type temporary_alpha = 0;
139 temporary_alpha = alpha;
144 value_type temporary_beta = 0;
146 temporary_beta = beta;
151 ambm_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
159 detail::cuda_arg<value_type>(mat2),
166 detail::cuda_arg<value_type>(mat3),
175 ambm_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
183 detail::cuda_arg<value_type>(mat2),
190 detail::cuda_arg<value_type>(mat3),
201 template<
typename NumericT,
typename ScalarT1,
typename ScalarT2>
208 typedef NumericT value_type;
210 unsigned int options_alpha =
detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
212 value_type temporary_alpha = 0;
214 temporary_alpha = alpha;
219 value_type temporary_beta = 0;
221 temporary_beta = beta;
226 ambm_m_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
234 detail::cuda_arg<value_type>(mat2),
241 detail::cuda_arg<value_type>(mat3),
250 ambm_m_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
258 detail::cuda_arg<value_type>(mat2),
265 detail::cuda_arg<value_type>(mat3),
278 template<
typename NumericT>
281 typedef NumericT value_type;
282 value_type alpha = s;
290 matrix_row_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
294 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
300 matrix_col_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
304 static_cast<unsigned int>(viennacl::traits::internal_size1(mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(mat)),
310 template<
typename NumericT>
313 typedef NumericT value_type;
314 value_type alpha = s;
318 matrix_row_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
328 matrix_col_diagonal_assign_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
339 template<
typename NumericT>
342 typedef NumericT value_type;
348 unsigned int options_alpha = 0;
378 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
379 static_cast<unsigned int>(mat_start),
380 static_cast<unsigned int>(mat_stride),
381 static_cast<unsigned int>(mat_size),
383 detail::cuda_arg<value_type>(NumericT(1)),
385 detail::cuda_arg<value_type>(vec),
391 template<
typename NumericT>
394 typedef NumericT value_type;
396 unsigned int options_alpha = 0;
425 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
430 detail::cuda_arg<value_type>(NumericT(1)),
432 detail::cuda_arg<value_type>(mat),
433 static_cast<unsigned int>(mat_start),
434 static_cast<unsigned int>(mat_stride));
438 template<
typename NumericT>
441 typedef NumericT value_type;
443 unsigned int options_alpha = 0;
458 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
463 detail::cuda_arg<value_type>(NumericT(1)),
465 detail::cuda_arg<value_type>(mat),
466 static_cast<unsigned int>(mat_start),
467 static_cast<unsigned int>(mat_stride));
471 template<
typename NumericT>
474 typedef NumericT value_type;
476 unsigned int options_alpha = 0;
491 av_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(vec),
496 detail::cuda_arg<value_type>(NumericT(1)),
498 detail::cuda_arg<value_type>(mat),
499 static_cast<unsigned int>(mat_start),
500 static_cast<unsigned int>(mat_stride));
510 template<
typename NumericT,
typename SizeT,
typename OpT>
514 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
516 typedef NumericT value_type;
518 unsigned int op_type = 2;
526 element_op_int_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
532 detail::cuda_arg<value_type>(proxy.lhs()),
537 detail::cuda_arg<value_type>(proxy.rhs()),
548 element_op_int_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
554 detail::cuda_arg<value_type>(proxy.lhs()),
559 detail::cuda_arg<value_type>(proxy.rhs()),
570 template<
typename SizeT,
typename OpT>
574 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
576 typedef float value_type;
578 unsigned int op_type = 2;
586 element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
592 detail::cuda_arg<value_type>(proxy.lhs()),
597 detail::cuda_arg<value_type>(proxy.rhs()),
608 element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
614 detail::cuda_arg<value_type>(proxy.lhs()),
619 detail::cuda_arg<value_type>(proxy.rhs()),
630 template<
typename SizeT,
typename OpT>
634 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
636 typedef double value_type;
638 unsigned int op_type = 2;
646 element_op_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
652 detail::cuda_arg<value_type>(proxy.lhs()),
657 detail::cuda_arg<value_type>(proxy.rhs()),
668 element_op_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
674 detail::cuda_arg<value_type>(proxy.lhs()),
679 detail::cuda_arg<value_type>(proxy.rhs()),
698 template<
typename NumericT>
702 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
704 typedef NumericT value_type;
708 matrix_row_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
714 detail::cuda_arg<value_type>(proxy.lhs()),
723 matrix_col_element_abs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
729 detail::cuda_arg<value_type>(proxy.lhs()),
740 template<
typename NumericT>
744 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
746 typedef NumericT value_type;
750 matrix_row_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
756 detail::cuda_arg<value_type>(proxy.lhs()),
765 matrix_col_element_acos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
771 detail::cuda_arg<value_type>(proxy.lhs()),
782 template<
typename NumericT>
786 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
788 typedef NumericT value_type;
792 matrix_row_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
798 detail::cuda_arg<value_type>(proxy.lhs()),
807 matrix_col_element_asin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
813 detail::cuda_arg<value_type>(proxy.lhs()),
824 template<
typename NumericT>
828 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
830 typedef NumericT value_type;
834 matrix_row_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
840 detail::cuda_arg<value_type>(proxy.lhs()),
849 matrix_col_element_atan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
855 detail::cuda_arg<value_type>(proxy.lhs()),
866 template<
typename NumericT>
870 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
872 typedef NumericT value_type;
876 matrix_row_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
882 detail::cuda_arg<value_type>(proxy.lhs()),
891 matrix_col_element_ceil_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
897 detail::cuda_arg<value_type>(proxy.lhs()),
908 template<
typename NumericT>
912 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
914 typedef NumericT value_type;
918 matrix_row_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
924 detail::cuda_arg<value_type>(proxy.lhs()),
933 matrix_col_element_cos_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
939 detail::cuda_arg<value_type>(proxy.lhs()),
950 template<
typename NumericT>
954 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
956 typedef NumericT value_type;
960 matrix_row_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
966 detail::cuda_arg<value_type>(proxy.lhs()),
975 matrix_col_element_cosh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
981 detail::cuda_arg<value_type>(proxy.lhs()),
992 template<
typename NumericT>
996 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
998 typedef NumericT value_type;
1002 matrix_row_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1008 detail::cuda_arg<value_type>(proxy.lhs()),
1017 matrix_col_element_exp_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1023 detail::cuda_arg<value_type>(proxy.lhs()),
1034 template<
typename NumericT>
1038 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1040 typedef NumericT value_type;
1044 matrix_row_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1050 detail::cuda_arg<value_type>(proxy.lhs()),
1059 matrix_col_element_fabs_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1065 detail::cuda_arg<value_type>(proxy.lhs()),
1076 template<
typename NumericT>
1080 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1082 typedef NumericT value_type;
1086 matrix_row_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1092 detail::cuda_arg<value_type>(proxy.lhs()),
1101 matrix_col_element_floor_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1107 detail::cuda_arg<value_type>(proxy.lhs()),
1118 template<
typename NumericT>
1122 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1124 typedef NumericT value_type;
1128 matrix_row_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1134 detail::cuda_arg<value_type>(proxy.lhs()),
1143 matrix_col_element_log_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1149 detail::cuda_arg<value_type>(proxy.lhs()),
1160 template<
typename NumericT>
1164 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1166 typedef NumericT value_type;
1170 matrix_row_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1176 detail::cuda_arg<value_type>(proxy.lhs()),
1185 matrix_col_element_log10_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1191 detail::cuda_arg<value_type>(proxy.lhs()),
1202 template<
typename NumericT>
1206 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1208 typedef NumericT value_type;
1212 matrix_row_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1218 detail::cuda_arg<value_type>(proxy.lhs()),
1227 matrix_col_element_sin_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1233 detail::cuda_arg<value_type>(proxy.lhs()),
1244 template<
typename NumericT>
1248 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1250 typedef NumericT value_type;
1254 matrix_row_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1260 detail::cuda_arg<value_type>(proxy.lhs()),
1269 matrix_col_element_sinh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1275 detail::cuda_arg<value_type>(proxy.lhs()),
1286 template<
typename NumericT>
1290 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1292 typedef NumericT value_type;
1296 matrix_row_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1302 detail::cuda_arg<value_type>(proxy.lhs()),
1311 matrix_col_element_sqrt_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1317 detail::cuda_arg<value_type>(proxy.lhs()),
1328 template<
typename NumericT>
1332 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1334 typedef NumericT value_type;
1338 matrix_row_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1344 detail::cuda_arg<value_type>(proxy.lhs()),
1353 matrix_col_element_tan_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1359 detail::cuda_arg<value_type>(proxy.lhs()),
1370 template<
typename NumericT>
1374 assert(A.
row_major() == proxy.lhs().row_major() && A.
row_major() == proxy.rhs().row_major() && bool(
"Element-wise operations on mixed matrix layouts not supported yet!"));
1376 typedef NumericT value_type;
1380 matrix_row_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1386 detail::cuda_arg<value_type>(proxy.lhs()),
1395 matrix_col_element_tanh_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(A),
1401 detail::cuda_arg<value_type>(proxy.lhs()),
1426 template<
typename NumericT>
1431 typedef NumericT value_type;
1439 vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1445 detail::cuda_arg<value_type>(vec),
1450 detail::cuda_arg<value_type>(result),
1459 trans_vec_mul_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1465 detail::cuda_arg<value_type>(vec),
1470 detail::cuda_arg<value_type>(result),
1482 vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1488 detail::cuda_arg<value_type>(vec),
1493 detail::cuda_arg<value_type>(result),
1502 trans_vec_mul_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat),
1508 detail::cuda_arg<value_type>(vec),
1513 detail::cuda_arg<value_type>(result),
1531 template<
typename MatrixT1,
typename MatrixT2,
typename MatrixT3,
typename ScalarT>
1533 const MatrixT2 & B,
bool transposed_B,
1540 cpu_value_type converted_alpha =
static_cast<cpu_value_type
>(alpha);
1541 cpu_value_type converted_beta =
static_cast<cpu_value_type
>(beta);
1543 dim3 threads(16, 16);
1547 bool row_major_A = A.row_major();
1548 bool row_major_B = B.row_major();
1549 bool row_major_C = C.row_major();
1552 if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1554 matrix_matrix_col_col_col_prod_AA_kernel<<<grid, threads>>>
1556 detail::cuda_arg<cpu_value_type>(A),
1562 detail::cuda_arg<cpu_value_type>(B),
1569 detail::cuda_arg<cpu_value_type>(C),
1575 else if (!row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1577 matrix_matrix_col_col_col_prod_AT_kernel<<<grid, threads>>>
1579 detail::cuda_arg<cpu_value_type>(A),
1585 detail::cuda_arg<cpu_value_type>(B),
1592 detail::cuda_arg<cpu_value_type>(C),
1598 else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1600 matrix_matrix_col_col_col_prod_TA_kernel<<<grid, threads>>>
1602 detail::cuda_arg<cpu_value_type>(A),
1608 detail::cuda_arg<cpu_value_type>(B),
1615 detail::cuda_arg<cpu_value_type>(C),
1621 else if (!row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1623 matrix_matrix_col_col_col_prod_TT_kernel<<<grid, threads>>>
1625 detail::cuda_arg<cpu_value_type>(A),
1631 detail::cuda_arg<cpu_value_type>(B),
1638 detail::cuda_arg<cpu_value_type>(C),
1646 else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
1648 matrix_matrix_col_col_row_prod_AA_kernel<<<grid, threads>>>
1650 detail::cuda_arg<cpu_value_type>(A),
1656 detail::cuda_arg<cpu_value_type>(B),
1663 detail::cuda_arg<cpu_value_type>(C),
1669 else if (!row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
1671 matrix_matrix_col_col_row_prod_AT_kernel<<<grid, threads>>>
1673 detail::cuda_arg<cpu_value_type>(A),
1679 detail::cuda_arg<cpu_value_type>(B),
1686 detail::cuda_arg<cpu_value_type>(C),
1692 else if (!row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
1694 matrix_matrix_col_col_row_prod_TA_kernel<<<grid, threads>>>
1696 detail::cuda_arg<cpu_value_type>(A),
1702 detail::cuda_arg<cpu_value_type>(B),
1709 detail::cuda_arg<cpu_value_type>(C),
1715 else if (!row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
1717 matrix_matrix_col_col_row_prod_TT_kernel<<<grid, threads>>>
1719 detail::cuda_arg<cpu_value_type>(A),
1725 detail::cuda_arg<cpu_value_type>(B),
1732 detail::cuda_arg<cpu_value_type>(C),
1740 else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
1742 matrix_matrix_col_row_col_prod_AA_kernel<<<grid, threads>>>
1744 detail::cuda_arg<cpu_value_type>(A),
1750 detail::cuda_arg<cpu_value_type>(B),
1757 detail::cuda_arg<cpu_value_type>(C),
1763 else if (!row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
1765 matrix_matrix_col_row_col_prod_AT_kernel<<<grid, threads>>>
1767 detail::cuda_arg<cpu_value_type>(A),
1773 detail::cuda_arg<cpu_value_type>(B),
1780 detail::cuda_arg<cpu_value_type>(C),
1786 else if (!row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
1788 matrix_matrix_col_row_col_prod_TA_kernel<<<grid, threads>>>
1790 detail::cuda_arg<cpu_value_type>(A),
1796 detail::cuda_arg<cpu_value_type>(B),
1803 detail::cuda_arg<cpu_value_type>(C),
1809 else if (!row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
1811 matrix_matrix_col_row_col_prod_TT_kernel<<<grid, threads>>>
1813 detail::cuda_arg<cpu_value_type>(A),
1819 detail::cuda_arg<cpu_value_type>(B),
1826 detail::cuda_arg<cpu_value_type>(C),
1834 else if (!row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
1836 matrix_matrix_col_row_row_prod_AA_kernel<<<grid, threads>>>
1838 detail::cuda_arg<cpu_value_type>(A),
1844 detail::cuda_arg<cpu_value_type>(B),
1851 detail::cuda_arg<cpu_value_type>(C),
1857 else if (!row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
1859 matrix_matrix_col_row_row_prod_AT_kernel<<<grid, threads>>>
1861 detail::cuda_arg<cpu_value_type>(A),
1867 detail::cuda_arg<cpu_value_type>(B),
1874 detail::cuda_arg<cpu_value_type>(C),
1880 else if (!row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
1882 matrix_matrix_col_row_row_prod_TA_kernel<<<grid, threads>>>
1884 detail::cuda_arg<cpu_value_type>(A),
1890 detail::cuda_arg<cpu_value_type>(B),
1897 detail::cuda_arg<cpu_value_type>(C),
1903 else if (!row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
1905 matrix_matrix_col_row_row_prod_TT_kernel<<<grid, threads>>>
1907 detail::cuda_arg<cpu_value_type>(A),
1913 detail::cuda_arg<cpu_value_type>(B),
1920 detail::cuda_arg<cpu_value_type>(C),
1928 else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && !transposed_B)
1930 matrix_matrix_row_col_col_prod_AA_kernel<<<grid, threads>>>
1932 detail::cuda_arg<cpu_value_type>(A),
1938 detail::cuda_arg<cpu_value_type>(B),
1945 detail::cuda_arg<cpu_value_type>(C),
1951 else if (row_major_C && !row_major_A && !row_major_B && !transposed_A && transposed_B)
1953 matrix_matrix_row_col_col_prod_AT_kernel<<<grid, threads>>>
1955 detail::cuda_arg<cpu_value_type>(A),
1961 detail::cuda_arg<cpu_value_type>(B),
1968 detail::cuda_arg<cpu_value_type>(C),
1974 else if (row_major_C && !row_major_A && !row_major_B && transposed_A && !transposed_B)
1976 matrix_matrix_row_col_col_prod_TA_kernel<<<grid, threads>>>
1978 detail::cuda_arg<cpu_value_type>(A),
1984 detail::cuda_arg<cpu_value_type>(B),
1991 detail::cuda_arg<cpu_value_type>(C),
1997 else if (row_major_C && !row_major_A && !row_major_B && transposed_A && transposed_B)
1999 matrix_matrix_row_col_col_prod_TT_kernel<<<grid, threads>>>
2001 detail::cuda_arg<cpu_value_type>(A),
2007 detail::cuda_arg<cpu_value_type>(B),
2014 detail::cuda_arg<cpu_value_type>(C),
2022 else if (row_major_C && !row_major_A && row_major_B && !transposed_A && !transposed_B)
2024 matrix_matrix_row_col_row_prod_AA_kernel<<<grid, threads>>>
2026 detail::cuda_arg<cpu_value_type>(A),
2032 detail::cuda_arg<cpu_value_type>(B),
2039 detail::cuda_arg<cpu_value_type>(C),
2045 else if (row_major_C && !row_major_A && row_major_B && !transposed_A && transposed_B)
2047 matrix_matrix_row_col_row_prod_AT_kernel<<<grid, threads>>>
2049 detail::cuda_arg<cpu_value_type>(A),
2055 detail::cuda_arg<cpu_value_type>(B),
2062 detail::cuda_arg<cpu_value_type>(C),
2068 else if (row_major_C && !row_major_A && row_major_B && transposed_A && !transposed_B)
2070 matrix_matrix_row_col_row_prod_TA_kernel<<<grid, threads>>>
2072 detail::cuda_arg<cpu_value_type>(A),
2078 detail::cuda_arg<cpu_value_type>(B),
2085 detail::cuda_arg<cpu_value_type>(C),
2091 else if (row_major_C && !row_major_A && row_major_B && transposed_A && transposed_B)
2093 matrix_matrix_row_col_row_prod_TT_kernel<<<grid, threads>>>
2095 detail::cuda_arg<cpu_value_type>(A),
2101 detail::cuda_arg<cpu_value_type>(B),
2108 detail::cuda_arg<cpu_value_type>(C),
2116 else if (row_major_C && row_major_A && !row_major_B && !transposed_A && !transposed_B)
2118 matrix_matrix_row_row_col_prod_AA_kernel<<<grid, threads>>>
2120 detail::cuda_arg<cpu_value_type>(A),
2126 detail::cuda_arg<cpu_value_type>(B),
2133 detail::cuda_arg<cpu_value_type>(C),
2139 else if (row_major_C && row_major_A && !row_major_B && !transposed_A && transposed_B)
2141 matrix_matrix_row_row_col_prod_AT_kernel<<<grid, threads>>>
2143 detail::cuda_arg<cpu_value_type>(A),
2149 detail::cuda_arg<cpu_value_type>(B),
2156 detail::cuda_arg<cpu_value_type>(C),
2162 else if (row_major_C && row_major_A && !row_major_B && transposed_A && !transposed_B)
2164 matrix_matrix_row_row_col_prod_TA_kernel<<<grid, threads>>>
2166 detail::cuda_arg<cpu_value_type>(A),
2172 detail::cuda_arg<cpu_value_type>(B),
2179 detail::cuda_arg<cpu_value_type>(C),
2185 else if (row_major_C && row_major_A && !row_major_B && transposed_A && transposed_B)
2187 matrix_matrix_row_row_col_prod_TT_kernel<<<grid, threads>>>
2189 detail::cuda_arg<cpu_value_type>(A),
2195 detail::cuda_arg<cpu_value_type>(B),
2202 detail::cuda_arg<cpu_value_type>(C),
2212 else if (row_major_C && row_major_A && row_major_B && !transposed_A && !transposed_B)
2214 matrix_matrix_row_row_row_prod_AA_kernel<<<grid, threads>>>
2216 detail::cuda_arg<cpu_value_type>(A),
2222 detail::cuda_arg<cpu_value_type>(B),
2229 detail::cuda_arg<cpu_value_type>(C),
2235 else if (row_major_C && row_major_A && row_major_B && !transposed_A && transposed_B)
2237 matrix_matrix_row_row_row_prod_AT_kernel<<<grid, threads>>>
2239 detail::cuda_arg<cpu_value_type>(A),
2245 detail::cuda_arg<cpu_value_type>(B),
2252 detail::cuda_arg<cpu_value_type>(C),
2258 else if (row_major_C && row_major_A && row_major_B && transposed_A && !transposed_B)
2260 matrix_matrix_row_row_row_prod_TA_kernel<<<grid, threads>>>
2262 detail::cuda_arg<cpu_value_type>(A),
2268 detail::cuda_arg<cpu_value_type>(B),
2275 detail::cuda_arg<cpu_value_type>(C),
2281 else if (row_major_C && row_major_A && row_major_B && transposed_A && transposed_B)
2283 matrix_matrix_row_row_row_prod_TT_kernel<<<grid, threads>>>
2285 detail::cuda_arg<cpu_value_type>(A),
2291 detail::cuda_arg<cpu_value_type>(B),
2298 detail::cuda_arg<cpu_value_type>(C),
2308 template<
typename MatrixT1,
typename MatrixT2,
typename MatrixT3,
typename ScalarT>
2314 std::string kernel_name)
2318 cpu_value_type cl_alpha =
static_cast<cpu_value_type
>(alpha);
2319 cpu_value_type cl_beta =
static_cast<cpu_value_type
>(beta);
2343 throw "not implemented yet";
2346 template<
typename MatrixT1,
typename MatrixT2,
typename MatrixT3,
typename ScalarT>
2347 void prod(
const MatrixT1 & A,
bool transposed_A,
2348 const MatrixT2 & B,
bool transposed_B,
2384 template<
typename NumericT,
typename ScalarT>
2416 template<
typename NumericT,
typename ScalarT>
2418 ScalarT
const & alpha,
vcl_size_t len_alpha,
bool reciprocal_alpha,
bool flip_sign_alpha,
2425 typedef NumericT value_type;
2427 unsigned int options_alpha =
detail::make_options(len_alpha, reciprocal_alpha, flip_sign_alpha);
2429 value_type temporary_alpha = 0;
2431 temporary_alpha = alpha;
2435 scaled_rank1_update_row_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2444 detail::cuda_arg<value_type>(vec1),
2449 detail::cuda_arg<value_type>(vec2),
2458 scaled_rank1_update_col_kernel<<<128, 128>>>(detail::cuda_arg<value_type>(mat1),
2467 detail::cuda_arg<value_type>(vec1),
2472 detail::cuda_arg<value_type>(vec2),
2489 template <
typename NumericT,
typename VectorType>
2497 viennacl::linalg::cuda::bidiag_pack_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2498 viennacl::linalg::cuda::detail::cuda_arg<NumericT>(dh),
2499 viennacl::linalg::cuda::detail::cuda_arg<NumericT>(sh),
2506 viennacl::linalg::cuda::bidiag_pack_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2507 viennacl::linalg::cuda::detail::cuda_arg<NumericT>(dh),
2508 viennacl::linalg::cuda::detail::cuda_arg<NumericT>(sh),
2526 template <
typename NumericT>
2538 copy_col_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2539 detail::cuda_arg<NumericT>(V),
2540 static_cast<unsigned int>(row_start),
2541 static_cast<unsigned int>(col_start),
2547 copy_col_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2548 detail::cuda_arg<NumericT>(V),
2549 static_cast<unsigned int>(row_start),
2550 static_cast<unsigned int>(col_start),
2561 copy_row_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2562 detail::cuda_arg<NumericT>(V),
2563 static_cast<unsigned int>(row_start),
2564 static_cast<unsigned int>(col_start),
2570 copy_row_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2571 detail::cuda_arg<NumericT>(V),
2572 static_cast<unsigned int>(row_start),
2573 static_cast<unsigned int>(col_start),
2587 template <
typename NumericT>
2594 house_update_A_left_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2595 detail::cuda_arg<NumericT>(D),
2596 static_cast<unsigned int>(start + 1),
2597 static_cast<unsigned int>(start),
2606 house_update_A_left_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2607 detail::cuda_arg<NumericT>(D),
2608 static_cast<unsigned int>(start + 1),
2609 static_cast<unsigned int>(start),
2626 template <
typename NumericT>
2632 house_update_A_right_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2633 detail::cuda_arg<NumericT>(D),
2634 static_cast<unsigned int>(0),
2635 static_cast<unsigned int>(0),
2644 house_update_A_right_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(A),
2645 detail::cuda_arg<NumericT>(D),
2646 static_cast<unsigned int>(0),
2647 static_cast<unsigned int>(0),
2663 template <
typename NumericT>
2671 house_update_QL_row_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2672 detail::cuda_arg<NumericT>(D),
2673 static_cast<unsigned int>(A_size1),
2678 house_update_QL_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2679 detail::cuda_arg<NumericT>(D),
2680 static_cast<unsigned int>(A_size1),
2694 template<
typename NumericT>
2703 detail::cuda_arg<NumericT>(tmp1),
2704 detail::cuda_arg<NumericT>(tmp2),
2707 static_cast<unsigned int>(l),
2708 static_cast<unsigned int>(m - 1));
2711 givens_next_column_major_kernel<<<128, 128>>>(detail::cuda_arg<NumericT>(Q),
2712 detail::cuda_arg<NumericT>(tmp1),
2713 detail::cuda_arg<NumericT>(tmp2),
2716 static_cast<unsigned int>(l),
2717 static_cast<unsigned int>(m - 1));
2721 #define VIENNACL_SECTION_SIZE 512
2729 template<
typename NumericT>
2738 detail::cuda_arg<NumericT>(vec1),
2743 detail::cuda_arg<NumericT>(vec2),
2747 detail::cuda_arg<NumericT>(S),
2752 detail::cuda_arg<NumericT>(S_ref),
2756 detail::cuda_arg<NumericT>(S),
2762 detail::cuda_arg<NumericT>(S_ref),
2766 detail::cuda_arg<NumericT>(S),
2771 detail::cuda_arg<NumericT>(S),
2775 detail::cuda_arg<NumericT>(vec2),
2787 template<
typename NumericT,
typename F>
2795 detail::cuda_arg<NumericT>(vec1),
2800 detail::cuda_arg<NumericT>(vec2),
2804 detail::cuda_arg<NumericT>(S),
2809 detail::cuda_arg<NumericT>(S_ref),
2813 detail::cuda_arg<NumericT>(S),
2819 detail::cuda_arg<NumericT>(S_ref),
2823 detail::cuda_arg<NumericT>(S),
2828 detail::cuda_arg<NumericT>(S),
2832 detail::cuda_arg<NumericT>(vec2),
2838 #undef VIENNACL_SECTION_SIZE
void house_update_QL(matrix_base< NumericT > &Q, vector_base< NumericT > &D, vcl_size_t A_size1)
This function updates the matrix Q, which is needed for the computation of the eigenvectors.
unsigned int make_options(vcl_size_t length, bool reciprocal, bool flip_sign)
void house_update_A_right(matrix_base< NumericT > &A, vector_base< NumericT > &D)
This function applies a householder transformation to a matrix: A <- A * P with a householder reflect...
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Generic size and resize functionality for different vector and matrix types.
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
__global__ void scan_kernel_2(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS, unsigned int InputSize)
Extracts the underlying OpenCL start index handle from a vector, a matrix, an expression etc...
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Implementations of row-major dense matrix related operations, including matrix-vector products...
size_type stride2() const
Returns the stride (increment between consecutive entries) along the second (column) dimension.
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
void clear(VectorType &vec)
Generic routine for setting all entries of a vector to zero. This is the version for non-ViennaCL obj...
This file provides the forward declarations for the main types used within ViennaCL.
result_of::size_type< T >::type start1(T const &obj)
void ambm(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
Determines row and column increments for matrices and matrix proxies.
Implementations of column-major dense matrix related operations, including matrix-vector products...
viennacl::scalar< int > s2
viennacl::scalar< float > s1
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
__global__ void scan_kernel_4(T *S, unsigned int startS, unsigned int incS, T *Y, unsigned int startY, unsigned int incY, unsigned int OutputSize)
void prod_slow_kernel(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
#define VIENNACL_SECTION_SIZE
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
result_of::size_type< T >::type start2(T const &obj)
Helper struct for checking whether a type is a host scalar type (e.g. float, double) ...
void prod_fast_kernel(const MatrixT1 &A, const MatrixT2 &B, MatrixT3 &C, ScalarT alpha, ScalarT beta, std::string kernel_name)
void am(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha)
void matrix_diag_to_vector(matrix_base< NumericT > const &mat, int k, vector_base< NumericT > &vec)
void house_update_A_left(matrix_base< NumericT > &A, vector_base< NumericT > &D, vcl_size_t start)
This function applies a householder transformation to a matrix. A <- P * A with a householder reflect...
result_of::size_type< T >::type start(T const &obj)
void inclusive_scan(vector_base< NumericT > &vec1, vector_base< NumericT > &vec2)
This function implements an inclusive scan.
void scaled_rank_1_update(matrix_base< NumericT > &mat1, ScalarT const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, const vector_base< NumericT > &vec1, const vector_base< NumericT > &vec2)
The implementation of the operation mat += alpha * vec1 * vec2^T, i.e. a scaled rank 1 update...
void ambm_m(matrix_base< NumericT > &mat1, matrix_base< NumericT > const &mat2, ScalarT1 const &alpha, vcl_size_t len_alpha, bool reciprocal_alpha, bool flip_sign_alpha, matrix_base< NumericT > const &mat3, ScalarT2 const &beta, vcl_size_t len_beta, bool reciprocal_beta, bool flip_sign_beta)
size_type stride1() const
Returns the stride (index increment) along the first (row) dimension.
void matrix_diag_from_vector(const vector_base< NumericT > &vec, int k, matrix_base< NumericT > &mat)
void matrix_diagonal_assign(matrix_base< NumericT > &mat, NumericT s)
Dense matrix-matrix product CUDA kernels reside here.
__global__ void inclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
void prod(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
Helper metafunction for checking whether the provided type is viennacl::op_div (for division) ...
T::ERROR_CANNOT_DEDUCE_CPU_SCALAR_TYPE_FOR_T type
Proxy classes for vectors.
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
__global__ void scan_kernel_3(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS)
All the predicates used within ViennaCL. Checks for expressions to be vectors, etc.
void matrix_column(const matrix_base< NumericT > &mat, unsigned int j, vector_base< NumericT > &vec)
void element_op(matrix_base< NumericT, SizeT > &A, matrix_expression< const matrix_base< NumericT, SizeT >, const matrix_base< NumericT, SizeT >, op_element_binary< OpT > > const &proxy)
Common routines for CUDA execution.
void matrix_row(matrix_base< NumericT > const &mat, unsigned int i, vector_base< NumericT > &vec)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
size_type size() const
Returns the length of the vector (cf. std::vector)
void bidiag_pack(matrix_base< NumericT > &A, VectorType &dh, VectorType &sh)
This function stores the diagonal and the superdiagonal of a matrix in two vectors.
A tag class representing transposed matrices.
size_type start2() const
Returns the start index (offset) along the second (column) dimension.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
A tag class representing element-wise binary operations (like multiplication) on vectors or matrices...
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
void givens_next(matrix_base< NumericT > &Q, vector_base< NumericT > &tmp1, vector_base< NumericT > &tmp2, int l, int m)
This function updates the matrix Q. It is part of the tql2 algorithm.
Extracts the underlying OpenCL handle from a vector, a matrix, an expression etc. ...
viennacl::backend::mem_handle & handle(T &obj)
Returns the generic memory handle of an object. Non-const version.
Helper metafunction for checking whether the provided type is viennacl::op_prod (for products/multipl...
A tag class representing element-wise unary operations (like sin()) on vectors or matrices...
Implementation of the ViennaCL scalar class.
Implementations of NMF operations using CUDA.
A collection of compile time type deductions.
__global__ void exclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
void matrix_assign(matrix_base< NumericT > &mat, NumericT s, bool clear=false)
viennacl::backend::mem_handle::cuda_handle_type & arg_reference(viennacl::scalar< NumericT > &s, OtherT)
void copy_vec(matrix_base< NumericT > &A, vector_base< NumericT > &V, vcl_size_t row_start, vcl_size_t col_start, bool copy_col)
This function copies a row or a column from a matrix to a vector.
Simple enable-if variant that uses the SFINAE pattern.
size_type start1() const
Returns the start index (offset) along the first (row) dimension.
void exclusive_scan(vector_base< NumericT, F > &vec1, vector_base< NumericT, F > &vec2)
This function implements an exclusive scan.