ViennaCL - The Vienna Computing Library 1.5.2
viennacl/linalg/cuda/sparse_matrix_operations.hpp
#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations on sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
  namespace linalg
  {
    namespace cuda
    {
      //
      // Compressed matrix
      //

      namespace detail
      {

        template <typename T>
        __global__ void csr_row_info_extractor_kernel(
                  const unsigned int * row_indices,
                  const unsigned int * column_indices,
                  const T * elements,
                  T * result,
                  unsigned int size,
                  unsigned int option)
        {
          for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                            row  < size;
                            row += gridDim.x * blockDim.x)
          {
            T value = 0;
            unsigned int row_end = row_indices[row+1];

            switch (option)
            {
              case 0: //inf-norm
                for (unsigned int i = row_indices[row]; i < row_end; ++i)
                  value = max(value, fabs(elements[i]));
                break;

              case 1: //1-norm
                for (unsigned int i = row_indices[row]; i < row_end; ++i)
                  value += fabs(elements[i]);
                break;

              case 2: //2-norm
                for (unsigned int i = row_indices[row]; i < row_end; ++i)
                  value += elements[i] * elements[i];
                value = sqrt(value);
                break;

              case 3: //diagonal entry
                for (unsigned int i = row_indices[row]; i < row_end; ++i)
                {
                  if (column_indices[i] == row)
                  {
                    value = elements[i];
                    break;
                  }
                }
                break;

              default:
                break;
            }
            result[row] = value;
          }
        }


        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
        void row_info(compressed_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
                      vector_base<ScalarType> & vec,
                      viennacl::linalg::detail::row_info_types info_selector)
        {
          csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(vec),
                                                      static_cast<unsigned int>(mat.size1()),
                                                      static_cast<unsigned int>(info_selector)
                                                     );
          VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
        }
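
        /* Example (illustrative sketch, not part of the library): row_info()
           fills 'vec' with one scalar per matrix row, selected by
           'info_selector' (row-wise inf-norm, 1-norm, 2-norm, or the diagonal
           entry; see the kernel above). Assuming the row_info_types
           enumerators declared in viennacl/forwards.h, extracting the diagonal
           of a compressed_matrix could look like this:

             viennacl::compressed_matrix<float> A(n, n);
             viennacl::vector<float> diag_A(n);
             // ... fill A ...
             viennacl::linalg::cuda::detail::row_info(A, diag_A,
                                                      viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);

           In user code this is normally reached indirectly, e.g. through
           preconditioners that need row norms, rather than called directly. */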

      } //namespace detail


      template <typename T>
      __global__ void compressed_matrix_vec_mul_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                const T * x,
                unsigned int start_x,
                unsigned int inc_x,
                T * result,
                unsigned int start_result,
                unsigned int inc_result,
                unsigned int size_result)
      {
        for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                          row  < size_result;
                          row += gridDim.x * blockDim.x)
        {
          T dot_prod = (T)0;
          unsigned int row_end = row_indices[row+1];
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
          result[row * inc_result + start_result] = dot_prod;
        }
      }


      /** @brief Carries out matrix-vector multiplication with a compressed_matrix, i.e. result = prod(mat, vec) */
      template<class ScalarType, unsigned int ALIGNMENT>
      void prod_impl(const viennacl::compressed_matrix<ScalarType, ALIGNMENT> & mat,
                     const viennacl::vector_base<ScalarType> & vec,
                           viennacl::vector_base<ScalarType> & result)
      {
        compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                       detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                       detail::cuda_arg<ScalarType>(vec),
                                                       static_cast<unsigned int>(vec.start()),
                                                       static_cast<unsigned int>(vec.stride()),
                                                       detail::cuda_arg<ScalarType>(result),
                                                       static_cast<unsigned int>(result.start()),
                                                       static_cast<unsigned int>(result.stride()),
                                                       static_cast<unsigned int>(result.size())
                                                      );
        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_kernel");
      }
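
      /* Usage sketch (illustrative, not part of the header): the CSR kernel
         above is reached through the generic product interface. Assuming
         hypothetical names A, x, y:

             #include "viennacl/compressed_matrix.hpp"
             #include "viennacl/vector.hpp"
             #include "viennacl/linalg/prod.hpp"

             viennacl::compressed_matrix<float> A(n, n);
             viennacl::vector<float> x(n), y(n);
             // ... copy data into A and x ...
             y = viennacl::linalg::prod(A, x);   // dispatches to prod_impl() above

         One thread handles one row of A; the grid-stride loop keeps the fixed
         <<<128, 128>>> launch correct for matrices with more than
         128 * 128 = 16384 rows. */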

      /** @brief Helper struct: computes the linear memory index of entry (i,j) of a row-major dense matrix with the given start, stride, and internal (padded) sizes */
      template <typename LayoutT>
      struct mat_mult_matrix_index
      {
        static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                      unsigned int row_start, unsigned int row_inc,
                                      unsigned int col_start, unsigned int col_inc,
                                      unsigned int internal_rows, unsigned int internal_cols)
        {
          return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
        }
      };

      /** @brief Specialization of the index helper for column-major dense matrices */
      template <>
      struct mat_mult_matrix_index<viennacl::column_major>
      {
        static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                      unsigned int row_start, unsigned int row_inc,
                                      unsigned int col_start, unsigned int col_inc,
                                      unsigned int internal_rows, unsigned int internal_cols)
        {
          return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
        }
      };
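
      /* Worked example (illustrative): for a row-major matrix with no offsets
         (row_start = col_start = 0, row_inc = col_inc = 1) and padded width
         internal_cols = 8, entry (i,j) = (2,3) maps to 2 * 8 + 3 = 19.
         For column-major storage with internal_rows = 8, the same entry maps
         to 2 + 3 * 8 = 26. The start/inc parameters exist so that the same
         kernels also operate on sub-matrix ranges and slices. */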
      /** @brief CUDA kernel for the sparse * dense product: blocks stride over result rows, threads within a block stride over result columns */
      template <typename DMatIndexT, typename ResultIndexT, typename T>
      __global__ void compressed_matrix_d_mat_mul_kernel(
                const unsigned int * sp_mat_row_indices,
                const unsigned int * sp_mat_col_indices,
                const T * sp_mat_elements,
                const T * d_mat,
                unsigned int d_mat_row_start,
                unsigned int d_mat_col_start,
                unsigned int d_mat_row_inc,
                unsigned int d_mat_col_inc,
                unsigned int d_mat_row_size,
                unsigned int d_mat_col_size,
                unsigned int d_mat_internal_rows,
                unsigned int d_mat_internal_cols,
                T * result,
                unsigned int result_row_start,
                unsigned int result_col_start,
                unsigned int result_row_inc,
                unsigned int result_col_inc,
                unsigned int result_row_size,
                unsigned int result_col_size,
                unsigned int result_internal_rows,
                unsigned int result_internal_cols) {

        for (unsigned int row  = blockIdx.x; row  < result_row_size; row += gridDim.x) {

          unsigned int row_start = sp_mat_row_indices[row];
          unsigned int row_end = sp_mat_row_indices[row+1];

          for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) {

            T r = 0;

            for (unsigned int k = row_start; k < row_end; k++) {

              unsigned int j = sp_mat_col_indices[k];
              T x = sp_mat_elements[k];
              T y = d_mat[ DMatIndexT::apply(j, col,
                                             d_mat_row_start, d_mat_row_inc,
                                             d_mat_col_start, d_mat_col_inc,
                                             d_mat_internal_rows, d_mat_internal_cols) ];

              r += x * y;
            }

            result [ ResultIndexT::apply(row, col,
                                         result_row_start, result_row_inc,
                                         result_col_start, result_col_inc,
                                         result_internal_rows, result_internal_cols) ] = r;
          }

        }

      }


      /** @brief Carries out sparse-matrix-dense-matrix multiplication, i.e. result = prod(sp_mat, d_mat) */
      template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
                     const viennacl::matrix_base<TYPE, F1> & d_mat,
                           viennacl::matrix_base<TYPE, F2> & result) {
        compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
                                                      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
                                                       detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),

                                                       detail::cuda_arg<TYPE>(d_mat),
                                                       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
                                                       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
                                                       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

                                                       detail::cuda_arg<TYPE>(result),
                                                       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
                                                       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
                                                       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
                                                      );
        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
      }
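
      /* Usage sketch (illustrative): multiplying a compressed_matrix by a
         dense matrix goes through the same generic interface. Assuming
         hypothetical names A, B, C:

             viennacl::compressed_matrix<float> A(m, k);
             viennacl::matrix<float> B(k, n), C(m, n);
             // ... fill A and B ...
             C = viennacl::linalg::prod(A, B);   // dispatches to prod_impl() above

         The two index functors are chosen from the layout tags F1/F2, so any
         combination of row-major and column-major operands uses the same
         kernel. */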


      /** @brief CUDA kernel for sp_mat * trans(d_mat): like the kernel above, but reads d_mat with row and column roles swapped */
      template <typename DMatIndexT, typename ResultIndexT, typename T>
      __global__ void compressed_matrix_d_tr_mat_mul_kernel(
                const unsigned int * sp_mat_row_indices,
                const unsigned int * sp_mat_col_indices,
                const T * sp_mat_elements,
                const T * d_mat,
                unsigned int d_mat_row_start,
                unsigned int d_mat_col_start,
                unsigned int d_mat_row_inc,
                unsigned int d_mat_col_inc,
                unsigned int d_mat_row_size,
                unsigned int d_mat_col_size,
                unsigned int d_mat_internal_rows,
                unsigned int d_mat_internal_cols,
                T * result,
                unsigned int result_row_start,
                unsigned int result_col_start,
                unsigned int result_row_inc,
                unsigned int result_col_inc,
                unsigned int result_row_size,
                unsigned int result_col_size,
                unsigned int result_internal_rows,
                unsigned int result_internal_cols) {

        for (unsigned int row  = blockIdx.x; row  < result_row_size; row += gridDim.x) {

          unsigned int row_start = sp_mat_row_indices[row];
          unsigned int row_end = sp_mat_row_indices[row+1];

          for ( unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x) {

            T r = 0;

            for (unsigned int k = row_start; k < row_end; k++) {

              unsigned int j = sp_mat_col_indices[k];
              T x = sp_mat_elements[k];
              T y = d_mat[ DMatIndexT::apply(col, j,
                                             d_mat_row_start, d_mat_row_inc,
                                             d_mat_col_start, d_mat_col_inc,
                                             d_mat_internal_rows, d_mat_internal_cols) ];

              r += x * y;
            }

            result [ ResultIndexT::apply(row, col,
                                         result_row_start, result_row_inc,
                                         result_col_start, result_col_inc,
                                         result_internal_rows, result_internal_cols) ] = r;
          }
        }

      }

      /** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication, i.e. result = prod(sp_mat, trans(d_mat)) */
      template< typename TYPE, unsigned int ALIGNMENT, typename F1, typename F2>
      void prod_impl(const viennacl::compressed_matrix<TYPE, ALIGNMENT> & sp_mat,
                     const viennacl::matrix_expression< const viennacl::matrix_base<TYPE, F1>,
                                                        const viennacl::matrix_base<TYPE, F1>,
                                                        viennacl::op_trans > & d_mat,
                           viennacl::matrix_base<TYPE, F2> & result) {

        compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
                                                      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
                                                       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
                                                       detail::cuda_arg<TYPE>(sp_mat.handle().cuda_handle()),

                                                       detail::cuda_arg<TYPE>(d_mat.lhs()),
                                                       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
                                                       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
                                                       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

                                                       detail::cuda_arg<TYPE>(result),
                                                       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
                                                       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
                                                       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
                                                       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
                                                      );
        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
      }
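
      /* Usage sketch (illustrative): the transposed variant is selected by
         wrapping the dense factor in trans():

             C = viennacl::linalg::prod(A, viennacl::trans(B));   // A: compressed_matrix, B: viennacl::matrix

         Only B's access pattern changes (DMatIndexT::apply(col, j, ...) in the
         kernel above); B is never materialized in transposed form. */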


      //
      // triangular solves for compressed_matrix
      //

      /** @brief CUDA kernel extracting the diagonal entries of a CSR matrix into 'result' */
      template <typename T>
      __global__ void compressed_matrix_diagonal_kernel(
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                T * result,
                unsigned int size)
      {
        for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                          row  < size;
                          row += gridDim.x * blockDim.x)
        {
          T diag = (T)0;
          unsigned int row_end = row_indices[row+1];
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
          {
            unsigned int col_index = column_indices[i];
            if (col_index == row)
            {
              diag = elements[i];
              break;
            }
          }
          result[row] = diag;
        }
      }


      /** @brief Inplace solution of a unit lower triangular system; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const SparseMatrixType & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::unit_lower_tag)
      {
        csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
      }


      /** @brief Inplace solution of a lower triangular system; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const SparseMatrixType & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::lower_tag)
      {
        csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
      }


      /** @brief Inplace solution of a unit upper triangular system; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const SparseMatrixType & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::unit_upper_tag)
      {
        csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
      }


      /** @brief Inplace solution of an upper triangular system; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const SparseMatrixType & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::upper_tag)
      {
        csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<ScalarType>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
      }
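
      /* Usage sketch (illustrative): the triangular solvers are tag-dispatched.
         Assuming a square compressed_matrix A and a right-hand side b:

             viennacl::linalg::inplace_solve(A, b, viennacl::linalg::lower_tag());       // b <- L^{-1} b
             viennacl::linalg::inplace_solve(A, b, viennacl::linalg::unit_upper_tag());  // unit diagonal assumed

         Note that these kernels launch a single thread block (<<<1, 128>>>),
         since the row-by-row substitution is inherently sequential across
         rows. */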



      // transposed

      /** @brief Inplace solution of a unit lower triangular system with the transposed matrix; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::unit_lower_tag)
      {
        csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
      }


      /** @brief Inplace solution of a lower triangular system with the transposed matrix; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::lower_tag)
      {
        viennacl::vector<ScalarType> diagonal(vec.size());

        compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(diagonal),
                                                      static_cast<unsigned int>(mat.size1())
                                                     );
        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_diagonal_kernel");

        csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<ScalarType>(diagonal),
                                                detail::cuda_arg<ScalarType>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
      }


      /** @brief Inplace solution of a unit upper triangular system with the transposed matrix; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::unit_upper_tag)
      {
        csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(vec),
                                                      static_cast<unsigned int>(mat.lhs().size1())
                                                    );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
      }


      /** @brief Inplace solution of an upper triangular system with the transposed matrix; vec is overwritten with the solution */
      template<typename SparseMatrixType, class ScalarType>
      typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixType>::value>::type
      inplace_solve(const matrix_expression<const SparseMatrixType, const SparseMatrixType, op_trans> & mat,
                    viennacl::vector_base<ScalarType> & vec,
                    viennacl::linalg::upper_tag)
      {
        viennacl::vector<ScalarType> diagonal(vec.size());

        compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(diagonal),
                                                      static_cast<unsigned int>(mat.size1())
                                                     );
        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_diagonal_kernel");

        csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                 detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                 detail::cuda_arg<ScalarType>(mat.lhs().handle().cuda_handle()),
                                                 detail::cuda_arg<ScalarType>(diagonal),
                                                 detail::cuda_arg<ScalarType>(vec),
                                                 static_cast<unsigned int>(mat.lhs().size1())
                                                );
        VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
      }
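
      /* Usage sketch (illustrative): solves with the transpose are expressed
         by wrapping the matrix in trans(); no transposed copy is formed:

             viennacl::linalg::inplace_solve(viennacl::trans(A), b, viennacl::linalg::upper_tag());

         For the non-unit-diagonal cases the diagonal of A is extracted into a
         temporary vector first (compressed_matrix_diagonal_kernel above),
         since a CSR matrix traversed column-wise cannot cheaply locate its
         diagonal during the substitution itself. */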

      namespace detail
      {
        //
        // block solves
        //
        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                         op_trans> & L,
                                 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                                 vector_base<ScalarType> const & /* L_diagonal */,  //ignored
                                 vector_base<ScalarType> & vec,
                                 viennacl::linalg::unit_lower_tag)
        {
          csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
                                                               detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
                                                               detail::cuda_arg<ScalarType>(L.lhs().handle().cuda_handle()),
                                                               detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                               detail::cuda_arg<ScalarType>(vec),
                                                               static_cast<unsigned int>(L.lhs().size1())
                                                              );
          VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_unit_lu_forward");
        }


        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
        void block_inplace_solve(const matrix_expression<const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                         const compressed_matrix<ScalarType, MAT_ALIGNMENT>,
                                                         op_trans> & U,
                                 viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                                 vector_base<ScalarType> const & U_diagonal,
                                 vector_base<ScalarType> & vec,
                                 viennacl::linalg::upper_tag)
        {
          csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
                                                           detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
                                                           detail::cuda_arg<ScalarType>(U.lhs().handle().cuda_handle()),
                                                           detail::cuda_arg<ScalarType>(U_diagonal.handle().cuda_handle()),
                                                           detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                           detail::cuda_arg<ScalarType>(vec),
                                                           static_cast<unsigned int>(U.lhs().size1())
                                                          );
          VIENNACL_CUDA_LAST_ERROR_CHECK("csr_block_trans_lu_backward");
        }


      } //namespace detail


      //
      // Compressed Compressed Matrix
      //

      template <typename T>
      __global__ void compressed_compressed_matrix_vec_mul_kernel(
                const unsigned int * row_jumper,
                const unsigned int * row_indices,
                const unsigned int * column_indices,
                const T * elements,
                unsigned int nonzero_rows,
                const T * x,
                unsigned int start_x,
                unsigned int inc_x,
                T * result,
                unsigned int start_result,
                unsigned int inc_result,
                unsigned int size_result)
      {
        // zero the full result vector first, since only rows with nonzeros are written below
        for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                          i  < size_result;
                          i += gridDim.x * blockDim.x)
        {
          result[i * inc_result + start_result] = 0;
        }

        // one thread per stored (nonzero) row; row_indices[i] holds the global row index
        for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                          i  < nonzero_rows;
                          i += gridDim.x * blockDim.x)
        {
          T dot_prod = (T)0;
          unsigned int row_end = row_jumper[i+1];
          for (unsigned int j = row_jumper[i]; j < row_end; ++j)
            dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
          result[row_indices[i] * inc_result + start_result] = dot_prod;
        }
      }


      /** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix, i.e. result = prod(mat, vec) */
      template<class ScalarType>
      void prod_impl(const viennacl::compressed_compressed_matrix<ScalarType> & mat,
                     const viennacl::vector_base<ScalarType> & vec,
                           viennacl::vector_base<ScalarType> & result)
      {
        compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                                  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                                  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                                  detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                                  static_cast<unsigned int>(mat.nnz1()),
                                                                  detail::cuda_arg<ScalarType>(vec),
                                                                  static_cast<unsigned int>(vec.start()),
                                                                  static_cast<unsigned int>(vec.stride()),
                                                                  detail::cuda_arg<ScalarType>(result),
                                                                  static_cast<unsigned int>(result.start()),
                                                                  static_cast<unsigned int>(result.stride()),
                                                                  static_cast<unsigned int>(result.size())
                                                                 );
        VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
      }
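
      /* Usage sketch (illustrative): compressed_compressed_matrix stores only
         rows that contain nonzeros (handle3() holds their row indices), which
         pays off for matrices in which most rows are empty:

             viennacl::compressed_compressed_matrix<float> A(m, n);
             viennacl::vector<float> x(n), y(m);
             // ... fill A and x ...
             y = viennacl::linalg::prod(A, x);

         The kernel first zeros the whole result vector and then scatters one
         dot product per stored (nonzero) row. */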

      //
      // Coordinate Matrix
      //


      namespace detail
      {

        template <typename T>
        __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
                                                const T * elements,
                                                const unsigned int * group_boundaries,
                                                T * result,
                                                unsigned int option)
        {
          __shared__ unsigned int shared_rows[128];
          __shared__ T inter_results[128];

          uint2 tmp;
          T val;
          unsigned int last_index  = blockDim.x - 1;
          unsigned int group_start = group_boundaries[blockIdx.x];
          unsigned int group_end   = group_boundaries[blockIdx.x + 1];
          unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // subtract 1 so that an exact multiple of blockDim.x does not yield an extra iteration

          unsigned int local_index = 0;

          for (unsigned int k = 0; k < k_end; ++k)
          {
            local_index = group_start + k * blockDim.x + threadIdx.x;

            tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
            val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

            //check for carry from previous loop run:
            if (threadIdx.x == 0 && k > 0)
            {
              if (tmp.x == shared_rows[last_index])
              {
                switch (option)
                {
                  case 0: //inf-norm
                  case 3: //diagonal entry
                    val = max(val, fabs(inter_results[last_index]));
                    break;

                  case 1: //1-norm
                    val = fabs(val) + inter_results[last_index];
                    break;

                  case 2: //2-norm
                    val = sqrt(val * val + inter_results[last_index]);
                    break;

                  default:
                    break;
                }
              }
              else
              {
                switch (option)
                {
                  case 0: //inf-norm
                  case 1: //1-norm
                  case 3: //diagonal entry
                    result[shared_rows[last_index]] = inter_results[last_index];
                    break;

                  case 2: //2-norm
                    result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
                    break;

                  default:
                    break;
                }
              }
            }

            //segmented parallel reduction begin
            __syncthreads();
            shared_rows[threadIdx.x] = tmp.x;
            switch (option)
            {
              case 0:
              case 3:
                inter_results[threadIdx.x] = val;
                break;
              case 1:
                inter_results[threadIdx.x] = fabs(val);
                break;
              case 2:
                inter_results[threadIdx.x] = val * val;
                break;
              default:
                break;
            }
            T left = 0;
            __syncthreads();

            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
            {
              left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
              __syncthreads();
              switch (option)
              {
                case 0: //inf-norm
                case 3: //diagonal entry
                  inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
                  break;

                case 1: //1-norm
                  inter_results[threadIdx.x] += left;
                  break;

                case 2: //2-norm
                  inter_results[threadIdx.x] += left;
                  break;

                default:
                  break;
              }
              __syncthreads();
            }
            //segmented parallel reduction end

            if (threadIdx.x != last_index &&
                shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
                inter_results[threadIdx.x] != 0)
            {
              result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
            }

            __syncthreads();
          } //for k

          if (threadIdx.x == last_index && inter_results[last_index] != 0)
            result[tmp.x] = (option == 2) ? sqrt(inter_results[last_index]) : inter_results[last_index];
        }
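
        /* How the segmented reduction works (illustrative): each thread loads
           one (row, value) pair; threads whose pairs belong to the same row
           form a segment. In each of the log2(blockDim.x) steps, a thread
           combines in the partial result 'stride' slots to its left, but only
           if that slot still belongs to the same row. For example, with rows
           {0, 0, 0, 1} and values {1, 2, 3, 4} under option 1 (1-norm):

             stride 1:  {1, 3, 5, 4}
             stride 2:  {1, 3, 6, 4}

           so the last thread of each segment ends up holding the full row
           result, which is then written out (with a carry into the next k
           iteration when a row straddles the blockDim.x boundary). */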

        template<typename ScalarType, unsigned int MAT_ALIGNMENT>
        void row_info(coordinate_matrix<ScalarType, MAT_ALIGNMENT> const & mat,
                      vector_base<ScalarType> & vec,
                      viennacl::linalg::detail::row_info_types info_selector)
        {
          coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                               detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                               detail::cuda_arg<ScalarType>(vec),
                                               static_cast<unsigned int>(info_selector)
                                              );
          VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
        }

      } //namespace detail


      template <typename T>
      __global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                       const T * elements,
                                                       const unsigned int * group_boundaries,
                                                       const T * x,
                                                       unsigned int start_x,
                                                       unsigned int inc_x,
                                                             T * result,
                                                       unsigned int start_result,
                                                       unsigned int inc_result
                                                       )
      {
        __shared__ unsigned int shared_rows[128];
        __shared__ T inter_results[128];

        uint2 tmp;
        T val;
        unsigned int group_start = group_boundaries[blockIdx.x];
        unsigned int group_end   = group_boundaries[blockIdx.x + 1];
        unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // subtract 1 so that an exact multiple of blockDim.x does not yield an extra iteration

        unsigned int local_index = 0;

        for (unsigned int k = 0; k < k_end; ++k)
        {
          local_index = group_start + k * blockDim.x + threadIdx.x;

          tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
          val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

          //check for carry from previous loop run:
          if (threadIdx.x == 0 && k > 0)
          {
            if (tmp.x == shared_rows[blockDim.x-1])
              val += inter_results[blockDim.x-1];
            else
              result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
          }

          //segmented parallel reduction begin
          __syncthreads();
          shared_rows[threadIdx.x] = tmp.x;
          inter_results[threadIdx.x] = val;
          T left = 0;
          __syncthreads();

          for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
          {
            left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
            __syncthreads();
            inter_results[threadIdx.x] += left;
            __syncthreads();
          }
          //segmented parallel reduction end

          if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
              shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
          {
            result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
          }

          __syncthreads();
        } //for k

        if (local_index + 1 == group_end)
          result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
      }


      /** @brief Carries out matrix-vector multiplication with a coordinate_matrix, i.e. result = prod(mat, vec) */
      template<class ScalarType, unsigned int ALIGNMENT>
      void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & mat,
                     const viennacl::vector_base<ScalarType> & vec,
                           viennacl::vector_base<ScalarType> & result)
      {
        result.clear();

        coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
                                                      detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                      detail::cuda_arg<ScalarType>(vec),
                                                      static_cast<unsigned int>(vec.start()),
                                                      static_cast<unsigned int>(vec.stride()),
                                                      detail::cuda_arg<ScalarType>(result),
                                                      static_cast<unsigned int>(result.start()),
                                                      static_cast<unsigned int>(result.stride())
                                                     );
        VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
      }
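
      /* Usage sketch (illustrative): coordinate_matrix stores explicit
         (row, column) pairs and is handled by the segmented-reduction kernel
         above, with the nonzeros pre-partitioned into 64 groups (handle3()):

             viennacl::coordinate_matrix<float> A(m, n);
             viennacl::vector<float> x(n), y(m);
             // ... fill A and x ...
             y = viennacl::linalg::prod(A, x);

         result.clear() is required beforehand because rows without any
         nonzero entry are never touched by the kernel. */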



      template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
      __global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                         const ScalarType * elements,
                                                         const unsigned int * group_boundaries,
                                                         const NumericT * d_mat,
                                                         unsigned int d_mat_row_start,
                                                         unsigned int d_mat_col_start,
                                                         unsigned int d_mat_row_inc,
                                                         unsigned int d_mat_col_inc,
                                                         unsigned int d_mat_row_size,
                                                         unsigned int d_mat_col_size,
                                                         unsigned int d_mat_internal_rows,
                                                         unsigned int d_mat_internal_cols,
                                                         NumericT * result,
                                                         unsigned int result_row_start,
                                                         unsigned int result_col_start,
                                                         unsigned int result_row_inc,
                                                         unsigned int result_col_inc,
                                                         unsigned int result_row_size,
                                                         unsigned int result_col_size,
                                                         unsigned int result_internal_rows,
                                                         unsigned int result_internal_cols)
      {
        __shared__ unsigned int shared_rows[128];
        __shared__ NumericT inter_results[128];

        uint2 tmp;
        NumericT val;
        unsigned int group_start = group_boundaries[blockIdx.x];
        unsigned int group_end   = group_boundaries[blockIdx.x + 1];
        unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // subtract 1 so that an exact multiple of blockDim.x does not yield an extra iteration

        unsigned int local_index = 0;

        for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
        {
          for (unsigned int k = 0; k < k_end; ++k)
          {
            local_index = group_start + k * blockDim.x + threadIdx.x;

            tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
            val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                              d_mat_row_start, d_mat_row_inc,
                                                                                              d_mat_col_start, d_mat_col_inc,
                                                                                              d_mat_internal_rows, d_mat_internal_cols) ] : 0;

            //check for carry from previous loop run:
            if (threadIdx.x == 0 && k > 0)
            {
              if (tmp.x == shared_rows[blockDim.x-1])
                val += inter_results[blockDim.x-1];
              else
                result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                           result_row_start, result_row_inc,
                                           result_col_start, result_col_inc,
                                           result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
            }

            //segmented parallel reduction begin
            __syncthreads();
            shared_rows[threadIdx.x] = tmp.x;
            inter_results[threadIdx.x] = val;
            NumericT left = 0;
            __syncthreads();

            for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
            {
              left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
              __syncthreads();
              inter_results[threadIdx.x] += left;
              __syncthreads();
            }
            //segmented parallel reduction end

            if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
                shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
            {
              result[ResultIndexT::apply(tmp.x, result_col,
                                         result_row_start, result_row_inc,
                                         result_col_start, result_col_inc,
                                         result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
            }

            __syncthreads();
          } //for k

          if (local_index + 1 == group_end)
            result[ResultIndexT::apply(tmp.x, result_col,
                                       result_row_start, result_row_inc,
                                       result_col_start, result_col_inc,
                                       result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
        }
      }
01068 
01069 
01078       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
01079       void prod_impl(const viennacl::coordinate_matrix<NumericT, ALIGNMENT> & sp_mat,
01080                      const viennacl::matrix_base<NumericT, F1> & d_mat,
01081                            viennacl::matrix_base<NumericT, F2> & result) {
01082 
01083         coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
01084                                                         (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
01085                                                          detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
01086                                                          detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
01087 
01088                                                          detail::cuda_arg<NumericT>(d_mat),
01089                                                          static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
01090                                                          static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
01091                                                          static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
01092                                                          static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
01093 
01094                                                          detail::cuda_arg<NumericT>(result),
01095                                                          static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
01096                                                          static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
01097                                                          static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
01098                                                          static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
01099                                                          );
01100         VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
01101       }
01102 
01103       template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT>
01104       __global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
01105                                                            const ScalarType * elements,
01106                                                            const unsigned int * group_boundaries,
01107                                                            const NumericT * d_mat,
01108                                                            unsigned int d_mat_row_start,
01109                                                            unsigned int d_mat_col_start,
01110                                                            unsigned int d_mat_row_inc,
01111                                                            unsigned int d_mat_col_inc,
01112                                                            unsigned int d_mat_row_size,
01113                                                            unsigned int d_mat_col_size,
01114                                                            unsigned int d_mat_internal_rows,
01115                                                            unsigned int d_mat_internal_cols,
01116                                                            NumericT * result,
01117                                                            unsigned int result_row_start,
01118                                                            unsigned int result_col_start,
01119                                                            unsigned int result_row_inc,
01120                                                            unsigned int result_col_inc,
01121                                                            unsigned int result_row_size,
01122                                                            unsigned int result_col_size,
01123                                                            unsigned int result_internal_rows,
01124                                                            unsigned int result_internal_cols)
01125       {
01126         __shared__ unsigned int shared_rows[128];
01127         __shared__ NumericT inter_results[128];
01128 
01129         uint2 tmp;
01130         NumericT val;
01131         unsigned int group_start = group_boundaries[blockIdx.x];
01132         unsigned int group_end   = group_boundaries[blockIdx.x + 1];
01133         unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // ceil((group_end - group_start) / blockDim.x); the -1 keeps this correct when the group size is an exact multiple of blockDim.x
01134 
01135         unsigned int local_index = 0;
01136 
01137         for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
01138         {
01139           for (unsigned int k = 0; k < k_end; ++k)
01140           {
01141             local_index = group_start + k * blockDim.x + threadIdx.x;
01142 
01143             tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
01144             val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
01145                                                                                               d_mat_row_start, d_mat_row_inc,
01146                                                                                               d_mat_col_start, d_mat_col_inc,
01147                                                                                               d_mat_internal_rows, d_mat_internal_cols)] : 0;
01148 
01149             //check for carry from previous loop run:
01150             if (threadIdx.x == 0 && k > 0)
01151             {
01152               if (tmp.x == shared_rows[blockDim.x-1])
01153                 val += inter_results[blockDim.x-1];
01154               else
01155                 result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
01156                                            result_row_start, result_row_inc,
01157                                            result_col_start, result_col_inc,
01158                                            result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
01159             }
01160 
01161             //segmented parallel reduction begin
01162             __syncthreads();
01163             shared_rows[threadIdx.x] = tmp.x;
01164             inter_results[threadIdx.x] = val;
01165             NumericT left = 0;
01166             __syncthreads();
01167 
01168             for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
01169             {
01170               left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
01171               __syncthreads();
01172               inter_results[threadIdx.x] += left;
01173               __syncthreads();
01174             }
01175             //segmented parallel reduction end
01176 
01177             if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
01178                 shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
01179             {
01180               result[ ResultIndexT::apply(tmp.x, result_col,
01181                                           result_row_start, result_row_inc,
01182                                           result_col_start, result_col_inc,
01183                                           result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
01184             }
01185 
01186             __syncthreads();
01187           } //for k
01188 
01189           if (local_index + 1 == group_end)
01190             result[ ResultIndexT::apply(tmp.x, result_col,
01191                                         result_row_start, result_row_inc,
01192                                         result_col_start, result_col_inc,
01193                                         result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
01194         }
01195       }
01196 
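01197       /** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
01198       *
01199       * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
01200       *
01201       * @param sp_mat   The sparse matrix (COO format)
01202       * @param d_mat    The transposed dense matrix
01203       * @param result   The dense result matrix
01204       */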
01205       template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2>
01206       void prod_impl(const viennacl::coordinate_matrix<ScalarType, ALIGNMENT> & sp_mat,
01207                      const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
01208                                                         const viennacl::matrix_base<NumericT, F1>,
01209                                                         viennacl::op_trans > & d_mat,
01210                            viennacl::matrix_base<NumericT, F2> & result) {
01211 
01212         coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<64, 128>>>
01213                                                           (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
01214                                                            detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
01215                                                            detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
01216 
01217                                                            detail::cuda_arg<NumericT>(d_mat.lhs()),
01218                                                            static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
01219                                                            static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
01220                                                            static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
01221                                                            static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
01222 
01223                                                            detail::cuda_arg<NumericT>(result),
01224                                                            static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
01225                                                            static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
01226                                                            static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
01227                                                            static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
01228                                                           );
01229 
01230         VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
01231       }
01232 
01233 
01234       //
01235       // ELL Matrix
01236       //
01237 
01238       template <typename T>
01239       __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
01240                                                 const T * elements,
01241                                                 const T * x,
01242                                                 unsigned int start_x,
01243                                                 unsigned int inc_x,
01244                                                       T * result,
01245                                                 unsigned int start_result,
01246                                                 unsigned int inc_result,
01247                                                 unsigned int row_num,
01248                                                 unsigned int col_num,
01249                                                 unsigned int internal_row_num,
01250                                                 unsigned int items_per_row,
01251                                                 unsigned int aligned_items_per_row
01252                                                )
01253       {
01254         unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
01255         unsigned int glb_sz = gridDim.x * blockDim.x;
01256 
01257         for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
01258         {
01259           T sum = 0;
01260 
01261           unsigned int offset = row_id;
01262           for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
01263           {
01264             T val = elements[offset];
01265 
01266             if(val != (T)0)
01267             {
01268               int col = coords[offset];
01269               sum += (x[col * inc_x + start_x] * val);
01270             }
01271           }
01272 
01273           result[row_id * inc_result + start_result] = sum;
01274         }
01275       }
01276 
01277 
01286       template<class ScalarType, unsigned int ALIGNMENT>
01287       void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & mat,
01288                      const viennacl::vector_base<ScalarType> & vec,
01289                            viennacl::vector_base<ScalarType> & result)
01290       {
01291         ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
01292                                                 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
01293                                                 detail::cuda_arg<ScalarType>(vec),
01294                                                 static_cast<unsigned int>(vec.start()),
01295                                                 static_cast<unsigned int>(vec.stride()),
01296                                                 detail::cuda_arg<ScalarType>(result),
01297                                                 static_cast<unsigned int>(result.start()),
01298                                                 static_cast<unsigned int>(result.stride()),
01299                                                 static_cast<unsigned int>(mat.size1()),
01300                                                 static_cast<unsigned int>(mat.size2()),
01301                                                 static_cast<unsigned int>(mat.internal_size1()),
01302                                                 static_cast<unsigned int>(mat.maxnnz()),
01303                                                 static_cast<unsigned int>(mat.internal_maxnnz())
01304                                                );
01305         VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
01306       }
01307 
01308       template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT >
01309       __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
01310                                                   const ScalarType * sp_mat_elements,
01311                                                   unsigned int sp_mat_row_num,
01312                                                   unsigned int sp_mat_col_num,
01313                                                   unsigned int sp_mat_internal_row_num,
01314                                                   unsigned int sp_mat_items_per_row,
01315                                                   unsigned int sp_mat_aligned_items_per_row,
01316                                                   const NumericT * d_mat,
01317                                                   unsigned int d_mat_row_start,
01318                                                   unsigned int d_mat_col_start,
01319                                                   unsigned int d_mat_row_inc,
01320                                                   unsigned int d_mat_col_inc,
01321                                                   unsigned int d_mat_row_size,
01322                                                   unsigned int d_mat_col_size,
01323                                                   unsigned int d_mat_internal_rows,
01324                                                   unsigned int d_mat_internal_cols,
01325                                                   NumericT * result,
01326                                                   unsigned int result_row_start,
01327                                                   unsigned int result_col_start,
01328                                                   unsigned int result_row_inc,
01329                                                   unsigned int result_col_inc,
01330                                                   unsigned int result_row_size,
01331                                                   unsigned int result_col_size,
01332                                                   unsigned int result_internal_rows,
01333                                                   unsigned int result_internal_cols) {
01334 
01335 
01336         unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
01337         unsigned int glb_sz = gridDim.x * blockDim.x;
01338         for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz) {
01339           unsigned int row = rc % sp_mat_row_num;
01340           unsigned int col = rc / sp_mat_row_num;
01341 
01342           unsigned int offset = row;
01343           NumericT r = (NumericT)0;
01344 
01345           for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) {
01346 
01347             unsigned int j = sp_mat_coords[offset];
01348             NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
01349 
01350             if(x != (NumericT)0) {
01351 
01352                 NumericT y = d_mat[ DMatIndexT::apply(j, col,
01353                                                       d_mat_row_start, d_mat_row_inc,
01354                                                       d_mat_col_start, d_mat_col_inc,
01355                                                       d_mat_internal_rows, d_mat_internal_cols) ];
01356 
01357                 r += x*y;
01358             }
01359           }
01360           result [ ResultIndexT::apply(row, col,
01361                                        result_row_start, result_row_inc,
01362                                        result_col_start, result_col_inc,
01363                                        result_internal_rows, result_internal_cols) ] = r;
01364         }
01365 
01366       }
01367 
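01368       /** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is an ell_matrix
01369       *
01370       * Implementation of the convenience expression result = prod(sp_mat, d_mat);
01371       * Each CUDA thread computes one entry of the result in a grid-stride loop.
01372       *
01373       * @param sp_mat   The sparse matrix (ELL format)
01374       * @param d_mat    The dense matrix
01375       * @param result   The dense result matrix
01376       */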
01377       template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
01378       void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
01379                      const viennacl::matrix_base<NumericT, F1> & d_mat,
01380                            viennacl::matrix_base<NumericT, F2> & result) {
01381 
01382         ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
01383                                                  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
01384                                                   detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
01385                                                   static_cast<unsigned int>(sp_mat.size1()),
01386                                                   static_cast<unsigned int>(sp_mat.size2()),
01387                                                   static_cast<unsigned int>(sp_mat.internal_size1()),
01388                                                   static_cast<unsigned int>(sp_mat.maxnnz()),
01389                                                   static_cast<unsigned int>(sp_mat.internal_maxnnz()),
01390                                                   detail::cuda_arg<NumericT>(d_mat),
01391                                                   static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
01392                                                   static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
01393                                                   static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
01394                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
01395 
01396                                                   detail::cuda_arg<NumericT>(result),
01397                                                   static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
01398                                                   static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
01399                                                   static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
01400                                                   static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
01401                                                );
01402         VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
01403       }
01404 
01405       template <typename DMatIndexT, typename ResultIndexT, typename ScalarType, typename NumericT >
01406       __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
01407                                                   const ScalarType * sp_mat_elements,
01408                                                   unsigned int sp_mat_row_num,
01409                                                   unsigned int sp_mat_col_num,
01410                                                   unsigned int sp_mat_internal_row_num,
01411                                                   unsigned int sp_mat_items_per_row,
01412                                                   unsigned int sp_mat_aligned_items_per_row,
01413                                                   const NumericT * d_mat,
01414                                                   unsigned int d_mat_row_start,
01415                                                   unsigned int d_mat_col_start,
01416                                                   unsigned int d_mat_row_inc,
01417                                                   unsigned int d_mat_col_inc,
01418                                                   unsigned int d_mat_row_size,
01419                                                   unsigned int d_mat_col_size,
01420                                                   unsigned int d_mat_internal_rows,
01421                                                   unsigned int d_mat_internal_cols,
01422                                                   NumericT * result,
01423                                                   unsigned int result_row_start,
01424                                                   unsigned int result_col_start,
01425                                                   unsigned int result_row_inc,
01426                                                   unsigned int result_col_inc,
01427                                                   unsigned int result_row_size,
01428                                                   unsigned int result_col_size,
01429                                                   unsigned int result_internal_rows,
01430                                                   unsigned int result_internal_cols) {
01431 
01432 
01433         unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
01434         unsigned int glb_sz = gridDim.x * blockDim.x;
01435         for( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz) {
01436           unsigned int row = rc % sp_mat_row_num;
01437           unsigned int col = rc / sp_mat_row_num;
01438 
01439           unsigned int offset = row;
01440           NumericT r = (NumericT)0;
01441 
01442           for(unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num) {
01443 
01444             unsigned int j = sp_mat_coords[offset];
01445             NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
01446 
01447             if(x != (NumericT)0) {
01448 
01449                 NumericT y = d_mat[ DMatIndexT::apply(col, j,
01450                                                       d_mat_row_start, d_mat_row_inc,
01451                                                       d_mat_col_start, d_mat_col_inc,
01452                                                       d_mat_internal_rows, d_mat_internal_cols) ];
01453 
01454                 r += x*y;
01455             }
01456           }
01457           result [ ResultIndexT::apply(row, col,
01458                                        result_row_start, result_row_inc,
01459                                        result_col_start, result_col_inc,
01460                                        result_internal_rows, result_internal_cols) ] = r;
01461         }
01462 
01463       }
01464 
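01465       /** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication, where the sparse matrix is an ell_matrix
01466       *
01467       * Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
01468       * Each CUDA thread computes one entry of the result in a grid-stride loop.
01469       *
01470       * @param sp_mat   The sparse matrix (ELL format)
01471       * @param d_mat    The transposed dense matrix
01472       * @param result   The dense result matrix
01473       */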
01474       template<class ScalarType, unsigned int ALIGNMENT, class NumericT, typename F1, typename F2 >
01475       void prod_impl(const viennacl::ell_matrix<ScalarType, ALIGNMENT> & sp_mat,
01476                      const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
01477                                                         const viennacl::matrix_base<NumericT, F1>,
01478                                                         viennacl::op_trans > & d_mat,
01479                            viennacl::matrix_base<NumericT, F2> & result) {
01480 
01481         ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<128, 128>>>
01482                                                     (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
01483                                                      detail::cuda_arg<ScalarType>(sp_mat.handle().cuda_handle()),
01484                                                      static_cast<unsigned int>(sp_mat.size1()),
01485                                                      static_cast<unsigned int>(sp_mat.size2()),
01486                                                      static_cast<unsigned int>(sp_mat.internal_size1()),
01487                                                      static_cast<unsigned int>(sp_mat.maxnnz()),
01488                                                      static_cast<unsigned int>(sp_mat.internal_maxnnz()),
01489 
01490                                                      detail::cuda_arg<NumericT>(d_mat.lhs()),
01491                                                      static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
01492                                                      static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
01493                                                      static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
01494                                                      static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
01495 
01496                                                      detail::cuda_arg<NumericT>(result),
01497                                                      static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
01498                                                      static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
01499                                                      static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
01500                                                      static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
01501                                                );
01502 
01503         VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
01504       }
01505 
01506       //
01507       // Hybrid Matrix
01508       //
01509 
01510 
01511       template <typename T>
01512       __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
01513                                                 const T * ell_elements,
01514                                                 const unsigned int * csr_rows,
01515                                                 const unsigned int * csr_cols,
01516                                                 const T * csr_elements,
01517                                                 const T * x,
01518                                                 unsigned int start_x,
01519                                                 unsigned int inc_x,
01520                                                       T * result,
01521                                                 unsigned int start_result,
01522                                                 unsigned int inc_result,
01523                                                 unsigned int row_num,
01524                                                 unsigned int internal_row_num,
01525                                                 unsigned int items_per_row,
01526                                                 unsigned int aligned_items_per_row
01527                                                )
01528       {
01529         unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
01530         unsigned int glb_sz = gridDim.x * blockDim.x;
01531 
01532         for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
01533         {
01534           T sum = 0;
01535 
01536           unsigned int offset = row_id;
01537           for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
01538           {
01539             T val = ell_elements[offset];
01540 
01541 
01542             if (val != (T)0)
01543             {
01544               int col = ell_coords[offset];
01545               sum += (x[col * inc_x + start_x] * val);
01546             }
01547           }
01548 
01549           unsigned int col_begin = csr_rows[row_id];
01550           unsigned int col_end   = csr_rows[row_id + 1];
01551 
01552           for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
01553           {
01554             sum += (x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id]);
01555           }
01556 
01557           result[row_id * inc_result + start_result] = sum;
01558         }
01559       }
01560 
01561 
01562 
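01563       /** @brief Carries out matrix-vector multiplication with a hyb_matrix
01564       *
01565       * Implementation of the convenience expression result = prod(mat, vec);
01566       *
01567       * @param mat    The sparse matrix (HYB format: ELL part plus CSR overflow)
01568       * @param vec    The vector
01569       * @param result The result vector
01570       */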
01571       template<class ScalarType, unsigned int ALIGNMENT>
01572       void prod_impl(const viennacl::hyb_matrix<ScalarType, ALIGNMENT> & mat,
01573                      const viennacl::vector_base<ScalarType> & vec,
01574                            viennacl::vector_base<ScalarType> & result)
01575       {
01576         hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
01577                                                 detail::cuda_arg<ScalarType>(mat.handle().cuda_handle()),
01578                                                 detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
01579                                                 detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
01580                                                 detail::cuda_arg<ScalarType>(mat.handle5().cuda_handle()),
01581                                                 detail::cuda_arg<ScalarType>(vec),
01582                                                 static_cast<unsigned int>(vec.start()),
01583                                                 static_cast<unsigned int>(vec.stride()),
01584                                                 detail::cuda_arg<ScalarType>(result),
01585                                                 static_cast<unsigned int>(result.start()),
01586                                                 static_cast<unsigned int>(result.stride()),
01587                                                 static_cast<unsigned int>(mat.size1()),
01588                                                 static_cast<unsigned int>(mat.internal_size1()),
01589                                                 static_cast<unsigned int>(mat.ell_nnz()),
01590                                                 static_cast<unsigned int>(mat.internal_ellnnz())
01591                                                );
01592         VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
01593       }
01594 
01595 
01596 
01597       template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
01598       __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
01599                                                 const NumericT * ell_elements,
01600                                                 const unsigned int * csr_rows,
01601                                                 const unsigned int * csr_cols,
01602                                                 const NumericT * csr_elements,
01603                                                 unsigned int row_num,
01604                                                 unsigned int internal_row_num,
01605                                                 unsigned int items_per_row,
01606                                                 unsigned int aligned_items_per_row,
01607                                                 const NumericT * d_mat,
01608                                                 unsigned int d_mat_row_start,
01609                                                 unsigned int d_mat_col_start,
01610                                                 unsigned int d_mat_row_inc,
01611                                                 unsigned int d_mat_col_inc,
01612                                                 unsigned int d_mat_row_size,
01613                                                 unsigned int d_mat_col_size,
01614                                                 unsigned int d_mat_internal_rows,
01615                                                 unsigned int d_mat_internal_cols,
01616                                                 NumericT * result,
01617                                                 unsigned int result_row_start,
01618                                                 unsigned int result_col_start,
01619                                                 unsigned int result_row_inc,
01620                                                 unsigned int result_col_inc,
01621                                                 unsigned int result_row_size,
01622                                                 unsigned int result_col_size,
01623                                                 unsigned int result_internal_rows,
01624                                                 unsigned int result_internal_cols)
01625       {
01626         unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
01627         unsigned int glb_sz = gridDim.x * blockDim.x;
01628 
01629         for(unsigned int result_col = 0; result_col < result_col_size; ++result_col)
01630         {
01631           for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
01632           {
01633             NumericT sum = 0;
01634 
01635             unsigned int offset = row_id;
01636             for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
01637             {
01638               NumericT val = ell_elements[offset];
01639 
01640               if (val != (NumericT)0)
01641               {
01642                 sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
01643                                                d_mat_row_start, d_mat_row_inc,
01644                                                d_mat_col_start, d_mat_col_inc,
01645                                                d_mat_internal_rows, d_mat_internal_cols)] * val;
01646               }
01647             }
01648 
01649             unsigned int col_begin = csr_rows[row_id];
01650             unsigned int col_end   = csr_rows[row_id + 1];
01651 
01652             for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
01653             {
01654               sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
01655                                              d_mat_row_start, d_mat_row_inc,
01656                                              d_mat_col_start, d_mat_col_inc,
01657                                              d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
01658             }
01659 
01660             result[ResultIndexT::apply(row_id, result_col,
01661                                        result_row_start, result_row_inc,
01662                                        result_col_start, result_col_inc,
01663                                        result_internal_rows, result_internal_cols)] = sum;
01664           }
01665         }
01666       }
01667 
01668 
01669 
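01670       /** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a hyb_matrix
01671       *
01672       * Implementation of the convenience expression result = prod(mat, d_mat);
01673       *
01674       * @param mat      The sparse matrix (HYB format)
01675       * @param d_mat    The dense matrix
01676       * @param result   The dense result matrix
01677       */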
01678       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
01679       void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
01680                      const viennacl::matrix_base<NumericT, F1> & d_mat,
01681                            viennacl::matrix_base<NumericT, F2> & result)
01682       {
01683         hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>(
01684           detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
01685           detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
01686           detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
01687           detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
01688           detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
01689           static_cast<unsigned int>(mat.size1()),
01690           static_cast<unsigned int>(mat.internal_size1()),
01691           static_cast<unsigned int>(mat.ell_nnz()),
01692           static_cast<unsigned int>(mat.internal_ellnnz()),
01693 
01694           detail::cuda_arg<NumericT>(d_mat),
01695           static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
01696           static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
01697           static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
01698           static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
01699 
01700           detail::cuda_arg<NumericT>(result),
01701           static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
01702           static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
01703           static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
01704           static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
01705          );
01706         VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
01707       }
01708 
01709 
01710 
01711       template <typename DMatIndexT, typename ResultIndexT, typename NumericT>
01712       __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
01713                                                 const NumericT * ell_elements,
01714                                                 const unsigned int * csr_rows,
01715                                                 const unsigned int * csr_cols,
01716                                                 const NumericT * csr_elements,
01717                                                 unsigned int row_num,
01718                                                 unsigned int internal_row_num,
01719                                                 unsigned int items_per_row,
01720                                                 unsigned int aligned_items_per_row,
01721                                                 const NumericT * d_mat,
01722                                                 unsigned int d_mat_row_start,
01723                                                 unsigned int d_mat_col_start,
01724                                                 unsigned int d_mat_row_inc,
01725                                                 unsigned int d_mat_col_inc,
01726                                                 unsigned int d_mat_row_size,
01727                                                 unsigned int d_mat_col_size,
01728                                                 unsigned int d_mat_internal_rows,
01729                                                 unsigned int d_mat_internal_cols,
01730                                                 NumericT * result,
01731                                                 unsigned int result_row_start,
01732                                                 unsigned int result_col_start,
01733                                                 unsigned int result_row_inc,
01734                                                 unsigned int result_col_inc,
01735                                                 unsigned int result_row_size,
01736                                                 unsigned int result_col_size,
01737                                                 unsigned int result_internal_rows,
01738                                                 unsigned int result_internal_cols)
01739       {
01740         unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
01741         unsigned int glb_sz = gridDim.x * blockDim.x;
01742 
01743         for(unsigned int result_col = 0; result_col < result_col_size; ++result_col)
01744         {
01745           for(unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
01746           {
01747             NumericT sum = 0;
01748 
01749             unsigned int offset = row_id;
01750             for(unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
01751             {
01752               NumericT val = ell_elements[offset];
01753 
01754               if (val != (NumericT)0)
01755               {
01756                 sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
01757                                                d_mat_row_start, d_mat_row_inc,
01758                                                d_mat_col_start, d_mat_col_inc,
01759                                                d_mat_internal_rows, d_mat_internal_cols)] * val;
01760               }
01761             }
01762 
01763             unsigned int col_begin = csr_rows[row_id];
01764             unsigned int col_end   = csr_rows[row_id + 1];
01765 
01766             for(unsigned int item_id = col_begin; item_id < col_end; item_id++)
01767             {
01768               sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
01769                                              d_mat_row_start, d_mat_row_inc,
01770                                              d_mat_col_start, d_mat_col_inc,
01771                                              d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
01772             }
01773 
01774             result[ResultIndexT::apply(row_id, result_col,
01775                                        result_row_start, result_row_inc,
01776                                        result_col_start, result_col_inc,
01777                                        result_internal_rows, result_internal_cols)] = sum;
01778           }
01779         }
01780       }
01781 
01782 
01783 
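01784       /** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication, where the sparse matrix is a hyb_matrix
01785       *
01786       * Implementation of the convenience expression result = prod(mat, trans(d_mat));
01787       *
01788       * @param mat      The sparse matrix (HYB format)
01789       * @param d_mat    The transposed dense matrix
01790       * @param result   The dense result matrix
01791       */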
01792       template<typename NumericT, unsigned int ALIGNMENT, typename F1, typename F2>
01793       void prod_impl(const viennacl::hyb_matrix<NumericT, ALIGNMENT> & mat,
01794                      const viennacl::matrix_expression< const viennacl::matrix_base<NumericT, F1>,
01795                                                         const viennacl::matrix_base<NumericT, F1>,
01796                                                         viennacl::op_trans > & d_mat,
01797                            viennacl::matrix_base<NumericT, F2> & result)
01798       {
01799         hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<F1>, mat_mult_matrix_index<F2> ><<<256, 128>>>(
01800           detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
01801           detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
01802           detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
01803           detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
01804           detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
01805           static_cast<unsigned int>(mat.size1()),
01806           static_cast<unsigned int>(mat.internal_size1()),
01807           static_cast<unsigned int>(mat.ell_nnz()),
01808           static_cast<unsigned int>(mat.internal_ellnnz()),
01809 
01810           detail::cuda_arg<NumericT>(d_mat.lhs()),
01811           static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
01812           static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
01813           static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
01814           static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
01815 
01816           detail::cuda_arg<NumericT>(result),
01817           static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
01818           static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
01819           static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
01820           static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
01821          );
01822         VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
01823       }
01824 
01825 
01826     } // namespace cuda
01827   } //namespace linalg
01828 } //namespace viennacl
01829 
01830 
01831 #endif