#ifndef VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_SPARSE_MATRIX_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/sparse_matrix_operations.hpp
    @brief Implementations of operations using sparse matrices using CUDA
*/

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"

#include "viennacl/linalg/cuda/sparse_matrix_operations_solve.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
//
// Compressed matrix
//

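// The compressed_matrix kernels below assume the standard CSR layout:
// row_indices holds size1()+1 offsets into column_indices/elements, so the
// nonzeros of row i occupy positions row_indices[i] to row_indices[i+1]-1.
// A minimal illustration (hypothetical data, purely for orientation):
//
//   CSR encoding of the 3x3 matrix [4 0 1; 0 2 0; 3 0 5]:
//     row_indices    = {0, 2, 3, 5};          // m+1 offsets
//     column_indices = {0, 2, 1, 0, 2};       // column of each nonzero
//     elements       = {4., 1., 2., 3., 5.};  // nonzero values
//   Row 2 (0-based) spans entries row_indices[2]..row_indices[3]-1 = 3..4.
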
namespace detail
{

  template<typename NumericT>
  __global__ void csr_row_info_extractor_kernel(
            const unsigned int * row_indices,
            const unsigned int * column_indices,
            const NumericT * elements,
            NumericT * result,
            unsigned int size,
            unsigned int option)
  {
    for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                      row  < size;
                      row += gridDim.x * blockDim.x)
    {
      NumericT value = 0;
      unsigned int row_end = row_indices[row+1];

      switch (option)
      {
        case 0: //inf-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value = max(value, fabs(elements[i]));
          break;

        case 1: //1-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += fabs(elements[i]);
          break;

        case 2: //2-norm
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
            value += elements[i] * elements[i];
          value = sqrt(value);
          break;

        case 3: //diagonal entry
          for (unsigned int i = row_indices[row]; i < row_end; ++i)
          {
            if (column_indices[i] == row)
            {
              value = elements[i];
              break;
            }
          }
          break;

        default:
          break;
      }
      result[row] = value;
    }
  }


  template<typename NumericT, unsigned int AlignmentV>
  void row_info(compressed_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    csr_row_info_extractor_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.size1()),
                                                static_cast<unsigned int>(info_selector)
                                               );
    VIENNACL_CUDA_LAST_ERROR_CHECK("csr_row_info_extractor_kernel");
  }
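  // row_info() is the device-side backend used by higher layers to extract
  // per-row data, e.g. the diagonal (option 3 above) for Jacobi-type smoothers.
  // A usage sketch (illustrative; assumes the SPARSE_ROW_DIAGONAL enumerator of
  // viennacl::linalg::detail::row_info_types from viennacl/forwards.h):
  //
  //   viennacl::compressed_matrix<double> A(n, n);
  //   viennacl::vector<double> diag(n);
  //   viennacl::linalg::cuda::detail::row_info(A, diag,
  //                                            viennacl::linalg::detail::SPARSE_ROW_DIAGONAL);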

} //namespace detail


template<typename NumericT>
__global__ void compressed_matrix_vec_mul_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size_result;
                    row += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
      dot_prod += elements[i] * x[column_indices[i] * inc_x + start_x];
    result[row * inc_result + start_result] = dot_prod;
  }
}


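// The kernel above is the classical "CSR scalar" SpMV: one thread per row,
// which load-balances poorly when row lengths vary. The adaptive kernel below
// instead walks precomputed row blocks (handle3() of compressed_matrix):
// consecutive rows are grouped so that each group's nonzeros fit the
// 1024-entry shared buffer; a group holding a single long row takes the
// CSR-vector path. A host-side sketch of such a partitioning (illustrative
// only; ViennaCL builds the actual row_blocks array itself during assembly):
//
//   std::vector<unsigned int> row_blocks(1, 0);
//   unsigned int nnz_in_block = 0;
//   for (unsigned int row = 0; row < num_rows; ++row)
//   {
//     unsigned int nnz_in_row = row_indices[row+1] - row_indices[row];
//     if (nnz_in_block + nnz_in_row > 1024 && nnz_in_block > 0)
//     {
//       row_blocks.push_back(row);     // close the current block before this row
//       nnz_in_block = 0;
//     }
//     nnz_in_block += nnz_in_row;
//     if (nnz_in_block > 1024)         // a single row exceeding the buffer
//     {
//       row_blocks.push_back(row + 1); // gets a block of its own (CSR-vector path)
//       nnz_in_block = 0;
//     }
//   }
//   if (row_blocks.back() != num_rows)
//     row_blocks.push_back(num_rows);
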
template<typename NumericT>
__global__ void compressed_matrix_vec_mul_adaptive_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const unsigned int * row_blocks,
          const NumericT * elements,
          unsigned int num_blocks,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  __shared__ NumericT shared_elements[1024];

  for (unsigned int block_id = blockIdx.x; block_id < num_blocks; block_id += gridDim.x)
  {
    unsigned int row_start     = row_blocks[block_id];
    unsigned int row_stop      = row_blocks[block_id + 1];
    unsigned int element_start = row_indices[row_start];
    unsigned int element_stop  = row_indices[row_stop];
    unsigned int rows_to_process = row_stop - row_start;

    if (rows_to_process > 1)  // CSR stream with one thread per row
    {
      // load to shared buffer:
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[i - element_start] = elements[i] * x[column_indices[i] * inc_x + start_x];

      __syncthreads();

      // use one thread per row to sum:
      for (unsigned int row = row_start + threadIdx.x; row < row_stop; row += blockDim.x)
      {
        NumericT dot_prod = 0;
        unsigned int thread_row_start = row_indices[row]     - element_start;
        unsigned int thread_row_stop  = row_indices[row + 1] - element_start;
        for (unsigned int i = thread_row_start; i < thread_row_stop; ++i)
          dot_prod += shared_elements[i];
        result[row * inc_result + start_result] = dot_prod;
      }
    }
    // TODO here: Consider CSR vector for two to four rows (cf. OpenCL implementation. Experience on Fermi suggests that this may not be necessary)
    else  // CSR vector for a single row
    {
      // load and sum to shared buffer:
      shared_elements[threadIdx.x] = 0;
      for (unsigned int i = element_start + threadIdx.x; i < element_stop; i += blockDim.x)
        shared_elements[threadIdx.x] += elements[i] * x[column_indices[i] * inc_x + start_x];

      // reduction to obtain final result
      for (unsigned int stride = blockDim.x/2; stride > 0; stride /= 2)
      {
        __syncthreads();
        if (threadIdx.x < stride)
          shared_elements[threadIdx.x] += shared_elements[threadIdx.x+stride];
      }

      if (threadIdx.x == 0)
        result[row_start * inc_result + start_result] = shared_elements[0];
    }

    __syncthreads();  // avoid race conditions
  }
}




/** @brief Carries out matrix-vector multiplication with a compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<class NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  compressed_matrix_vec_mul_adaptive_kernel<<<256, 256>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                          detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                          static_cast<unsigned int>(mat.blocks1()),
                                                          detail::cuda_arg<NumericT>(vec),
                                                          static_cast<unsigned int>(vec.start()),
                                                          static_cast<unsigned int>(vec.stride()),
                                                          detail::cuda_arg<NumericT>(result),
                                                          static_cast<unsigned int>(result.start()),
                                                          static_cast<unsigned int>(result.stride()),
                                                          static_cast<unsigned int>(result.size())
                                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_vec_mul_adaptive_kernel");
}
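// Callers do not invoke prod_impl() directly; it is selected by the backend
// dispatch of viennacl::linalg::prod(). A usage sketch (illustrative):
//
//   viennacl::compressed_matrix<double> A(m, n);
//   viennacl::vector<double> x(n), y(m);
//   // ... fill A and x ...
//   y = viennacl::linalg::prod(A, x);   // lands in the prod_impl() above on CUDA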

/** @brief Helper struct for accessing an element of a row- or column-major matrix.
  *
  * @param LayoutT   The layout tag: Either row_major or column_major
  */
template<typename LayoutT>
struct mat_mult_matrix_index
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) * internal_cols + col_start + j * col_inc;
  }
};

template<>
struct mat_mult_matrix_index<viennacl::column_major>
{
  static __device__ unsigned int apply(unsigned int i, unsigned int j,
                                       unsigned int row_start, unsigned int row_inc,
                                       unsigned int col_start, unsigned int col_inc,
                                       unsigned int internal_rows, unsigned int internal_cols)
  {
    return (row_start + i * row_inc) + (col_start + j * col_inc) * internal_rows;
  }
};
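// Both specializations map a logical (i,j) of a (possibly strided) sub-matrix
// view onto a linear buffer offset. Worked example with a view starting at
// (2,3), unit strides, and internal padding of 8 in each direction:
//
//   row-major:    apply(1,1, 2,1, 3,1, 8,8) == (2+1)*8 + (3+1) == 28
//   column-major: apply(1,1, 2,1, 3,1, 8,8) == (2+1) + (3+1)*8 == 35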

template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(j, col,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ResultIndexT::apply(row, col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = r;
    }
  }
}


/** @brief Carries out sparse_matrix-matrix multiplication, first matrix being compressed
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix
* @param d_mat    The dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_mat_mul_kernel");
  }
}
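// Usage sketch (illustrative): mixed storage orders are handled by the four
// dispatch branches above, so the following is legal:
//
//   viennacl::compressed_matrix<float> A(m, k);
//   viennacl::matrix<float> B(k, n);                            // row-major default
//   viennacl::matrix<float, viennacl::column_major> C(m, n);
//   C = viennacl::linalg::prod(A, B);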


template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void compressed_matrix_d_tr_mat_mul_kernel(
          const unsigned int * sp_mat_row_indices,
          const unsigned int * sp_mat_col_indices,
          const NumericT * sp_mat_elements,
          const NumericT * d_mat,
          unsigned int d_mat_row_start,
          unsigned int d_mat_col_start,
          unsigned int d_mat_row_inc,
          unsigned int d_mat_col_inc,
          unsigned int d_mat_row_size,
          unsigned int d_mat_col_size,
          unsigned int d_mat_internal_rows,
          unsigned int d_mat_internal_cols,
          NumericT * result,
          unsigned int result_row_start,
          unsigned int result_col_start,
          unsigned int result_row_inc,
          unsigned int result_col_inc,
          unsigned int result_row_size,
          unsigned int result_col_size,
          unsigned int result_internal_rows,
          unsigned int result_internal_cols)
{
  for (unsigned int row = blockIdx.x; row < result_row_size; row += gridDim.x)
  {
    unsigned int row_start = sp_mat_row_indices[row];
    unsigned int row_end   = sp_mat_row_indices[row+1];

    for (unsigned int col = threadIdx.x; col < result_col_size; col += blockDim.x)
    {
      NumericT r = 0;

      for (unsigned int k = row_start; k < row_end; k++)
      {
        unsigned int j = sp_mat_col_indices[k];
        NumericT x = sp_mat_elements[k];
        NumericT y = d_mat[ DMatIndexT::apply(col, j,
                                              d_mat_row_start, d_mat_row_inc,
                                              d_mat_col_start, d_mat_col_inc,
                                              d_mat_internal_rows, d_mat_internal_cols) ];

        r += x * y;
      }

      result[ ResultIndexT::apply(row, col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = r;
    }
  }

}

/** @brief Carries out matrix-trans(matrix) multiplication, first matrix being compressed
*          and the second transposed
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat   The sparse matrix
* @param d_mat    The transposed dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::compressed_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{

  if (d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
  else
  {
    compressed_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle1().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_matrix_d_tr_mat_mul_kernel");
  }
}
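// Usage sketch (illustrative): the trans(d_mat) expression is built by
// viennacl::trans(); no explicit transpose of B is ever materialized:
//
//   viennacl::compressed_matrix<float> A(m, k);
//   viennacl::matrix<float> B(n, k);
//   viennacl::matrix<float> C(m, n);
//   C = viennacl::linalg::prod(A, viennacl::trans(B));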


//
// triangular solves for compressed_matrix
//

template<typename NumericT>
__global__ void compressed_matrix_diagonal_kernel(
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          NumericT * result,
          unsigned int size)
{
  for (unsigned int row  = blockDim.x * blockIdx.x + threadIdx.x;
                    row  < size;
                    row += gridDim.x * blockDim.x)
  {
    NumericT diag = NumericT(0);
    unsigned int row_end = row_indices[row+1];
    for (unsigned int i = row_indices[row]; i < row_end; ++i)
    {
      unsigned int col_index = column_indices[i];
      if (col_index == row)
      {
        diag = elements[i];
        break;
      }
    }
    result[row] = diag;
  }
}


/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                         detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                         detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                         detail::cuda_arg<NumericT>(vec),
                                         static_cast<unsigned int>(mat.size1())
                                        );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_forward_kernel");
}
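// Usage sketch (illustrative): the tag argument selects the variant, and the
// right-hand side is overwritten with the solution:
//
//   viennacl::compressed_matrix<double> L(n, n);   // lower triangular, unit diagonal
//   viennacl::vector<double> rhs(n);
//   viennacl::linalg::inplace_solve(L, rhs, viennacl::linalg::unit_lower_tag());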


/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  csr_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                    detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                    detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                    detail::cuda_arg<NumericT>(vec),
                                    static_cast<unsigned int>(mat.size1())
                                   );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_forward_kernel");
}



/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_unit_lu_backward_kernel");
}


/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const SparseMatrixT & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  csr_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                     detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                     detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                     detail::cuda_arg<NumericT>(vec),
                                     static_cast<unsigned int>(mat.size1())
                                    );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_lu_backward_kernel");
}



// transposed

/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_lower_tag)
{
  csr_trans_unit_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                               detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                               detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                               detail::cuda_arg<NumericT>(vec),
                                               static_cast<unsigned int>(mat.lhs().size1())
                                              );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_forward_kernel");
}


/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::lower_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_forward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                          detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                          detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                          detail::cuda_arg<NumericT>(diagonal),
                                          detail::cuda_arg<NumericT>(vec),
                                          static_cast<unsigned int>(mat.lhs().size1())
                                         );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_forward_kernel");
}
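// Note on the non-unit transposed solves: traversing trans(mat) row-by-row
// means column-wise access to the stored CSR data, so the diagonal A(i,i)
// cannot be located cheaply during the substitution itself. It is therefore
// extracted once up front, and the substitution kernel computes
//
//   x_i = ( b_i - sum_{j<i} A(j,i) * x_j ) / A(i,i)
//
// using the precomputed diagonal vector.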


/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::unit_upper_tag)
{
  csr_trans_unit_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(mat.lhs().size1())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_unit_lu_backward_kernel");
}


/** @brief Carries out triangular inplace solves
*
* @param mat    The matrix
* @param vec    The vector holding the right hand side. Is overwritten by the solution.
*/
template<typename SparseMatrixT, typename NumericT>
typename viennacl::enable_if< viennacl::is_any_sparse_matrix<SparseMatrixT>::value>::type
inplace_solve(const matrix_expression<const SparseMatrixT, const SparseMatrixT, op_trans> & mat,
              viennacl::vector_base<NumericT> & vec,
              viennacl::linalg::upper_tag)
{
  viennacl::vector<NumericT> diagonal(vec.size());

  compressed_matrix_diagonal_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                                detail::cuda_arg<NumericT>(diagonal),
                                                static_cast<unsigned int>(mat.size1())
                                               );

  csr_trans_lu_backward_kernel<<<1, 128>>>(detail::cuda_arg<unsigned int>(mat.lhs().handle1().cuda_handle()),
                                           detail::cuda_arg<unsigned int>(mat.lhs().handle2().cuda_handle()),
                                           detail::cuda_arg<NumericT>(mat.lhs().handle().cuda_handle()),
                                           detail::cuda_arg<NumericT>(diagonal),
                                           detail::cuda_arg<NumericT>(vec),
                                           static_cast<unsigned int>(mat.lhs().size1())
                                          );
  VIENNACL_CUDA_LAST_ERROR_CHECK("csr_trans_lu_backward_kernel");
}

namespace detail
{
  //
  // block solves
  //
  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & L,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & /* L_diagonal */,  //ignored
                           vector_base<NumericT> & vec,
                           viennacl::linalg::unit_lower_tag)
  {
    csr_block_trans_unit_lu_forward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(L.lhs().handle1().cuda_handle()),
                                                         detail::cuda_arg<unsigned int>(L.lhs().handle2().cuda_handle()),
                                                         detail::cuda_arg<NumericT>(L.lhs().handle().cuda_handle()),
                                                         detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                         detail::cuda_arg<NumericT>(vec),
                                                         static_cast<unsigned int>(L.lhs().size1())
                                                        );
  }


  template<typename NumericT, unsigned int AlignmentV>
  void block_inplace_solve(const matrix_expression<const compressed_matrix<NumericT, AlignmentV>,
                                                   const compressed_matrix<NumericT, AlignmentV>,
                                                   op_trans> & U,
                           viennacl::backend::mem_handle const & block_indices, vcl_size_t num_blocks,
                           vector_base<NumericT> const & U_diagonal,
                           vector_base<NumericT> & vec,
                           viennacl::linalg::upper_tag)
  {
    csr_block_trans_lu_backward<<<num_blocks, 128>>>(detail::cuda_arg<unsigned int>(U.lhs().handle1().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(U.lhs().handle2().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(U.lhs().handle().cuda_handle()),
                                                     detail::cuda_arg<NumericT>(U_diagonal.handle().cuda_handle()),
                                                     detail::cuda_arg<unsigned int>(block_indices.cuda_handle()),
                                                     detail::cuda_arg<NumericT>(vec),
                                                     static_cast<unsigned int>(U.lhs().size1())
                                                    );
  }


}


//
// Compressed Compressed Matrix
//

template<typename NumericT>
__global__ void compressed_compressed_matrix_vec_mul_kernel(
          const unsigned int * row_jumper,
          const unsigned int * row_indices,
          const unsigned int * column_indices,
          const NumericT * elements,
          unsigned int nonzero_rows,
          const NumericT * x,
          unsigned int start_x,
          unsigned int inc_x,
          NumericT * result,
          unsigned int start_result,
          unsigned int inc_result,
          unsigned int size_result)
{
  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < size_result;
                    i += gridDim.x * blockDim.x)
  {
    result[i * inc_result + start_result] = 0;
  }

  for (unsigned int i  = blockDim.x * blockIdx.x + threadIdx.x;
                    i  < nonzero_rows;
                    i += gridDim.x * blockDim.x)
  {
    NumericT dot_prod = NumericT(0);
    unsigned int row_end = row_jumper[i+1];
    for (unsigned int j = row_jumper[i]; j < row_end; ++j)
      dot_prod += elements[j] * x[column_indices[j] * inc_x + start_x];
    result[row_indices[i] * inc_result + start_result] = dot_prod;
  }
}
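// compressed_compressed_matrix stores only rows that carry at least one
// nonzero: row_jumper has nonzero_rows+1 offsets, and row_indices maps each
// stored row back to its actual row index. Illustration (hypothetical data):
//
//   The 4x3 matrix [0 0 0; 4 0 1; 0 0 0; 0 2 0] becomes
//     row_jumper     = {0, 2, 3};     // offsets for the 2 nonzero rows
//     row_indices    = {1, 3};        // their positions in the full matrix
//     column_indices = {0, 2, 1};
//     elements       = {4., 1., 2.};
//   which is why the kernel zeros all of result before scattering dot products.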


/** @brief Carries out matrix-vector multiplication with a compressed_compressed_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT>
void prod_impl(const viennacl::compressed_compressed_matrix<NumericT> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  compressed_compressed_matrix_vec_mul_kernel<<<128, 128>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                            detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
                                                            detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                            static_cast<unsigned int>(mat.nnz1()),
                                                            detail::cuda_arg<NumericT>(vec),
                                                            static_cast<unsigned int>(vec.start()),
                                                            static_cast<unsigned int>(vec.stride()),
                                                            detail::cuda_arg<NumericT>(result),
                                                            static_cast<unsigned int>(result.start()),
                                                            static_cast<unsigned int>(result.stride()),
                                                            static_cast<unsigned int>(result.size())
                                                           );
  VIENNACL_CUDA_LAST_ERROR_CHECK("compressed_compressed_matrix_vec_mul_kernel");
}

//
// Coordinate Matrix
//


namespace detail
{

  template<typename NumericT>
  __global__ void coo_row_info_extractor( const unsigned int * coords, //(row_index, column_index)
                                          const NumericT * elements,
                                          const unsigned int * group_boundaries,
                                          NumericT * result,
                                          unsigned int option)
  {
    __shared__ unsigned int shared_rows[128];
    __shared__ NumericT inter_results[128];

    uint2 tmp;
    NumericT val;
    unsigned int last_index  = blockDim.x - 1;
    unsigned int group_start = group_boundaries[blockIdx.x];
    unsigned int group_end   = group_boundaries[blockIdx.x + 1];
    unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

    unsigned int local_index = 0;

    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[last_index])
        {
          switch (option)
          {
            case 0: //inf-norm
            case 3: //diagonal entry
              val = max(val, fabs(inter_results[last_index]));
              break;

            case 1: //1-norm
              val = fabs(val) + inter_results[last_index];
              break;

            case 2: //2-norm
              val = sqrt(val * val + inter_results[last_index]);
              break;

            default:
              break;
          }
        }
        else
        {
          switch (option)
          {
            case 0: //inf-norm
            case 1: //1-norm
            case 3: //diagonal entry
              result[shared_rows[last_index]] = inter_results[last_index];
              break;

            case 2: //2-norm
              result[shared_rows[last_index]] = sqrt(inter_results[last_index]);
            default:
              break;
          }
        }
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      switch (option)
      {
        case 0:
        case 3:
          inter_results[threadIdx.x] = val;
          break;
        case 1:
          inter_results[threadIdx.x] = fabs(val);
          break;
        case 2:
          inter_results[threadIdx.x] = val * val;
        default:
          break;
      }
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        NumericT left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        switch (option)
        {
          case 0: //inf-norm
          case 3: //diagonal entry
            inter_results[threadIdx.x] = max(inter_results[threadIdx.x], left);
            break;

          case 1: //1-norm
            inter_results[threadIdx.x] += left;
            break;

          case 2: //2-norm
            inter_results[threadIdx.x] += left;
            break;

          default:
            break;
        }
        __syncthreads();
      }
      //segmented parallel reduction end

      if (threadIdx.x != last_index &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1] &&
          inter_results[threadIdx.x] != 0)
      {
        result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end && inter_results[threadIdx.x] != 0)
      result[tmp.x] = (option == 2) ? sqrt(inter_results[threadIdx.x]) : inter_results[threadIdx.x];
  }

  template<typename NumericT, unsigned int AlignmentV>
  void row_info(coordinate_matrix<NumericT, AlignmentV> const & mat,
                vector_base<NumericT> & vec,
                viennacl::linalg::detail::row_info_types info_selector)
  {
    coo_row_info_extractor<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                        detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                        detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                        detail::cuda_arg<NumericT>(vec),
                                        static_cast<unsigned int>(info_selector)
                                       );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coo_row_info_extractor");
  }

} //namespace detail


template<typename NumericT>
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                 const NumericT * elements,
                                                 const unsigned int * group_boundaries,
                                                 const NumericT * x,
                                                 unsigned int start_x,
                                                 unsigned int inc_x,
                                                 NumericT * result,
                                                 unsigned int start_result,
                                                 unsigned int inc_result
                                                )
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int k = 0; k < k_end; ++k)
  {
    local_index = group_start + k * blockDim.x + threadIdx.x;

    tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
    val = (local_index < group_end) ? elements[local_index] * x[tmp.y * inc_x + start_x] : 0;

    //check for carry from previous loop run:
    if (threadIdx.x == 0 && k > 0)
    {
      if (tmp.x == shared_rows[blockDim.x-1])
        val += inter_results[blockDim.x-1];
      else
        result[shared_rows[blockDim.x-1] * inc_result + start_result] = inter_results[blockDim.x-1];
    }

    //segmented parallel reduction begin
    __syncthreads();
    shared_rows[threadIdx.x] = tmp.x;
    inter_results[threadIdx.x] = val;
    NumericT left = 0;
    __syncthreads();

    for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
    {
      left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
      __syncthreads();
      inter_results[threadIdx.x] += left;
      __syncthreads();
    }
    //segmented parallel reduction end

    if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
        shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
    {
      result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
    }

    __syncthreads();
  } //for k

  if (local_index + 1 == group_end)
    result[tmp.x * inc_result + start_result] = inter_results[threadIdx.x];
}
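// The stride loop above is a segmented parallel reduction: a thread adds its
// left neighbour's partial sum only while that neighbour works on the same
// row, so per-row dot products emerge without any atomics. A serial reference
// of the same idea (illustrative):
//
//   for (unsigned int stride = 1; stride < n; stride *= 2)
//     for (unsigned int t = n; t-- > stride; )      // descending t: each read
//       if (rows[t] == rows[t - stride])            // sees the previous round
//         partial[t] += partial[t - stride];
//   // afterwards partial[t] sums all entries of row rows[t] at indices <= t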


/** @brief Carries out matrix-vector multiplication with a coordinate_matrix
*
* Implementation of the convenience expression result = prod(mat, vec);
*
* @param mat    The matrix
* @param vec    The vector
* @param result The result vector
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & mat,
               const viennacl::vector_base<NumericT> & vec,
                     viennacl::vector_base<NumericT> & result)
{
  result.clear();  // the kernel writes only rows that carry nonzeros

  coordinate_matrix_vec_mul_kernel<<<64, 128>>>(detail::cuda_arg<unsigned int>(mat.handle12().cuda_handle()),
                                                detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
                                                detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
                                                detail::cuda_arg<NumericT>(vec),
                                                static_cast<unsigned int>(vec.start()),
                                                static_cast<unsigned int>(vec.stride()),
                                                detail::cuda_arg<NumericT>(result),
                                                static_cast<unsigned int>(result.start()),
                                                static_cast<unsigned int>(result.stride())
                                               );
  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_vec_mul_kernel");
}




template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                   const NumericT * elements,
                                                   const unsigned int * group_boundaries,
                                                   const NumericT * d_mat,
                                                   unsigned int d_mat_row_start,
                                                   unsigned int d_mat_col_start,
                                                   unsigned int d_mat_row_inc,
                                                   unsigned int d_mat_col_inc,
                                                   unsigned int d_mat_row_size,
                                                   unsigned int d_mat_col_size,
                                                   unsigned int d_mat_internal_rows,
                                                   unsigned int d_mat_internal_cols,
                                                   NumericT * result,
                                                   unsigned int result_row_start,
                                                   unsigned int result_col_start,
                                                   unsigned int result_row_inc,
                                                   unsigned int result_col_inc,
                                                   unsigned int result_row_size,
                                                   unsigned int result_col_size,
                                                   unsigned int result_internal_rows,
                                                   unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(tmp.y, result_col,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols) ] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols)] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ResultIndexT::apply(tmp.x, result_col,
                                   result_row_start, result_row_inc,
                                   result_col_start, result_col_inc,
                                   result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ResultIndexT::apply(tmp.x, result_col,
                                 result_row_start, result_row_inc,
                                 result_col_start, result_col_inc,
                                 result_internal_rows, result_internal_cols)] = inter_results[threadIdx.x];
  }
}


/** @brief Carries out sparse-matrix-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, d_mat);
*
* @param sp_mat   The sparse matrix (COO format)
* @param d_mat    The dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_base<NumericT> & d_mat,
                     viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (d_mat.row_major() && !result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else if (!d_mat.row_major() && result.row_major())
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }
  else
  {
    coordinate_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat)),         static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat)),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat)),          static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_mat_mul_kernel");
  }

}

template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int * coords, //(row_index, column_index)
                                                      const NumericT * elements,
                                                      const unsigned int * group_boundaries,
                                                      const NumericT * d_mat,
                                                      unsigned int d_mat_row_start,
                                                      unsigned int d_mat_col_start,
                                                      unsigned int d_mat_row_inc,
                                                      unsigned int d_mat_col_inc,
                                                      unsigned int d_mat_row_size,
                                                      unsigned int d_mat_col_size,
                                                      unsigned int d_mat_internal_rows,
                                                      unsigned int d_mat_internal_cols,
                                                      NumericT * result,
                                                      unsigned int result_row_start,
                                                      unsigned int result_col_start,
                                                      unsigned int result_row_inc,
                                                      unsigned int result_col_inc,
                                                      unsigned int result_row_size,
                                                      unsigned int result_col_size,
                                                      unsigned int result_internal_rows,
                                                      unsigned int result_internal_cols)
{
  __shared__ unsigned int shared_rows[128];
  __shared__ NumericT inter_results[128];

  uint2 tmp;
  NumericT val;
  unsigned int group_start = group_boundaries[blockIdx.x];
  unsigned int group_end   = group_boundaries[blockIdx.x + 1];
  unsigned int k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / blockDim.x : 0;   // -1 in order to have correct behavior if group_end - group_start == j * blockDim.x

  unsigned int local_index = 0;

  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
  {
    for (unsigned int k = 0; k < k_end; ++k)
    {
      local_index = group_start + k * blockDim.x + threadIdx.x;

      tmp = (local_index < group_end) ? ((const uint2 *)coords)[local_index] : ::make_uint2(0, 0);
      val = (local_index < group_end) ? elements[local_index] * d_mat[DMatIndexT::apply(result_col, tmp.y,
                                                                                        d_mat_row_start, d_mat_row_inc,
                                                                                        d_mat_col_start, d_mat_col_inc,
                                                                                        d_mat_internal_rows, d_mat_internal_cols)] : 0;

      //check for carry from previous loop run:
      if (threadIdx.x == 0 && k > 0)
      {
        if (tmp.x == shared_rows[blockDim.x-1])
          val += inter_results[blockDim.x-1];
        else
          result[ResultIndexT::apply(shared_rows[blockDim.x-1], result_col,
                                     result_row_start, result_row_inc,
                                     result_col_start, result_col_inc,
                                     result_internal_rows, result_internal_cols) ] = inter_results[blockDim.x-1];
      }

      //segmented parallel reduction begin
      __syncthreads();
      shared_rows[threadIdx.x] = tmp.x;
      inter_results[threadIdx.x] = val;
      NumericT left = 0;
      __syncthreads();

      for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
      {
        left = (threadIdx.x >= stride && tmp.x == shared_rows[threadIdx.x - stride]) ? inter_results[threadIdx.x - stride] : 0;
        __syncthreads();
        inter_results[threadIdx.x] += left;
        __syncthreads();
      }
      //segmented parallel reduction end

      if (local_index < group_end && threadIdx.x < blockDim.x-1 &&
          shared_rows[threadIdx.x] != shared_rows[threadIdx.x + 1])
      {
        result[ ResultIndexT::apply(tmp.x, result_col,
                                    result_row_start, result_row_inc,
                                    result_col_start, result_col_inc,
                                    result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
      }

      __syncthreads();
    } //for k

    if (local_index + 1 == group_end)
      result[ ResultIndexT::apply(tmp.x, result_col,
                                  result_row_start, result_row_inc,
                                  result_col_start, result_col_inc,
                                  result_internal_rows, result_internal_cols) ] = inter_results[threadIdx.x];
  }
}

/** @brief Carries out sparse-matrix-transposed-dense-matrix multiplication, where the sparse matrix is a coordinate_matrix
*
* Implementation of the convenience expression result = prod(sp_mat, trans(d_mat));
*
* @param sp_mat   The sparse matrix (COO format)
* @param d_mat    The transposed dense matrix
* @param result   The result matrix
*/
template<typename NumericT, unsigned int AlignmentV>
void prod_impl(const viennacl::coordinate_matrix<NumericT, AlignmentV> & sp_mat,
               const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
                                                  const viennacl::matrix_base<NumericT>,
                                                  viennacl::op_trans > & d_mat,
               viennacl::matrix_base<NumericT> & result)
{
  if (d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (d_mat.lhs().row_major() && !result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
  }
  else if (!d_mat.lhs().row_major() && result.row_major())
  {
    coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<64, 128>>>
      (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
       detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
       detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),

       detail::cuda_arg<NumericT>(d_mat.lhs()),
       static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())),         static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())),        static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())),          static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
       static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),

       detail::cuda_arg<NumericT>(result),
       static_cast<unsigned int>(viennacl::traits::start1(result)),         static_cast<unsigned int>(viennacl::traits::start2(result)),
       static_cast<unsigned int>(viennacl::traits::stride1(result)),        static_cast<unsigned int>(viennacl::traits::stride2(result)),
       static_cast<unsigned int>(viennacl::traits::size1(result)),          static_cast<unsigned int>(viennacl::traits::size2(result)),
       static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
      );
    VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1544  }
1545  else
1546  {
1547  coordinate_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<64, 128>>>
1548  (detail::cuda_arg<unsigned int>(sp_mat.handle12().cuda_handle()),
1549  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1550  detail::cuda_arg<unsigned int>(sp_mat.handle3().cuda_handle()),
1551 
1552  detail::cuda_arg<NumericT>(d_mat.lhs()),
1553  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1554  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1555  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1556  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1557 
1558  detail::cuda_arg<NumericT>(result),
1559  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1560  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1561  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1562  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1563  );
1564  VIENNACL_CUDA_LAST_ERROR_CHECK("coordinate_matrix_d_tr_mat_mul_kernel");
1565  }
1566 }
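// [Editorial sketch, not part of the ViennaCL source] These prod_impl() overloads
// are not meant to be called directly; viennacl::linalg::prod() dispatches to them
// when the CUDA backend is active (VIENNACL_WITH_CUDA). A hedged usage sketch,
// with assumed dimensions m, k, n:
//
//   viennacl::coordinate_matrix<float> A(m, k);  // filled via viennacl::copy()
//   viennacl::matrix<float> B(n, k), C(m, n);
//   C = viennacl::linalg::prod(A, viennacl::trans(B));  // lands in the overload above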
1567 
1568 
1569 //
1570 // ELL Matrix
1571 //
1572 
1573 template<typename NumericT>
1574 __global__ void ell_matrix_vec_mul_kernel(const unsigned int * coords,
1575  const NumericT * elements,
1576  const NumericT * x,
1577  unsigned int start_x,
1578  unsigned int inc_x,
1579  NumericT * result,
1580  unsigned int start_result,
1581  unsigned int inc_result,
1582  unsigned int row_num,
1583  unsigned int col_num,
1584  unsigned int internal_row_num,
1585  unsigned int items_per_row,
1586  unsigned int aligned_items_per_row
1587  )
1588 {
1589  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1590  unsigned int glb_sz = gridDim.x * blockDim.x;
1591 
1592  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
1593  {
1594  NumericT sum = 0;
1595 
1596  unsigned int offset = row_id;
1597  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
1598  {
1599  NumericT val = elements[offset];
1600 
1601  if (val != NumericT(0))
1602  {
1603  int col = coords[offset];
1604  sum += x[col * inc_x + start_x] * val;
1605  }
1606  }
1607 
1608  result[row_id * inc_result + start_result] = sum;
1609  }
1610 }
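// [Editorial sketch, not part of the ViennaCL source] ELL storage layout as read
// by the kernel above: entries are stored slot-by-slot across all rows, padded
// with zeros up to items_per_row. For instance, the 3x3 matrix
//   [1 2 0]
//   [0 3 0]
//   [4 0 5]
// with items_per_row == 2 and internal_row_num == 3 is laid out as
//   elements = {1, 3, 4,  2, 0, 5}   // slot 0 of all rows, then slot 1
//   coords   = {0, 1, 0,  1, 0, 2}
// which is why the per-row stride in the loop is internal_row_num, not items_per_row.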
1611 
1612 
1621 template<typename NumericT, unsigned int AlignmentV>
1622 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & mat,
1623  const viennacl::vector_base<NumericT> & vec,
1624  viennacl::vector_base<NumericT> & result)
1625 {
1626  ell_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
1627  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
1628  detail::cuda_arg<NumericT>(vec),
1629  static_cast<unsigned int>(vec.start()),
1630  static_cast<unsigned int>(vec.stride()),
1631  detail::cuda_arg<NumericT>(result),
1632  static_cast<unsigned int>(result.start()),
1633  static_cast<unsigned int>(result.stride()),
1634  static_cast<unsigned int>(mat.size1()),
1635  static_cast<unsigned int>(mat.size2()),
1636  static_cast<unsigned int>(mat.internal_size1()),
1637  static_cast<unsigned int>(mat.maxnnz()),
1638  static_cast<unsigned int>(mat.internal_maxnnz())
1639  );
1640  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_vec_mul_kernel");
1641 }
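// [Editorial sketch, not part of the ViennaCL source] Typical entry point,
// assuming A was assembled on the host and transferred with viennacl::copy():
//
//   viennacl::ell_matrix<double> A;      // filled via viennacl::copy()
//   viennacl::vector<double> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);    // dispatches to prod_impl() above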
1642 
1643 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
1644 __global__ void ell_matrix_d_mat_mul_kernel(const unsigned int * sp_mat_coords,
1645  const NumericT * sp_mat_elements,
1646  unsigned int sp_mat_row_num,
1647  unsigned int sp_mat_col_num,
1648  unsigned int sp_mat_internal_row_num,
1649  unsigned int sp_mat_items_per_row,
1650  unsigned int sp_mat_aligned_items_per_row,
1651  const NumericT * d_mat,
1652  unsigned int d_mat_row_start,
1653  unsigned int d_mat_col_start,
1654  unsigned int d_mat_row_inc,
1655  unsigned int d_mat_col_inc,
1656  unsigned int d_mat_row_size,
1657  unsigned int d_mat_col_size,
1658  unsigned int d_mat_internal_rows,
1659  unsigned int d_mat_internal_cols,
1660  NumericT * result,
1661  unsigned int result_row_start,
1662  unsigned int result_col_start,
1663  unsigned int result_row_inc,
1664  unsigned int result_col_inc,
1665  unsigned int result_row_size,
1666  unsigned int result_col_size,
1667  unsigned int result_internal_rows,
1668  unsigned int result_internal_cols)
1669 {
1670  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1671  unsigned int glb_sz = gridDim.x * blockDim.x;
1672 
1673  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_col_size); rc += glb_sz)
1674  {
1675  unsigned int row = rc % sp_mat_row_num;
1676  unsigned int col = rc / sp_mat_row_num;
1677 
1678  unsigned int offset = row;
1679  NumericT r = (NumericT)0;
1680 
1681  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1682  {
1683  unsigned int j = sp_mat_coords[offset];
1684  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1685 
1686  if (x != (NumericT)0)
1687  {
1688  NumericT y = d_mat[ DMatIndexT::apply(j, col,
1689  d_mat_row_start, d_mat_row_inc,
1690  d_mat_col_start, d_mat_col_inc,
1691  d_mat_internal_rows, d_mat_internal_cols) ];
1692 
1693  r += x*y;
1694  }
1695  }
1696  result [ ResultIndexT::apply(row, col,
1697  result_row_start, result_row_inc,
1698  result_col_start, result_col_inc,
1699  result_internal_rows, result_internal_cols) ] = r;
1700  }
1701 
1702 }
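// [Editorial note, not part of the ViennaCL source] Work distribution in the
// kernel above: the flat index rc enumerates result entries column-by-column,
// so for sp_mat_row_num == 4, rc == 5 decodes to row = 5 % 4 = 1 and
// col = 5 / 4 = 1. Adjacent threads therefore process adjacent sparse-matrix
// rows, which keeps the strided ELL loads (stride internal_row_num) coalesced.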
1703 
1713 template<typename NumericT, unsigned int AlignmentV>
1714 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1715  const viennacl::matrix_base<NumericT> & d_mat,
1716  viennacl::matrix_base<NumericT> & result)
1717 {
1718  if (d_mat.row_major() && result.row_major())
1719  {
1720  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1721  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1722  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1723  static_cast<unsigned int>(sp_mat.size1()),
1724  static_cast<unsigned int>(sp_mat.size2()),
1725  static_cast<unsigned int>(sp_mat.internal_size1()),
1726  static_cast<unsigned int>(sp_mat.maxnnz()),
1727  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1728  detail::cuda_arg<NumericT>(d_mat),
1729  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1730  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1731  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1732  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1733 
1734  detail::cuda_arg<NumericT>(result),
1735  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1736  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1737  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1738  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1739  );
1740  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1741  }
1742  else if (d_mat.row_major() && !result.row_major())
1743  {
1744  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1745  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1746  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1747  static_cast<unsigned int>(sp_mat.size1()),
1748  static_cast<unsigned int>(sp_mat.size2()),
1749  static_cast<unsigned int>(sp_mat.internal_size1()),
1750  static_cast<unsigned int>(sp_mat.maxnnz()),
1751  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1752  detail::cuda_arg<NumericT>(d_mat),
1753  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1754  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1755  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1756  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1757 
1758  detail::cuda_arg<NumericT>(result),
1759  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1760  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1761  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1762  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1763  );
1764  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1765  }
1766  else if (!d_mat.row_major() && result.row_major())
1767  {
1768  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1769  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1770  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1771  static_cast<unsigned int>(sp_mat.size1()),
1772  static_cast<unsigned int>(sp_mat.size2()),
1773  static_cast<unsigned int>(sp_mat.internal_size1()),
1774  static_cast<unsigned int>(sp_mat.maxnnz()),
1775  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1776  detail::cuda_arg<NumericT>(d_mat),
1777  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1778  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1779  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1780  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1781 
1782  detail::cuda_arg<NumericT>(result),
1783  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1784  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1785  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1786  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1787  );
1788  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1789  }
1790  else
1791  {
1792  ell_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1793  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1794  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1795  static_cast<unsigned int>(sp_mat.size1()),
1796  static_cast<unsigned int>(sp_mat.size2()),
1797  static_cast<unsigned int>(sp_mat.internal_size1()),
1798  static_cast<unsigned int>(sp_mat.maxnnz()),
1799  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1800  detail::cuda_arg<NumericT>(d_mat),
1801  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
1802  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
1803  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
1804  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
1805 
1806  detail::cuda_arg<NumericT>(result),
1807  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1808  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1809  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1810  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1811  );
1812  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_mat_mul_kernel");
1813  }
1814 }
1815 
1816 template<typename DMatIndexT, typename ResultIndexT, typename NumericT >
1817 __global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int * sp_mat_coords,
1818  const NumericT * sp_mat_elements,
1819  unsigned int sp_mat_row_num,
1820  unsigned int sp_mat_col_num,
1821  unsigned int sp_mat_internal_row_num,
1822  unsigned int sp_mat_items_per_row,
1823  unsigned int sp_mat_aligned_items_per_row,
1824  const NumericT * d_mat,
1825  unsigned int d_mat_row_start,
1826  unsigned int d_mat_col_start,
1827  unsigned int d_mat_row_inc,
1828  unsigned int d_mat_col_inc,
1829  unsigned int d_mat_row_size,
1830  unsigned int d_mat_col_size,
1831  unsigned int d_mat_internal_rows,
1832  unsigned int d_mat_internal_cols,
1833  NumericT * result,
1834  unsigned int result_row_start,
1835  unsigned int result_col_start,
1836  unsigned int result_row_inc,
1837  unsigned int result_col_inc,
1838  unsigned int result_row_size,
1839  unsigned int result_col_size,
1840  unsigned int result_internal_rows,
1841  unsigned int result_internal_cols)
1842 {
1843  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
1844  unsigned int glb_sz = gridDim.x * blockDim.x;
1845 
1846  for ( unsigned int rc = glb_id; rc < (sp_mat_row_num * d_mat_row_size); rc += glb_sz)
1847  {
1848  unsigned int row = rc % sp_mat_row_num;
1849  unsigned int col = rc / sp_mat_row_num;
1850 
1851  unsigned int offset = row;
1852  NumericT r = (NumericT)0;
1853 
1854  for (unsigned int k = 0; k < sp_mat_items_per_row; k++, offset += sp_mat_internal_row_num)
1855  {
1856  unsigned int j = sp_mat_coords[offset];
1857  NumericT x = static_cast<NumericT>(sp_mat_elements[offset]);
1858 
1859  if (x != (NumericT)0)
1860  {
1861  NumericT y = d_mat[ DMatIndexT::apply(col, j,
1862  d_mat_row_start, d_mat_row_inc,
1863  d_mat_col_start, d_mat_col_inc,
1864  d_mat_internal_rows, d_mat_internal_cols) ];
1865 
1866  r += x*y;
1867  }
1868  }
1869  result [ ResultIndexT::apply(row, col,
1870  result_row_start, result_row_inc,
1871  result_col_start, result_col_inc,
1872  result_internal_rows, result_internal_cols) ] = r;
1873  }
1874 
1875 }
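// [Editorial note, not part of the ViennaCL source] The only difference to
// ell_matrix_d_mat_mul_kernel is the swapped index pair in
// DMatIndexT::apply(col, j, ...): reading element (col, j) of d_mat is exactly
// element (j, col) of trans(d_mat), so the transposed factor is multiplied
// without ever materializing the transpose.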
1876 
1886 template<typename NumericT, unsigned int AlignmentV>
1887 void prod_impl(const viennacl::ell_matrix<NumericT, AlignmentV> & sp_mat,
1888  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
1889  const viennacl::matrix_base<NumericT>,
1890  viennacl::op_trans > & d_mat,
1891  viennacl::matrix_base<NumericT> & result)
1892 {
1893  if (d_mat.lhs().row_major() && result.row_major())
1894  {
1895  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1896  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1897  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1898  static_cast<unsigned int>(sp_mat.size1()),
1899  static_cast<unsigned int>(sp_mat.size2()),
1900  static_cast<unsigned int>(sp_mat.internal_size1()),
1901  static_cast<unsigned int>(sp_mat.maxnnz()),
1902  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1903 
1904  detail::cuda_arg<NumericT>(d_mat.lhs()),
1905  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1906  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1907  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1908  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1909 
1910  detail::cuda_arg<NumericT>(result),
1911  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1912  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1913  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1914  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1915  );
1916  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1917  }
1918  else if (d_mat.lhs().row_major() && !result.row_major())
1919  {
1920  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1921  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1922  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1923  static_cast<unsigned int>(sp_mat.size1()),
1924  static_cast<unsigned int>(sp_mat.size2()),
1925  static_cast<unsigned int>(sp_mat.internal_size1()),
1926  static_cast<unsigned int>(sp_mat.maxnnz()),
1927  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1928 
1929  detail::cuda_arg<NumericT>(d_mat.lhs()),
1930  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1931  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1932  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1933  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1934 
1935  detail::cuda_arg<NumericT>(result),
1936  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1937  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1938  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1939  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1940  );
1941  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1942  }
1943  else if (!d_mat.lhs().row_major() && result.row_major())
1944  {
1945  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<128, 128>>>
1946  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1947  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1948  static_cast<unsigned int>(sp_mat.size1()),
1949  static_cast<unsigned int>(sp_mat.size2()),
1950  static_cast<unsigned int>(sp_mat.internal_size1()),
1951  static_cast<unsigned int>(sp_mat.maxnnz()),
1952  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1953 
1954  detail::cuda_arg<NumericT>(d_mat.lhs()),
1955  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1956  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1957  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1958  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1959 
1960  detail::cuda_arg<NumericT>(result),
1961  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1962  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1963  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1964  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1965  );
1966  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1967  }
1968  else
1969  {
1970  ell_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<128, 128>>>
1971  (detail::cuda_arg<unsigned int>(sp_mat.handle2().cuda_handle()),
1972  detail::cuda_arg<NumericT>(sp_mat.handle().cuda_handle()),
1973  static_cast<unsigned int>(sp_mat.size1()),
1974  static_cast<unsigned int>(sp_mat.size2()),
1975  static_cast<unsigned int>(sp_mat.internal_size1()),
1976  static_cast<unsigned int>(sp_mat.maxnnz()),
1977  static_cast<unsigned int>(sp_mat.internal_maxnnz()),
1978 
1979  detail::cuda_arg<NumericT>(d_mat.lhs()),
1980  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
1981  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
1982  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
1983  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
1984 
1985  detail::cuda_arg<NumericT>(result),
1986  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
1987  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
1988  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
1989  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
1990  );
1991  VIENNACL_CUDA_LAST_ERROR_CHECK("ell_matrix_d_tr_mat_mul_kernel");
1992  }
1993 }
1994 
1995 //
1996 // SELL-C-σ Matrix
1997 //
1998 
1999 template<typename NumericT>
2000 __global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int * columns_per_block,
2001  const unsigned int * column_indices,
2002  const unsigned int * block_start,
2003  const NumericT * elements,
2004  const NumericT * x,
2005  unsigned int start_x,
2006  unsigned int inc_x,
2007  unsigned int size_x,
2008  NumericT * result,
2009  unsigned int start_result,
2010  unsigned int inc_result,
2011  unsigned int size_result)
2012 {
2013  unsigned int local_id = threadIdx.x;
2014  unsigned int local_size = blockDim.x;
2015  unsigned int num_rows = size_result;
2016 
2017  for (unsigned int block_idx = blockIdx.x; block_idx <= num_rows / local_size; block_idx += gridDim.x)
2018  {
2019  unsigned int row = block_idx * local_size + local_id;
2020  unsigned int offset = block_start[block_idx];
2021  unsigned int num_columns = columns_per_block[block_idx];
2022 
2023  NumericT sum = 0;
2024  for (unsigned int item_id = 0; item_id < num_columns; item_id++)
2025  {
2026  unsigned int index = offset + item_id * local_size + local_id;
2027  NumericT val = elements[index];
2028 
2029  sum += (val != NumericT(0)) ? (x[column_indices[index] * inc_x + start_x] * val) : NumericT(0);
2030  }
2031 
2032  if (row < num_rows)
2033  result[row * inc_result + start_result] = sum;
2034  }
2035 }
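// [Editorial sketch, not part of the ViennaCL source] SELL-C-sigma storage as
// consumed by the kernel above: rows are grouped into slices of C rows
// (C == blockDim.x == rows_per_block), and each slice is stored ELL-style with
// its own width. For C == 2 and the matrix
//   [1 2 0]
//   [0 3 0]
//   [4 0 5]      (plus one zero-padding row to fill the last slice)
// slice 0 holds rows {0, 1} with width columns_per_block[0] == 2, slice 1 holds
// rows {2, pad}, and block_start[] records where each slice begins in elements[].
// Padded entries contribute nothing because the kernel skips val == 0.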
2036 
2045 template<typename NumericT, typename IndexT>
2046 void prod_impl(const viennacl::sliced_ell_matrix<NumericT, IndexT> & mat,
2047  const viennacl::vector_base<NumericT> & vec,
2048  viennacl::vector_base<NumericT> & result)
2049 {
2050  sliced_ell_matrix_vec_mul_kernel<<<128, mat.rows_per_block()>>>(detail::cuda_arg<unsigned int>(mat.handle1().cuda_handle()),
2051  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2052  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2053  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2054  detail::cuda_arg<NumericT>(vec),
2055  static_cast<unsigned int>(vec.start()),
2056  static_cast<unsigned int>(vec.stride()),
2057  static_cast<unsigned int>(vec.size()),
2058  detail::cuda_arg<NumericT>(result),
2059  static_cast<unsigned int>(result.start()),
2060  static_cast<unsigned int>(result.stride()),
2061  static_cast<unsigned int>(result.size())
2062  );
2063  VIENNACL_CUDA_LAST_ERROR_CHECK("sliced_ell_matrix_vec_mul_kernel");
2064 }
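// [Editorial note, not part of the ViennaCL source] The launch above couples
// the CUDA block size to mat.rows_per_block() deliberately: the kernel equates
// threadIdx.x with "row within the current slice", so any other block size
// would index the wrong slice entries. A hedged usage sketch:
//
//   viennacl::sliced_ell_matrix<float> A;   // filled via viennacl::copy()
//   viennacl::vector<float> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);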
2065 
2066 
2067 //
2068 // Hybrid Matrix
2069 //
2070 
2071 
2072 template<typename NumericT>
2073 __global__ void hyb_matrix_vec_mul_kernel(const unsigned int * ell_coords,
2074  const NumericT * ell_elements,
2075  const unsigned int * csr_rows,
2076  const unsigned int * csr_cols,
2077  const NumericT * csr_elements,
2078  const NumericT * x,
2079  unsigned int start_x,
2080  unsigned int inc_x,
2081  NumericT * result,
2082  unsigned int start_result,
2083  unsigned int inc_result,
2084  unsigned int row_num,
2085  unsigned int internal_row_num,
2086  unsigned int items_per_row,
2087  unsigned int aligned_items_per_row
2088  )
2089 {
2090  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2091  unsigned int glb_sz = gridDim.x * blockDim.x;
2092 
2093  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2094  {
2095  NumericT sum = 0;
2096 
2097  unsigned int offset = row_id;
2098  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2099  {
2100  NumericT val = ell_elements[offset];
2101 
2102 
2103  if (val != NumericT(0))
2104  {
2105  int col = ell_coords[offset];
2106  sum += (x[col * inc_x + start_x] * val);
2107  }
2108  }
2109 
2110  unsigned int col_begin = csr_rows[row_id];
2111  unsigned int col_end = csr_rows[row_id + 1];
2112 
2113  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2114  sum += x[csr_cols[item_id] * inc_x + start_x] * csr_elements[item_id];
2115 
2116  result[row_id * inc_result + start_result] = sum;
2117  }
2118 }
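// [Editorial note, not part of the ViennaCL source] HYB = ELL + CSR: the
// regular part of each row (up to items_per_row entries) lives in the padded
// ELL arrays, and any overflow entries go to the CSR arrays. The kernel above
// accumulates both contributions into the same sum; e.g. a row with 7 nonzeros
// and items_per_row == 4 reads 4 entries from the ELL part and the remaining 3
// from csr_rows/csr_cols/csr_elements.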
2119 
2120 
2121 
2130 template<typename NumericT, unsigned int AlignmentV>
2131 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2132  const viennacl::vector_base<NumericT> & vec,
2133  viennacl::vector_base<NumericT> & result)
2134 {
2135  hyb_matrix_vec_mul_kernel<<<256, 128>>>(detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2136  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2137  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2138  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2139  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2140  detail::cuda_arg<NumericT>(vec),
2141  static_cast<unsigned int>(vec.start()),
2142  static_cast<unsigned int>(vec.stride()),
2143  detail::cuda_arg<NumericT>(result),
2144  static_cast<unsigned int>(result.start()),
2145  static_cast<unsigned int>(result.stride()),
2146  static_cast<unsigned int>(mat.size1()),
2147  static_cast<unsigned int>(mat.internal_size1()),
2148  static_cast<unsigned int>(mat.ell_nnz()),
2149  static_cast<unsigned int>(mat.internal_ellnnz())
2150  );
2151  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_vec_mul_kernel");
2152 }
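// [Editorial note, not part of the ViennaCL source] Handle mapping used by the
// launch above, inferred from the kernel's parameter order: handle2() ->
// ell_coords, handle() -> ell_elements, handle3() -> csr_rows, handle4() ->
// csr_cols, handle5() -> csr_elements. A hedged usage sketch:
//
//   viennacl::hyb_matrix<float> A;   // filled via viennacl::copy()
//   viennacl::vector<float> x(A.size2()), y(A.size1());
//   y = viennacl::linalg::prod(A, x);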
2153 
2154 
2155 
2156 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2157 __global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int * ell_coords,
2158  const NumericT * ell_elements,
2159  const unsigned int * csr_rows,
2160  const unsigned int * csr_cols,
2161  const NumericT * csr_elements,
2162  unsigned int row_num,
2163  unsigned int internal_row_num,
2164  unsigned int items_per_row,
2165  unsigned int aligned_items_per_row,
2166  const NumericT * d_mat,
2167  unsigned int d_mat_row_start,
2168  unsigned int d_mat_col_start,
2169  unsigned int d_mat_row_inc,
2170  unsigned int d_mat_col_inc,
2171  unsigned int d_mat_row_size,
2172  unsigned int d_mat_col_size,
2173  unsigned int d_mat_internal_rows,
2174  unsigned int d_mat_internal_cols,
2175  NumericT * result,
2176  unsigned int result_row_start,
2177  unsigned int result_col_start,
2178  unsigned int result_row_inc,
2179  unsigned int result_col_inc,
2180  unsigned int result_row_size,
2181  unsigned int result_col_size,
2182  unsigned int result_internal_rows,
2183  unsigned int result_internal_cols)
2184 {
2185  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2186  unsigned int glb_sz = gridDim.x * blockDim.x;
2187 
2188  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2189  {
2190  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2191  {
2192  NumericT sum = 0;
2193 
2194  unsigned int offset = row_id;
2195  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2196  {
2197  NumericT val = ell_elements[offset];
2198 
2199  if (val != NumericT(0))
2200  {
2201  sum += d_mat[DMatIndexT::apply(ell_coords[offset], result_col,
2202  d_mat_row_start, d_mat_row_inc,
2203  d_mat_col_start, d_mat_col_inc,
2204  d_mat_internal_rows, d_mat_internal_cols)] * val;
2205  }
2206  }
2207 
2208  unsigned int col_begin = csr_rows[row_id];
2209  unsigned int col_end = csr_rows[row_id + 1];
2210 
2211  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2212  {
2213  sum += d_mat[DMatIndexT::apply(csr_cols[item_id], result_col,
2214  d_mat_row_start, d_mat_row_inc,
2215  d_mat_col_start, d_mat_col_inc,
2216  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2217  }
2218 
2219  result[ResultIndexT::apply(row_id, result_col,
2220  result_row_start, result_row_inc,
2221  result_col_start, result_col_inc,
2222  result_internal_rows, result_internal_cols)] = sum;
2223  }
2224  }
2225 }
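// [Editorial note, not part of the ViennaCL source] Unlike
// ell_matrix_d_mat_mul_kernel, which flattens (row, col) into a single work
// index, this kernel parallelizes over sparse rows only and loops over all
// result columns serially per thread; the CSR row bounds are re-read for every
// result column, trading redundant index loads for simpler control flow.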
2226 
2227 
2228 
2237 template<typename NumericT, unsigned int AlignmentV>
2238 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2239  const viennacl::matrix_base<NumericT> & d_mat,
2240  viennacl::matrix_base<NumericT> & result)
2241 {
2242  if (d_mat.row_major() && result.row_major())
2243  {
2244  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2245  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2246  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2247  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2248  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2249  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2250  static_cast<unsigned int>(mat.size1()),
2251  static_cast<unsigned int>(mat.internal_size1()),
2252  static_cast<unsigned int>(mat.ell_nnz()),
2253  static_cast<unsigned int>(mat.internal_ellnnz()),
2254 
2255  detail::cuda_arg<NumericT>(d_mat),
2256  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2257  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2258  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2259  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2260 
2261  detail::cuda_arg<NumericT>(result),
2262  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2263  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2264  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2265  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2266  );
2267  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2268  }
2269  else if (d_mat.row_major() && !result.row_major())
2270  {
2271  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2272  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2273  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2274  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2275  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2276  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2277  static_cast<unsigned int>(mat.size1()),
2278  static_cast<unsigned int>(mat.internal_size1()),
2279  static_cast<unsigned int>(mat.ell_nnz()),
2280  static_cast<unsigned int>(mat.internal_ellnnz()),
2281 
2282  detail::cuda_arg<NumericT>(d_mat),
2283  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2284  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2285  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2286  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2287 
2288  detail::cuda_arg<NumericT>(result),
2289  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2290  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2291  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2292  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2293  );
2294  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2295  }
2296  else if (!d_mat.row_major() && result.row_major())
2297  {
2298  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2299  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2300  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2301  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2302  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2303  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2304  static_cast<unsigned int>(mat.size1()),
2305  static_cast<unsigned int>(mat.internal_size1()),
2306  static_cast<unsigned int>(mat.ell_nnz()),
2307  static_cast<unsigned int>(mat.internal_ellnnz()),
2308 
2309  detail::cuda_arg<NumericT>(d_mat),
2310  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2311  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2312  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2313  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2314 
2315  detail::cuda_arg<NumericT>(result),
2316  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2317  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2318  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2319  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2320  );
2321  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2322  }
2323  else
2324  {
2325  hyb_matrix_d_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2326  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2327  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2328  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2329  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2330  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2331  static_cast<unsigned int>(mat.size1()),
2332  static_cast<unsigned int>(mat.internal_size1()),
2333  static_cast<unsigned int>(mat.ell_nnz()),
2334  static_cast<unsigned int>(mat.internal_ellnnz()),
2335 
2336  detail::cuda_arg<NumericT>(d_mat),
2337  static_cast<unsigned int>(viennacl::traits::start1(d_mat)), static_cast<unsigned int>(viennacl::traits::start2(d_mat)),
2338  static_cast<unsigned int>(viennacl::traits::stride1(d_mat)), static_cast<unsigned int>(viennacl::traits::stride2(d_mat)),
2339  static_cast<unsigned int>(viennacl::traits::size1(d_mat)), static_cast<unsigned int>(viennacl::traits::size2(d_mat)),
2340  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat)), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat)),
2341 
2342  detail::cuda_arg<NumericT>(result),
2343  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2344  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2345  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2346  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2347  );
2348  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_mat_mul_kernel");
2349  }
2350 }
2351 
2352 
2353 
2354 template<typename DMatIndexT, typename ResultIndexT, typename NumericT>
2355 __global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int * ell_coords,
2356  const NumericT * ell_elements,
2357  const unsigned int * csr_rows,
2358  const unsigned int * csr_cols,
2359  const NumericT * csr_elements,
2360  unsigned int row_num,
2361  unsigned int internal_row_num,
2362  unsigned int items_per_row,
2363  unsigned int aligned_items_per_row,
2364  const NumericT * d_mat,
2365  unsigned int d_mat_row_start,
2366  unsigned int d_mat_col_start,
2367  unsigned int d_mat_row_inc,
2368  unsigned int d_mat_col_inc,
2369  unsigned int d_mat_row_size,
2370  unsigned int d_mat_col_size,
2371  unsigned int d_mat_internal_rows,
2372  unsigned int d_mat_internal_cols,
2373  NumericT * result,
2374  unsigned int result_row_start,
2375  unsigned int result_col_start,
2376  unsigned int result_row_inc,
2377  unsigned int result_col_inc,
2378  unsigned int result_row_size,
2379  unsigned int result_col_size,
2380  unsigned int result_internal_rows,
2381  unsigned int result_internal_cols)
2382 {
2383  unsigned int glb_id = blockDim.x * blockIdx.x + threadIdx.x;
2384  unsigned int glb_sz = gridDim.x * blockDim.x;
2385 
2386  for (unsigned int result_col = 0; result_col < result_col_size; ++result_col)
2387  {
2388  for (unsigned int row_id = glb_id; row_id < row_num; row_id += glb_sz)
2389  {
2390  NumericT sum = 0;
2391 
2392  unsigned int offset = row_id;
2393  for (unsigned int item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num)
2394  {
2395  NumericT val = ell_elements[offset];
2396 
2397  if (val != NumericT(0))
2398  {
2399  sum += d_mat[DMatIndexT::apply(result_col, ell_coords[offset],
2400  d_mat_row_start, d_mat_row_inc,
2401  d_mat_col_start, d_mat_col_inc,
2402  d_mat_internal_rows, d_mat_internal_cols)] * val;
2403  }
2404  }
2405 
2406  unsigned int col_begin = csr_rows[row_id];
2407  unsigned int col_end = csr_rows[row_id + 1];
2408 
2409  for (unsigned int item_id = col_begin; item_id < col_end; item_id++)
2410  {
2411  sum += d_mat[DMatIndexT::apply(result_col, csr_cols[item_id],
2412  d_mat_row_start, d_mat_row_inc,
2413  d_mat_col_start, d_mat_col_inc,
2414  d_mat_internal_rows, d_mat_internal_cols)] * csr_elements[item_id];
2415  }
2416 
2417  result[ResultIndexT::apply(row_id, result_col,
2418  result_row_start, result_row_inc,
2419  result_col_start, result_col_inc,
2420  result_internal_rows, result_internal_cols)] = sum;
2421  }
2422  }
2423 }
2424 
2425 
2426 
2435 template<typename NumericT, unsigned int AlignmentV>
2436 void prod_impl(const viennacl::hyb_matrix<NumericT, AlignmentV> & mat,
2437  const viennacl::matrix_expression< const viennacl::matrix_base<NumericT>,
2438  const viennacl::matrix_base<NumericT>,
2439  viennacl::op_trans > & d_mat,
2440  viennacl::matrix_base<NumericT> & result)
2441 {
2442  if (d_mat.lhs().row_major() && result.row_major())
2443  {
2444  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2445  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2446  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2447  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2448  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2449  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2450  static_cast<unsigned int>(mat.size1()),
2451  static_cast<unsigned int>(mat.internal_size1()),
2452  static_cast<unsigned int>(mat.ell_nnz()),
2453  static_cast<unsigned int>(mat.internal_ellnnz()),
2454 
2455  detail::cuda_arg<NumericT>(d_mat.lhs()),
2456  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2457  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2458  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2459  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2460 
2461  detail::cuda_arg<NumericT>(result),
2462  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2463  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2464  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2465  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2466  );
2467  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2468  }
2469  else if (d_mat.lhs().row_major() && !result.row_major())
2470  {
2471  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<row_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2472  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2473  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2474  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2475  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2476  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2477  static_cast<unsigned int>(mat.size1()),
2478  static_cast<unsigned int>(mat.internal_size1()),
2479  static_cast<unsigned int>(mat.ell_nnz()),
2480  static_cast<unsigned int>(mat.internal_ellnnz()),
2481 
2482  detail::cuda_arg<NumericT>(d_mat.lhs()),
2483  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2484  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2485  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2486  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2487 
2488  detail::cuda_arg<NumericT>(result),
2489  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2490  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2491  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2492  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2493  );
2494  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2495  }
2496  else if (!d_mat.lhs().row_major() && result.row_major())
2497  {
2498  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<row_major> ><<<256, 128>>>(
2499  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2500  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2501  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2502  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2503  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2504  static_cast<unsigned int>(mat.size1()),
2505  static_cast<unsigned int>(mat.internal_size1()),
2506  static_cast<unsigned int>(mat.ell_nnz()),
2507  static_cast<unsigned int>(mat.internal_ellnnz()),
2508 
2509  detail::cuda_arg<NumericT>(d_mat.lhs()),
2510  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2511  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2512  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2513  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2514 
2515  detail::cuda_arg<NumericT>(result),
2516  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2517  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2518  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2519  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2520  );
2521  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2522  }
2523  else
2524  {
2525  hyb_matrix_d_tr_mat_mul_kernel<mat_mult_matrix_index<column_major>, mat_mult_matrix_index<column_major> ><<<256, 128>>>(
2526  detail::cuda_arg<unsigned int>(mat.handle2().cuda_handle()),
2527  detail::cuda_arg<NumericT>(mat.handle().cuda_handle()),
2528  detail::cuda_arg<unsigned int>(mat.handle3().cuda_handle()),
2529  detail::cuda_arg<unsigned int>(mat.handle4().cuda_handle()),
2530  detail::cuda_arg<NumericT>(mat.handle5().cuda_handle()),
2531  static_cast<unsigned int>(mat.size1()),
2532  static_cast<unsigned int>(mat.internal_size1()),
2533  static_cast<unsigned int>(mat.ell_nnz()),
2534  static_cast<unsigned int>(mat.internal_ellnnz()),
2535 
2536  detail::cuda_arg<NumericT>(d_mat.lhs()),
2537  static_cast<unsigned int>(viennacl::traits::start1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::start2(d_mat.lhs())),
2538  static_cast<unsigned int>(viennacl::traits::stride1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::stride2(d_mat.lhs())),
2539  static_cast<unsigned int>(viennacl::traits::size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::size2(d_mat.lhs())),
2540  static_cast<unsigned int>(viennacl::traits::internal_size1(d_mat.lhs())), static_cast<unsigned int>(viennacl::traits::internal_size2(d_mat.lhs())),
2541 
2542  detail::cuda_arg<NumericT>(result),
2543  static_cast<unsigned int>(viennacl::traits::start1(result)), static_cast<unsigned int>(viennacl::traits::start2(result)),
2544  static_cast<unsigned int>(viennacl::traits::stride1(result)), static_cast<unsigned int>(viennacl::traits::stride2(result)),
2545  static_cast<unsigned int>(viennacl::traits::size1(result)), static_cast<unsigned int>(viennacl::traits::size2(result)),
2546  static_cast<unsigned int>(viennacl::traits::internal_size1(result)), static_cast<unsigned int>(viennacl::traits::internal_size2(result))
2547  );
2548  VIENNACL_CUDA_LAST_ERROR_CHECK("hyb_matrix_d_tr_mat_mul_kernel");
2549  }
2550 }
2551 
2552 
2553 } // namespace cuda
2554 } //namespace linalg
2555 } //namespace viennacl
2556 
2557 
2558 #endif
vcl_size_t internal_ellnnz() const
Definition: hyb_matrix.hpp:101
Sparse matrix class using a hybrid format composed of the ELL and CSR format for storing the nonzeros...
Definition: forwards.h:405
Simple enable-if variant that uses the SFINAE pattern.
Definition: enable_if.hpp:30
vcl_size_t size1() const
Definition: ell_matrix.hpp:91
void inplace_solve(matrix_base< NumericT > const &A, matrix_base< NumericT > &B, SolverTagT tag)
Direct inplace solver for triangular systems with multiple right hand sides, i.e. A \ B (MATLAB notat...
handle_type & handle2()
Definition: ell_matrix.hpp:103
result_of::size_type< matrix_base< NumericT > >::type stride1(matrix_base< NumericT > const &s)
Definition: stride.hpp:55
__global__ void hyb_matrix_vec_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
__global__ void compressed_matrix_vec_mul_adaptive_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const unsigned int *row_blocks, const NumericT *elements, unsigned int num_blocks, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
const handle_type & handle3() const
Definition: hyb_matrix.hpp:107
const vcl_size_t & size1() const
Returns the number of rows.
const handle_type & handle2() const
Returns the OpenCL handle to the column index array.
const handle_type & handle1() const
Returns the OpenCL handle to the row index array.
Various little tools used here and there in ViennaCL.
const handle_type & handle() const
Definition: hyb_matrix.hpp:105
vcl_size_t internal_size1(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per row of a ViennaCL matrix...
Definition: size.hpp:279
const handle_type & handle12() const
Returns the OpenCL handle to the (row, column) index array.
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
A tag class representing a lower triangular matrix.
Definition: forwards.h:809
__global__ void coordinate_matrix_d_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t internal_size1() const
Definition: hyb_matrix.hpp:95
vcl_size_t internal_size2(matrix_base< NumericT > const &mat)
Helper routine for obtaining the internal number of entries per column of a ViennaCL matrix...
Definition: size.hpp:287
Expression template class for representing a tree of expressions which ultimately result in a matrix...
Definition: forwards.h:340
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
This file provides the forward declarations for the main types used within ViennaCL.
vcl_size_t size2() const
Definition: ell_matrix.hpp:92
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
void row_info(compressed_matrix< NumericT, AligmentV > const &mat, vector_base< NumericT > &vec, viennacl::linalg::detail::row_info_types info_selector)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot prod of particular column of martix A with it's self starting at a certain index beg_ind...
Definition: qr.hpp:182
const handle_type & handle4() const
Definition: hyb_matrix.hpp:108
vcl_size_t rows_per_block() const
void prod_impl(const matrix_base< NumericT > &mat, bool mat_transpose, const vector_base< NumericT > &vec, vector_base< NumericT > &result)
Carries out matrix-vector multiplication.
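In user code this is reached through the generic product interface rather than called directly; a brief usage sketch:

#include "viennacl/compressed_matrix.hpp"
#include "viennacl/vector.hpp"
#include "viennacl/linalg/prod.hpp"

// Sparse matrix-vector product y = A * x; dispatches to the CUDA kernels
// in this file when the CUDA backend is active.
void spmv(viennacl::compressed_matrix<float> const & A,
          viennacl::vector<float> const & x,
          viennacl::vector<float> & y)
{
  y = viennacl::linalg::prod(A, x);
}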
__global__ void compressed_matrix_diagonal_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:245
__global__ void sliced_ell_matrix_vec_mul_kernel(const unsigned int *columns_per_block, const unsigned int *column_indices, const unsigned int *block_start, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, unsigned int size_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
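In the sliced ELLPACK layout, rows are grouped into slices; each slice stores its own padded item count (columns_per_block) starting at block_start, column-major within the slice. A simplified sketch with one CUDA block per slice and one thread per row, assuming the launch uses blockDim.x equal to the slice height (the real kernel is more general):

template<typename NumericT>
__global__ void sliced_ell_spmv_sketch(const unsigned int * columns_per_block,
                                       const unsigned int * column_indices,
                                       const unsigned int * block_start,
                                       const NumericT * elements,
                                       const NumericT * x,
                                       NumericT * result,
                                       unsigned int size_result)
{
  unsigned int local_row = threadIdx.x;                          // row within this slice
  unsigned int row = blockIdx.x * blockDim.x + local_row;

  NumericT sum = 0;
  unsigned int offset = block_start[blockIdx.x];
  for (unsigned int item = 0; item < columns_per_block[blockIdx.x]; ++item)
  {
    unsigned int idx = offset + item * blockDim.x + local_row;   // column-major within the slice
    NumericT val = elements[idx];
    if (val != NumericT(0))                                      // skip zero padding
      sum += val * x[column_indices[idx]];
  }
  if (row < size_result)
    result[row] = sum;
}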
statement sum(scalar< NumericT > const *s, vector_base< NumericT > const *x)
Definition: preset.hpp:246
vcl_size_t size1() const
Returns the number of rows of the result.
Definition: matrix.hpp:72
const handle_type & handle() const
Returns the memory handle to the matrix entry array.
const handle_type & handle1() const
Returns the memory handle to the row index array.
Helper struct for accessing an element of a row- or column-major matrix.
vcl_size_t internal_size1() const
Definition: ell_matrix.hpp:88
__global__ void csr_row_info_extractor_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, NumericT *result, unsigned int size, unsigned int option)
size_type stride() const
Returns the stride within the buffer (in multiples of sizeof(NumericT))
Definition: vector_def.hpp:124
const handle_type & handle2() const
Definition: hyb_matrix.hpp:106
const handle_type & handle() const
Returns the memory handle to the matrix entry array.
__global__ void coo_row_info_extractor(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, NumericT *result, unsigned int option)
__global__ void hyb_matrix_d_tr_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
Sparse matrix class using the ELLPACK format for storing the nonzeros.
Definition: ell_matrix.hpp:53
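To illustrate the padded ELLPACK layout (a hypothetical 3x3 example; the actual class additionally pads the row count to internal_size1()):

// A = | 4 0 1 |   longest row has 2 entries, so items_per_row = 2
//     | 0 5 0 |
//     | 2 0 6 |
//
// elements (column-major, zero-padded):  4 5 2   1 0 6
// coords   (matching column indices):    0 1 0   2 0 2   (padding entries carry value 0)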
A tag class representing an upper triangular matrix.
Definition: forwards.h:814
__global__ void compressed_compressed_matrix_vec_mul_kernel(const unsigned int *row_jumper, const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, unsigned int nonzero_rows, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
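Only the nonzero_rows rows listed in row_indices are processed here; row_jumper plays the role of the usual CSR row pointer restricted to those rows. A minimal sketch of that indirection (names follow the declaration above; strides omitted, and result is assumed to be cleared beforehand):

template<typename NumericT>
__global__ void ccs_spmv_sketch(const unsigned int * row_jumper,
                                const unsigned int * row_indices,
                                const unsigned int * column_indices,
                                const NumericT * elements,
                                unsigned int nonzero_rows,
                                const NumericT * x,
                                NumericT * result)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < nonzero_rows; i += gridDim.x * blockDim.x)
  {
    NumericT dot = 0;
    for (unsigned int j = row_jumper[i]; j < row_jumper[i + 1]; ++j)
      dot += elements[j] * x[column_indices[j]];
    result[row_indices[i]] = dot;   // scatter into the full-size result vector
  }
}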
Sparse matrix class using the sliced ELLPACK format with parameters C and σ.
Definition: forwards.h:402
__global__ void ell_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
const handle_type & handle3() const
Returns the memory handle to the row index array.
__global__ void compressed_matrix_vec_mul_kernel(const unsigned int *row_indices, const unsigned int *column_indices, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int size_result)
A sparse square matrix in compressed sparse rows format optimized for the case that only a few rows carry nonzero entries.
const handle_type & handle2() const
Returns the memory handle to the column index array.
__global__ void hyb_matrix_d_mat_mul_kernel(const unsigned int *ell_coords, const NumericT *ell_elements, const unsigned int *csr_rows, const unsigned int *csr_cols, const NumericT *csr_elements, unsigned int row_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
std::size_t vcl_size_t
Definition: forwards.h:74
vector_expression< const matrix_base< NumericT >, const int, op_matrix_diag > diag(const matrix_base< NumericT > &A, int k=0)
Definition: matrix.hpp:838
static __device__ unsigned int apply(unsigned int i, unsigned int j, unsigned int row_start, unsigned int row_inc, unsigned int col_start, unsigned int col_inc, unsigned int internal_rows, unsigned int internal_cols)
vcl_size_t maxnnz() const
Definition: ell_matrix.hpp:95
void block_inplace_solve(const matrix_expression< const compressed_matrix< NumericT, AlignmentV >, const compressed_matrix< NumericT, AlignmentV >, op_trans > &L, viennacl::backend::mem_handle const &block_indices, vcl_size_t num_blocks, vector_base< NumericT > const &, vector_base< NumericT > &vec, viennacl::linalg::unit_lower_tag)
result_of::size_type< matrix_base< NumericT > >::type stride2(matrix_base< NumericT > const &s)
Definition: stride.hpp:65
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
const handle_type & handle3() const
Returns the memory handle to the group start index array.
Implementations of direct triangular solvers for sparse matrices using CUDA.
handle_type & handle()
Definition: ell_matrix.hpp:100
__global__ void ell_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result, unsigned int row_num, unsigned int col_num, unsigned int internal_row_num, unsigned int items_per_row, unsigned int aligned_items_per_row)
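A stripped-down version of the ELL traversal, assuming the column-major padded layout illustrated after the ell_matrix entry above (vector strides omitted):

template<typename NumericT>
__global__ void ell_spmv_sketch(const unsigned int * coords,
                                const NumericT * elements,
                                const NumericT * x,
                                NumericT * result,
                                unsigned int row_num,
                                unsigned int internal_row_num,
                                unsigned int items_per_row)
{
  for (unsigned int row = blockDim.x * blockIdx.x + threadIdx.x; row < row_num; row += gridDim.x * blockDim.x)
  {
    NumericT sum = 0;
    for (unsigned int item = 0; item < items_per_row; ++item)
    {
      unsigned int offset = item * internal_row_num + row;   // consecutive threads touch consecutive addresses
      NumericT val = elements[offset];
      if (val != NumericT(0))                                // skip zero padding
        sum += val * x[coords[offset]];
    }
    result[row] = sum;
  }
}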
const handle_type & handle3() const
Returns the memory handle to the row block array.
__global__ void coordinate_matrix_vec_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *x, unsigned int start_x, unsigned int inc_x, NumericT *result, unsigned int start_result, unsigned int inc_result)
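The coordinate kernel performs a segmented reduction over groups delimited by group_boundaries. A far simpler (and generally slower) way to express COO SpMV, shown purely for illustration, issues one atomic update per nonzero; it assumes interleaved (row, column) pairs in coords, a zero-initialized result, and atomicAdd support for NumericT (compute capability 6.0+ for double):

template<typename NumericT>
__global__ void coo_spmv_atomic_sketch(const unsigned int * coords,   // interleaved (row, column) pairs
                                       const NumericT * elements,
                                       unsigned int nnz,
                                       const NumericT * x,
                                       NumericT * result)
{
  for (unsigned int i = blockDim.x * blockIdx.x + threadIdx.x; i < nnz; i += gridDim.x * blockDim.x)
    atomicAdd(result + coords[2 * i], elements[i] * x[coords[2 * i + 1]]);
}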
void clear()
Resets all entries to zero. Does not change the size of the vector.
Definition: vector.hpp:861
Common routines for CUDA execution.
const handle_type & handle() const
Returns the memory handle to the matrix entry array.
__global__ void ell_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_coords, const NumericT *sp_mat_elements, unsigned int sp_mat_row_num, unsigned int sp_mat_col_num, unsigned int sp_mat_internal_row_num, unsigned int sp_mat_items_per_row, unsigned int sp_mat_aligned_items_per_row, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations such as norms and inner products are located in linalg/vector_operations.hpp.
bool row_major() const
Definition: matrix_def.hpp:239
NumericT max(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:47
__global__ void coordinate_matrix_d_tr_mat_mul_kernel(const unsigned int *coords, const NumericT *elements, const unsigned int *group_boundaries, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
size_type size() const
Returns the length of the vector (cf. std::vector)
Definition: vector_def.hpp:118
const vcl_size_t & nnz1() const
Returns the number of nonzero entries.
vcl_size_t ell_nnz() const
Definition: hyb_matrix.hpp:102
A tag class representing a lower triangular matrix with unit diagonal.
Definition: forwards.h:819
Main abstraction class for multiple memory domains. Represents a buffer in either main RAM, an OpenCL context, or a CUDA device.
Definition: mem_handle.hpp:89
__global__ void compressed_matrix_d_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)
A tag class representing transposed matrices.
Definition: forwards.h:219
A sparse square matrix in compressed sparse rows format.
#define VIENNACL_CUDA_LAST_ERROR_CHECK(message)
Definition: common.hpp:27
A tag for column-major storage of a dense matrix.
Definition: forwards.h:320
const handle_type & handle5() const
Definition: hyb_matrix.hpp:109
vcl_size_t size1() const
Definition: hyb_matrix.hpp:98
LHS & lhs() const
Get left hand side operand.
Definition: matrix.hpp:66
size_type start() const
Returns the offset within the buffer.
Definition: vector_def.hpp:122
const vcl_size_t & blocks1() const
Returns the internal number of row blocks for an adaptive SpMV.
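The row blocks partition the matrix so that each block's nonzeros fit into shared memory, which lets the adaptive kernel pick a suitable strategy per block. A hypothetical host-side construction of such a partition (the capacity of 1024 entries and the helper name are illustrative, not the library's actual routine):

#include <vector>

std::vector<unsigned int> build_row_blocks(std::vector<unsigned int> const & row_ptr,
                                           unsigned int num_rows,
                                           unsigned int capacity = 1024)
{
  std::vector<unsigned int> row_blocks(1, 0);  // first block starts at row 0
  unsigned int nnz_in_block = 0;
  for (unsigned int row = 0; row < num_rows; ++row)
  {
    nnz_in_block += row_ptr[row + 1] - row_ptr[row];
    if (nnz_in_block >= capacity)              // close the block once shared memory would overflow
    {
      row_blocks.push_back(row + 1);
      nnz_in_block = 0;
    }
  }
  if (row_blocks.back() != num_rows)           // close the trailing block
    row_blocks.push_back(num_rows);
  return row_blocks;
}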
vcl_size_t internal_maxnnz() const
Definition: ell_matrix.hpp:94
Implementation of the ViennaCL scalar class.
const handle_type & handle() const
Returns the memory handle.
Definition: vector_def.hpp:128
A tag class representing an upper triangular matrix with unit diagonal.
Definition: forwards.h:824
A sparse square matrix, where entries are stored as triplets (i, j, val), where i and j are the row and column indices and val denotes the entry.
__global__ void compressed_matrix_d_tr_mat_mul_kernel(const unsigned int *sp_mat_row_indices, const unsigned int *sp_mat_col_indices, const NumericT *sp_mat_elements, const NumericT *d_mat, unsigned int d_mat_row_start, unsigned int d_mat_col_start, unsigned int d_mat_row_inc, unsigned int d_mat_col_inc, unsigned int d_mat_row_size, unsigned int d_mat_col_size, unsigned int d_mat_internal_rows, unsigned int d_mat_internal_cols, NumericT *result, unsigned int result_row_start, unsigned int result_col_start, unsigned int result_row_inc, unsigned int result_col_inc, unsigned int result_row_size, unsigned int result_col_size, unsigned int result_internal_rows, unsigned int result_internal_cols)