ViennaCL - The Vienna Computing Library  1.6.2
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
matrix_operations_col.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
2 #define VIENNACL_LINALG_CUDA_MATRIX_OPERATIONS_COL_HPP_
3 
4 /* =========================================================================
5  Copyright (c) 2010-2014, Institute for Microelectronics,
6  Institute for Analysis and Scientific Computing,
7  TU Wien.
8  Portions of this software are copyright by UChicago Argonne, LLC.
9 
10  -----------------
11  ViennaCL - The Vienna Computing Library
12  -----------------
13 
14  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
15 
16  (A list of authors and contributors can be found in the PDF manual)
17 
18  License: MIT (X11), see file LICENSE in the base directory
19 ============================================================================= */
20 
26 namespace viennacl
27 {
28 namespace linalg
29 {
30 namespace cuda
31 {
32 //
33 // am
34 //
35 
// alpha on CPU
//
// Kernel for A = alpha * B on column-major (sub-)matrices, with the scalar
// passed by value from the host.
//
// options2 bit 0: negate alpha
// options2 bit 1: divide by alpha instead of multiplying
//
// Launch layout (as established by the loop strides below): columns are
// distributed over the blocks of a 1D grid (stride gridDim.x), rows over the
// threads of a 1D block (stride blockDim.x).
template<typename NumericT>
__global__ void am_col_kernel(NumericT * A,
                              unsigned int A_start1, unsigned int A_start2,
                              unsigned int A_inc1, unsigned int A_inc2,
                              unsigned int A_size1, unsigned int A_size2,
                              unsigned int A_internal_size1, unsigned int A_internal_size2,

                              NumericT fac2,
                              unsigned int options2,
                              const NumericT * B,
                              unsigned int B_start1, unsigned int B_start2,
                              unsigned int B_inc1, unsigned int B_inc2,
                              unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  NumericT alpha = fac2;
  if (options2 & (1 << 0))
    alpha = -alpha;

  // FIX: row_gid/col_gid were computed with '/' and '%' swapped relative to
  // every other kernel in this file. Since the row loop strides by blockDim.x,
  // the row start must come from threadIdx.x (the '%' term), and since the
  // column loop strides by gridDim.x, the column start must come from
  // blockIdx.x (the '/' term). The swapped form leaves entries unprocessed
  // (or processes them repeatedly) whenever gridDim.x != blockDim.x.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;

  if (options2 & (1 << 1))
  {
    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] / alpha;
  }
  else
  {
    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] = B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] * alpha;
  }
}
71 
// alpha on GPU
//
// Kernel for A = alpha * B on column-major (sub-)matrices; the scalar lives
// in device memory (fac2 points to a single NumericT).
//
// options2 bit 0: negate alpha
// options2 bit 1: divide by alpha instead of multiplying
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void am_col_kernel(NumericT * A,
                              unsigned int A_start1, unsigned int A_start2,
                              unsigned int A_inc1, unsigned int A_inc2,
                              unsigned int A_size1, unsigned int A_size2,
                              unsigned int A_internal_size1, unsigned int A_internal_size2,

                              const NumericT * fac2,
                              unsigned int options2,
                              const NumericT * B,
                              unsigned int B_start1, unsigned int B_start2,
                              unsigned int B_inc1, unsigned int B_inc2,
                              unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  bool const divide = (options2 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT value = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1]
        = divide ? (value / alpha) : (value * alpha);
    }
}
107 
108 
109 //
110 // ambm
111 //
112 
// alpha and beta on CPU
//
// Kernel for A = alpha * B + beta * C on column-major (sub-)matrices; both
// scalars are passed by value from the host.
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_col_kernel(NumericT * A,
                                unsigned int A_start1, unsigned int A_start2,
                                unsigned int A_inc1, unsigned int A_inc2,
                                unsigned int A_size1, unsigned int A_size2,
                                unsigned int A_internal_size1, unsigned int A_internal_size2,

                                NumericT fac2,
                                unsigned int options2,
                                const NumericT * B,
                                unsigned int B_start1, unsigned int B_start2,
                                unsigned int B_inc1, unsigned int B_inc2,
                                unsigned int B_internal_size1, unsigned int B_internal_size2,

                                NumericT fac3,
                                unsigned int options3,
                                const NumericT * C,
                                unsigned int C_start1, unsigned int C_start2,
                                unsigned int C_inc1, unsigned int C_inc2,
                                unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] = contrib_b + contrib_c;
    }
}
185 
186 
// alpha on CPU, beta on GPU
//
// Kernel for A = alpha * B + beta * C on column-major (sub-)matrices; alpha
// arrives by value, beta is read from device memory (fac3 points to one
// NumericT).
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_col_kernel(NumericT * A,
                                unsigned int A_start1, unsigned int A_start2,
                                unsigned int A_inc1, unsigned int A_inc2,
                                unsigned int A_size1, unsigned int A_size2,
                                unsigned int A_internal_size1, unsigned int A_internal_size2,

                                NumericT fac2,
                                unsigned int options2,
                                const NumericT * B,
                                unsigned int B_start1, unsigned int B_start2,
                                unsigned int B_inc1, unsigned int B_inc2,
                                unsigned int B_internal_size1, unsigned int B_internal_size2,

                                const NumericT * fac3,
                                unsigned int options3,
                                const NumericT * C,
                                unsigned int C_start1, unsigned int C_start2,
                                unsigned int C_inc1, unsigned int C_inc2,
                                unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] = contrib_b + contrib_c;
    }
}
259 
// alpha on GPU, beta on CPU
//
// Kernel for A = alpha * B + beta * C on column-major (sub-)matrices; alpha
// is read from device memory (fac2 points to one NumericT), beta arrives by
// value.
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_col_kernel(NumericT * A,
                                unsigned int A_start1, unsigned int A_start2,
                                unsigned int A_inc1, unsigned int A_inc2,
                                unsigned int A_size1, unsigned int A_size2,
                                unsigned int A_internal_size1, unsigned int A_internal_size2,

                                const NumericT * fac2,
                                unsigned int options2,
                                const NumericT * B,
                                unsigned int B_start1, unsigned int B_start2,
                                unsigned int B_inc1, unsigned int B_inc2,
                                unsigned int B_internal_size1, unsigned int B_internal_size2,

                                NumericT fac3,
                                unsigned int options3,
                                const NumericT * C,
                                unsigned int C_start1, unsigned int C_start2,
                                unsigned int C_inc1, unsigned int C_inc2,
                                unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] = contrib_b + contrib_c;
    }
}
332 
333 
// alpha and beta on GPU
//
// Kernel for A = alpha * B + beta * C on column-major (sub-)matrices; both
// scalars are read from device memory (fac2/fac3 each point to one NumericT).
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] = contrib_b + contrib_c;
    }
}
407 
408 
409 //
410 // ambm_m
411 //
412 
// alpha and beta on CPU
//
// Kernel for the accumulating variant A += alpha * B + beta * C on
// column-major (sub-)matrices; both scalars are passed by value from the
// host.
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] += contrib_b + contrib_c;
    }
}
486 
487 
// alpha on CPU, beta on GPU
//
// Kernel for the accumulating variant A += alpha * B + beta * C on
// column-major (sub-)matrices; alpha arrives by value, beta is read from
// device memory (fac3 points to one NumericT).
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// FIX: all four inner assignments used '=' instead of '+=', which silently
// turned the accumulating ambm_m operation into a plain ambm overwrite. The
// host-scalar/host-scalar ambm_m variant above correctly uses '+='.
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          NumericT fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -fac2 : fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] += contrib_b + contrib_c;
    }
}
561 
// alpha on GPU, beta on CPU
//
// Kernel for the accumulating variant A += alpha * B + beta * C on
// column-major (sub-)matrices; alpha is read from device memory (fac2 points
// to one NumericT), beta arrives by value.
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// FIX: all four inner assignments used '=' instead of '+=', which silently
// turned the accumulating ambm_m operation into a plain ambm overwrite. The
// host-scalar/host-scalar ambm_m variant above correctly uses '+='.
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          NumericT fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -fac3 : fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] += contrib_b + contrib_c;
    }
}
635 
636 
// alpha and beta on GPU
//
// Kernel for the accumulating variant A += alpha * B + beta * C on
// column-major (sub-)matrices; both scalars are read from device memory
// (fac2/fac3 each point to one NumericT).
//
// options2/options3 bit 0: negate the respective scalar
// options2/options3 bit 1: divide by the respective scalar instead of multiplying
//
// FIX: all four inner assignments used '=' instead of '+=', which silently
// turned the accumulating ambm_m operation into a plain ambm overwrite. The
// host-scalar/host-scalar ambm_m variant above correctly uses '+='.
//
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void ambm_m_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * fac2,
          unsigned int options2,
          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * fac3,
          unsigned int options3,
          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2)
{
  NumericT alpha = (options2 & (1 << 0)) ? -(*fac2) : *fac2;
  NumericT beta  = (options3 & (1 << 0)) ? -(*fac3) : *fac3;

  bool const div_alpha = (options2 & (1 << 1)) != 0;
  bool const div_beta  = (options3 & (1 << 1)) != 0;

  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT contrib_b = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT contrib_c = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];
      contrib_b = div_alpha ? (contrib_b / alpha) : (contrib_b * alpha);
      contrib_c = div_beta  ? (contrib_c / beta)  : (contrib_c * beta);
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] += contrib_b + contrib_c;
    }
}
710 
711 
712 
713 //
714 // assignments
715 //
716 
// Kernel writing the constant value 'alpha' into every entry of a
// column-major (sub-)matrix described by start/inc/size/internal_size.
// Columns are distributed over the blocks (stride gridDim.x), rows over the
// threads of a block (stride blockDim.x).
template<typename NumericT>
__global__ void matrix_col_assign_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,
          NumericT alpha)
{
  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] = alpha;
}
733 
734 
735 template<typename NumericT>
737  NumericT * A,
738  unsigned int A_start1, unsigned int A_start2,
739  unsigned int A_inc1, unsigned int A_inc2,
740  unsigned int A_size1, unsigned int A_size2,
741  unsigned int A_internal_size1, unsigned int A_internal_size2,
742  NumericT alpha)
743 {
744  unsigned int gid = (blockIdx.x * blockDim.x + threadIdx.x);
745 
746  for (unsigned int row = gid; row < A_size1; row += blockDim.x * gridDim.x)
747  A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1] = alpha;
748 }
749 
750 //
751 // binary element-wise operations
752 //
753 
// Kernel for binary element-wise operations A = B op C on column-major
// (sub-)matrices. op_type selects the operation: 0 -> product, 1 -> division,
// 2 -> pow; any other value leaves A untouched. Columns are distributed over
// the blocks (stride gridDim.x), rows over the threads of a block (stride
// blockDim.x).
template<typename NumericT>
__global__ void element_op_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1, unsigned int A_inc2,
          unsigned int A_size1, unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1, unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1, unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  unsigned int const gid       = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int const row_start = gid % blockDim.x;
  unsigned int const col_start = gid / blockDim.x;

  for (unsigned int c = col_start; c < A_size2; c += gridDim.x)
    for (unsigned int r = row_start; r < A_size1; r += blockDim.x)
    {
      NumericT lhs = B[(r * B_inc1 + B_start1) + (c * B_inc2 + B_start2) * B_internal_size1];
      NumericT rhs = C[(r * C_inc1 + C_start1) + (c * C_inc2 + C_start2) * C_internal_size1];

      NumericT result;
      if      (op_type == 2) result = pow(lhs, rhs);
      else if (op_type == 1) result = lhs / rhs;
      else if (op_type == 0) result = lhs * rhs;
      else    continue;  // unknown op_type: no write, matching the original branch structure

      A[(r * A_inc1 + A_start1) + (c * A_inc2 + A_start2) * A_internal_size1] = result;
    }
}
802 
template<typename NumericT>
__global__ void element_op_int_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2,

          const NumericT * C,
          unsigned int C_start1, unsigned int C_start2,
          unsigned int C_inc1,   unsigned int C_inc2,
          unsigned int C_internal_size1, unsigned int C_internal_size2,

          unsigned int op_type) //0: product, 1: division, 2: pow
{
  // Elementwise A = B op C on column-major strided submatrices; integer
  // variant: only op_type 0 (product) and 1 (division) are handled here,
  // op_type 2 (pow) is a no-op in this kernel.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int global_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int row_gid = global_id % blockDim.x;   // threads stride over rows
  unsigned int col_gid = global_id / blockDim.x;   // blocks stride over columns

  if (op_type == 1)
  {
    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      {
        unsigned int idxA = (row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1;
        unsigned int idxB = (row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1;
        unsigned int idxC = (row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1;
        A[idxA] = B[idxB] / C[idxC];
      }
  }
  else if (op_type == 0)
  {
    for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
      for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      {
        unsigned int idxA = (row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1;
        unsigned int idxB = (row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1;
        unsigned int idxC = (row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1;
        A[idxA] = B[idxB] * C[idxC];
      }
  }
}
843 
844 
845 //
846 // unary element-wise operations
847 //
848 
849 // abs
template<typename NumericT>
__global__ void matrix_col_element_abs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = abs(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// abs" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = abs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
870 
871 
872 // acos
template<typename NumericT>
__global__ void matrix_col_element_acos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = acos(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// acos" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = acos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
893 
894 
895 // asin
template<typename NumericT>
__global__ void matrix_col_element_asin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = asin(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// asin" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = asin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
916 
917 
918 // atan
template<typename NumericT>
__global__ void matrix_col_element_atan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = atan(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// atan" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = atan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
939 
940 
941 // ceil
template<typename NumericT>
__global__ void matrix_col_element_ceil_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = ceil(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// ceil" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = ceil(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
962 
963 
964 // cos
template<typename NumericT>
__global__ void matrix_col_element_cos_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = cos(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// cos" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = cos(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
985 
986 
987 // cosh
template<typename NumericT>
__global__ void matrix_col_element_cosh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = cosh(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// cosh" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = cosh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1008 
1009 
1010 // exp
template<typename NumericT>
__global__ void matrix_col_element_exp_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = exp(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// exp" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = exp(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1031 
1032 
1033 // fabs
template<typename NumericT>
__global__ void matrix_col_element_fabs_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = fabs(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// fabs" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = fabs(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1054 
1055 
1056 // floor
template<typename NumericT>
__global__ void matrix_col_element_floor_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = floor(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// floor" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = floor(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1077 
1078 
1079 // log
template<typename NumericT>
__global__ void matrix_col_element_log_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = log(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// log" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = log(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1100 
1101 
1102 // log10
template<typename NumericT>
__global__ void matrix_col_element_log10_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = log10(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// log10" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = log10(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1123 
1124 
1125 // sin
template<typename NumericT>
__global__ void matrix_col_element_sin_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = sin(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// sin" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = sin(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1146 
1147 
1148 // sinh
template<typename NumericT>
__global__ void matrix_col_element_sinh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = sinh(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// sinh" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = sinh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1169 
1170 
1171 // sqrt
template<typename NumericT>
__global__ void matrix_col_element_sqrt_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = sqrt(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// sqrt" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = sqrt(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1192 
1193 
1194 // tan
template<typename NumericT>
__global__ void matrix_col_element_tan_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = tan(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// tan" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = tan(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1215 
1216 
1217 // tanh
template<typename NumericT>
__global__ void matrix_col_element_tanh_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * B,
          unsigned int B_start1, unsigned int B_start2,
          unsigned int B_inc1,   unsigned int B_inc2,
          unsigned int B_internal_size1, unsigned int B_internal_size2)
{
  // A(i,j) = tanh(B(i,j)) on column-major strided submatrices.
  // NOTE(review): kernel name reconstructed from the "// tanh" marker (signature line lost in listing) - confirm.
  // Fix: bind the threadIdx-derived index to the blockDim.x stride (rows) and the
  // blockIdx-derived index to the gridDim.x stride (cols); the original pairing was
  // swapped, which skips elements whenever gridDim.x != blockDim.x and is
  // uncoalesced for column-major storage.
  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x

  for (unsigned int col = col_gid; col < A_size2; col += gridDim.x)
    for (unsigned int row = row_gid; row < A_size1; row += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1]
        = tanh(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1]);
}
1238 
1239 
1240 
1241 //
1242 // matrix-vector product
1243 //
1244 
template<typename NumericT>
__global__ void vec_mul_col_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  // result = A * v for a column-major strided submatrix A: a grid-stride loop
  // over the rows of A, each thread accumulating one full dot product.
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int row = blockIdx.x * blockDim.x + threadIdx.x; row < A_row_size; row += step)
  {
    NumericT acc = 0;
    for (unsigned int col = 0; col < A_col_size; ++col)
      acc += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows]
           * v[v_start + v_inc * col];
    result[row * result_inc + result_start] = acc;
  }
}
1274 
1275 
template<typename NumericT>
__global__ void trans_vec_mul_col_kernel(
          const NumericT * A,
          unsigned int A_row_start,
          unsigned int A_col_start,
          unsigned int A_row_inc,
          unsigned int A_col_inc,
          unsigned int A_row_size,
          unsigned int A_col_size,
          unsigned int A_internal_rows,
          unsigned int A_internal_cols,
          const NumericT * v,
          unsigned int v_start,
          unsigned int v_inc,
          unsigned int v_size,
          NumericT * result,
          unsigned int result_start,
          unsigned int result_inc,
          unsigned int result_size)
{
  // result = A^T * v for a column-major strided submatrix A: one block per
  // output entry (one column of A), threads accumulate partial dot products
  // which are then tree-reduced in shared memory.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  // Assumes blockDim.x is a power of two and <= 128 (scratchpad size).
  __shared__ NumericT work[128];

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x
  unsigned int lid = threadIdx.x;

  for (unsigned int row = row_gid; row < A_col_size; row += gridDim.x)
  {
    NumericT partial = 0;
    for (unsigned int col = col_gid; col < A_row_size; col += blockDim.x)
      partial += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start]
               * v[v_start + v_inc * col];
    work[lid] = partial;

    // tree reduction; the barrier at the top of each pass also covers the
    // initial write of work[lid] above
    for (unsigned int stride = blockDim.x / 2; stride > 0; stride >>= 1)
    {
      __syncthreads();
      if (lid < stride)
        work[lid] += work[lid + stride];
    }

    if (lid == 0)
      result[row * result_inc + result_start] = work[0];
  }
}
1319 
1320 
1321 //
1322 // matrix-matrix products
1323 //
1324 
1325 
1326 
1327 
1328 //
1329 // scaled rank-1-update
1330 //
1331 
1332 // alpha on CPU
template<typename NumericT>
__global__ void scaled_rank1_update_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          NumericT val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  // A += alpha * vec1 * vec2^T (column-major A), with alpha passed by value.
  // options2 bit 0: negate alpha; bit 1: use 1/alpha (applied after negation).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  NumericT alpha = (options2 & (1 << 0)) ? -val : val;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x

  // blocks stride over rows, threads over columns; hoist alpha * vec1[row]
  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    NumericT scaled_v1 = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += scaled_v1 * vec2[col * inc2 + start2];
  }
}
1370 
1371 
1372 // alpha on GPU
template<typename NumericT>
__global__ void scaled_rank1_update_col_kernel(
          NumericT * A,
          unsigned int A_start1, unsigned int A_start2,
          unsigned int A_inc1,   unsigned int A_inc2,
          unsigned int A_size1,  unsigned int A_size2,
          unsigned int A_internal_size1, unsigned int A_internal_size2,

          const NumericT * val,
          unsigned int options2,

          const NumericT * vec1,
          unsigned int start1,
          unsigned int inc1,
          unsigned int size1,

          const NumericT * vec2,
          unsigned int start2,
          unsigned int inc2,
          unsigned int size2)
{
  // A += alpha * vec1 * vec2^T (column-major A), with alpha read from device
  // memory. options2 bit 0: negate alpha; bit 1: use 1/alpha (after negation).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  NumericT alpha = (options2 & (1 << 0)) ? -(*val) : *val;
  if (options2 & (1 << 1))
    alpha = NumericT(1) / alpha;

  unsigned int row_gid = (blockIdx.x * blockDim.x + threadIdx.x) / blockDim.x;  // == blockIdx.x
  unsigned int col_gid = (blockIdx.x * blockDim.x + threadIdx.x) % blockDim.x;  // == threadIdx.x

  // blocks stride over rows, threads over columns; hoist alpha * vec1[row]
  for (unsigned int row = row_gid; row < A_size1; row += gridDim.x)
  {
    NumericT scaled_v1 = alpha * vec1[row * inc1 + start1];
    for (unsigned int col = col_gid; col < A_size2; col += blockDim.x)
      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += scaled_v1 * vec2[col * inc2 + start2];
  }
}
1410 
1411 
template <typename T>
__global__ void bidiag_pack_row_major_kernel(
          T * A,
          T * D,
          T * S,
          unsigned int size1,
          unsigned int size2,
          unsigned int stride)
{
  // Packs the diagonal of A into D and the superdiagonal into S (S[0] = 0),
  // with A stored row-major and leading dimension `stride`.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int n = min(size1, size2);
  unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  if (gid == 0)
    S[0] = 0;

  for (unsigned int i = gid; i < n; i += step)
  {
    D[i] = A[i * stride + i];
    // superdiagonal entry (i, i+1); zero past the last column
    S[i + 1] = (i + 1 < size2) ? A[i * stride + (i + 1)] : 0;
  }
}
1433 
template <typename T>
__global__ void bidiag_pack_column_major_kernel(
          T * A,
          T * D,
          T * S,
          unsigned int size1,
          unsigned int size2,
          unsigned int stride)
{
  // Packs the diagonal of A into D and the superdiagonal into S (S[0] = 0),
  // with A stored column-major and leading dimension `stride`.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int n = min(size1, size2);
  unsigned int gid = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  if (gid == 0)
    S[0] = 0;

  for (unsigned int i = gid; i < n; i += step)
  {
    D[i] = A[i * stride + i];
    // superdiagonal entry (i, i+1); zero past the last column
    S[i + 1] = (i + 1 < size2) ? A[i + (i + 1) * stride] : 0;
  }
}
1455 
1456 
1457 
template<typename T>
__global__ void copy_col_row_major_kernel(
          T * A,
          T * V,
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size,
          unsigned int stride)
{
  // Copies column `col_start` of row-major A, rows [row_start, size), into V[0..).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int i = row_start + gid; i < size; i += step)
    V[i - row_start] = A[i * stride + col_start];
}
1475 
template<typename T>
__global__ void copy_col_column_major_kernel(
          T * A,
          T * V,
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size,
          unsigned int stride)
{
  // Copies column `col_start` of column-major A, rows [row_start, size), into V[0..).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int i = row_start + gid; i < size; i += step)
    V[i - row_start] = A[i + col_start * stride];
}
1493 
template<typename T>
__global__ void copy_row_row_major_kernel(
          T * A,
          T * V,
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size,
          unsigned int stride)
{
  // Copies row `row_start` of row-major A, columns [col_start, size), into V[0..).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int i = col_start + gid; i < size; i += step)
    V[i - col_start] = A[row_start * stride + i];
}
1512 
template<typename T>
__global__ void copy_row_column_major_kernel(
          T * A,
          T * V,
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size,
          unsigned int stride)
{
  // Copies row `row_start` of column-major A, columns [col_start, size), into V[0..).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int gid  = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int i = col_start + gid; i < size; i += step)
    V[i - col_start] = A[row_start + i * stride];
}
1531 
1532 
1533 
template<typename T>
__global__ void house_update_A_left_row_major_kernel(
          T * A,
          T * V, //householder vector
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size1,
          unsigned int size2,
          unsigned int stride)
{
  // Applies the Householder reflection A := (I - 2*V*V^T) * A to rows
  // [row_start, size1) and columns [col_start, size2) of row-major A.
  // Each thread owns whole columns, so no synchronisation is needed.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start; i < size2; i += step)
  {
    // dot = V . A(:, i) over the active rows
    T dot = 0;
    for (unsigned int j = row_start; j < size1; j++)
      dot += V[j] * A[j * stride + i];

    for (unsigned int j = row_start; j < size1; j++)
      A[j * stride + i] = A[j * stride + i] - (2 * V[j] * dot);
  }
}
1558 
template<typename T>
__global__ void house_update_A_left_column_major_kernel(
          T * A,
          T * V, //householder vector
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size1,
          unsigned int size2,
          unsigned int stride)
{
  // Applies the Householder reflection A := (I - 2*V*V^T) * A to rows
  // [row_start, size1) and columns [col_start, size2) of column-major A.
  // Each thread owns whole columns, so no synchronisation is needed.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  unsigned int step = gridDim.x * blockDim.x;

  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x + col_start; i < size2; i += step)
  {
    // dot = V . A(:, i) over the active rows
    T dot = 0;
    for (unsigned int j = row_start; j < size1; j++)
      dot += V[j] * A[j + i * stride];

    for (unsigned int j = row_start; j < size1; j++)
      A[j + i * stride] = A[j + i * stride] - (2 * V[j] * dot);
  }
}
1583 
1584 
1585 
1586 template<typename T>
1588  T * A,
1589  T * V, //householder vector
1590  unsigned int row_start,
1591  unsigned int col_start,
1592  unsigned int size1,
1593  unsigned int size2,
1594  unsigned int stride)
1595 {
1596  __shared__ T sums[128];
1597  T ss = 0;
1598 
1599  for(unsigned int i = blockIdx.x + row_start; i < size1; i+= gridDim.x)
1600  {
1601  ss = 0;
1602  for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1603  ss = ss + (V[j] * A[i * stride + j]);
1604  sums[threadIdx.x] = ss;
1605 
1606  __syncthreads();
1607  col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
1608  __syncthreads();
1609 
1610  T sum_Av = sums[0];
1611 
1612  for(unsigned int j = threadIdx.x; j < size2; j+= blockDim.x)
1613  A[i * stride + j] = A[i * stride + j] - (2 * V[j] * sum_Av);
1614  }
1615 }
1616 
template<typename T>
__global__ void house_update_A_right_column_major_kernel(
          T * A,
          T * V, //householder vector
          unsigned int row_start,
          unsigned int col_start,
          unsigned int size1,
          unsigned int size2,
          unsigned int stride)
{
  // Applies the Householder reflection A := A * (I - 2*V*V^T) to column-major A:
  // one block per row, threads cooperatively reduce A(i,:) . V in shared
  // memory, then update that row. Assumes blockDim.x <= 128 (scratchpad size).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  __shared__ T sums[128];

  for (unsigned int i = blockIdx.x + row_start; i < size1; i += gridDim.x)
  {
    T partial = 0;
    for (unsigned int j = threadIdx.x; j < size2; j += blockDim.x)
      partial += V[j] * A[i + j * stride];
    sums[threadIdx.x] = partial;

    __syncthreads();
    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
    __syncthreads();

    T sum_Av = sums[0];

    for (unsigned int j = threadIdx.x; j < size2; j += blockDim.x)
      A[i + j * stride] = A[i + j * stride] - (2 * V[j] * sum_Av);
  }
}
1647 
1648 
1649 
template<typename T>
__device__ void col_reduce_lcl_array(
          T * sums,
          unsigned int th_Idx,
          unsigned int bl_Dim)
{
  // In-place tree reduction of sums[0..bl_Dim) into sums[0].
  // Must be called by every thread of the block (contains __syncthreads);
  // the reduction pattern relies on bl_Dim being a power of two.
  for (unsigned int step = bl_Dim >> 1; step > 0; step >>= 1)
  {
    if (th_Idx < step)
      sums[th_Idx] += sums[th_Idx + step];
    __syncthreads();
  }
}
1666 
1667 
template <typename T>
__global__ void house_update_QL_row_major_kernel(
          T * QL,
          T * V,
          unsigned int size1,
          unsigned int strideQ)
{
  // Applies the Householder reflection QL := QL * (I - 2*V*V^T) to row-major QL:
  // one block per row, threads cooperatively reduce QL(i,:) . V in shared
  // memory, then update that row. Assumes blockDim.x <= 128 (scratchpad size).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  __shared__ T sums[128];

  for (unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
  {
    T partial = 0;
    for (unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
      partial += V[j] * QL[i * strideQ + j];
    sums[threadIdx.x] = partial;

    __syncthreads();
    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
    __syncthreads();

    T sum_Qv = sums[0];

    for (unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
      QL[i * strideQ + j] = QL[i * strideQ + j] - (2 * V[j] * sum_Qv);
  }
}
1694 
template <typename T>
__global__ void house_update_QL_column_major_kernel(
          T * QL,
          T * V,
          unsigned int size1,
          unsigned int strideQ)
{
  // Applies the Householder reflection QL := QL * (I - 2*V*V^T) to column-major
  // QL: one block per row, threads cooperatively reduce QL(i,:) . V in shared
  // memory, then update that row. Assumes blockDim.x <= 128 (scratchpad size).
  // NOTE(review): kernel name lost in the mangled listing; reconstructed - confirm.
  __shared__ T sums[128];

  for (unsigned int i = blockIdx.x; i < size1; i += gridDim.x)
  {
    T partial = 0;
    for (unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
      partial += V[j] * QL[i + j * strideQ];
    sums[threadIdx.x] = partial;

    __syncthreads();
    col_reduce_lcl_array(sums, threadIdx.x, blockDim.x);
    __syncthreads();

    T sum_Qv = sums[0];

    for (unsigned int j = threadIdx.x; j < size1; j += blockDim.x)
      QL[i + j * strideQ] = QL[i + j * strideQ] - (2 * V[j] * sum_Qv);
  }
}
1721 
1722 
template <typename T>
__global__ void givens_next_column_major_kernel(
          T * matr,
          T * cs,
          T * ss,
          unsigned int size,
          unsigned int stride,
          unsigned int start_i,
          unsigned int end_i)
{
  // Applies the chain of Givens rotations (cs[i], ss[i]), i = end_i down to
  // start_i, to matr; thread j handles column j, indexing matr as
  // matr[row + col*stride] (column-major). Coefficients are staged through
  // shared memory in blockDim.x-sized chunks; assumes blockDim.x <= 256.
  // NOTE(review): kernel name lost in the mangled listing; reconstructed from
  // the column-major indexing - confirm.
  unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
  __shared__ T cs_lcl[256];
  __shared__ T ss_lcl[256];

  // running value carried down the rotation chain, seeded from row end_i + 1
  T x = (j < size) ? matr[(end_i + 1) + j * stride] : 0;

  unsigned int elems_num = end_i - start_i + 1;
  unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;

  for (unsigned int block_id = 0; block_id < block_num; block_id++)
  {
    unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);

    // stage the next chunk of coefficients, in reverse order from end_i
    if (threadIdx.x < to)
    {
      cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
      ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
    }
    __syncthreads();
    if (j < size)
    {
      for (unsigned int ind = 0; ind < to; ind++)
      {
        unsigned int i = end_i - (ind + block_id * blockDim.x);
        T z      = matr[i + j * stride];
        T cs_val = cs_lcl[ind];
        T ss_val = ss_lcl[ind];
        matr[(i + 1) + j * stride] = x * cs_val + z * ss_val;
        x = -x * ss_val + z * cs_val;
      }
    }
    __syncthreads();
  }
  if (j < size)
    matr[(start_i) + j * stride] = x;
}
1769 
// Applies a backward sweep of Givens rotations (cs[i], ss[i]) for
// i = end_i .. start_i to adjacent rows (i, i+1) of matr, stored row-major
// (element (i,j) at matr[i * stride + j]).  Thread j owns column j and carries
// the running value x of the lower row across successive rotations; rotation
// coefficients are staged through shared memory in tiles of blockDim.x.
// Requires blockDim.x <= 256 (size of the shared staging buffers).
// NOTE(review): the signature line was lost in extraction; the name is
// inferred from the row-major indexing and the file's function index.
template<typename T>
__global__ void givens_next_row_major_kernel(
        T * matr,
        T * cs,
        T * ss,
        unsigned int size,
        unsigned int stride,
        unsigned int start_i,
        unsigned int end_i)
{
  unsigned int j = blockIdx.x * blockDim.x + threadIdx.x;
  __shared__ T cs_lcl[256];
  __shared__ T ss_lcl[256];

  // running value of row (end_i + 1) in column j, updated per rotation
  T x = (j < size) ? matr[(end_i + 1) * stride + j] : 0;

  unsigned int elems_num = end_i - start_i + 1;
  unsigned int block_num = (elems_num + blockDim.x - 1) / blockDim.x;  // ceil-div

  for (unsigned int block_id = 0; block_id < block_num; block_id++)
  {
    // number of rotations staged in this tile (partial last tile)
    unsigned int to = min(elems_num - block_id * blockDim.x, blockDim.x);

    if (threadIdx.x < to)
    {
      cs_lcl[threadIdx.x] = cs[end_i - (threadIdx.x + block_id * blockDim.x)];
      ss_lcl[threadIdx.x] = ss[end_i - (threadIdx.x + block_id * blockDim.x)];
    }
    __syncthreads();   // tile fully staged before any thread consumes it
    if (j < size)
    {
      for (unsigned int ind = 0; ind < to; ind++)
      {
        unsigned int i = end_i - (ind + block_id * blockDim.x);
        T z = matr[i * stride + j];
        T cs_val = cs_lcl[ind];
        T ss_val = ss_lcl[ind];
        matr[(i + 1) * stride + j] = x * cs_val + z * ss_val;
        x = -x * ss_val + z * cs_val;
      }
    }
    __syncthreads();   // staging buffers are reused by the next tile
  }
  if (j < size)
    matr[(start_i) * stride + j] = x;   // flush the final carried value
}
1816 
1817 
1818 
1819 
1820 #define VIENNACL_SECTION_SIZE 512
// Phase 1 of the multi-block inclusive scan: each block computes a
// Brent-Kung work-efficient inclusive prefix sum of its blockDim.x-sized
// section of X into Y, and records the section total in S[blockIdx.x] for the
// cross-block fix-up (scan_kernel_2/3/4).
// Assumes blockDim.x == VIENNACL_SECTION_SIZE -- TODO confirm at launch site.
template <typename T>
__global__ void inclusive_scan_kernel_1(
        T * X,
        unsigned int startX,
        unsigned int incX,
        unsigned int InputSize,

        T * Y,
        unsigned int startY,
        unsigned int incY,

        T * S,
        unsigned int startS,
        unsigned int incS)
{
  __shared__ T XY[VIENNACL_SECTION_SIZE];
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < InputSize)
    XY[threadIdx.x] = X[i * incX + startX];
  else
    XY[threadIdx.x] = 0;   // fix: pad the last section so the tree reduction never reads uninitialized shared memory

  // up-sweep (reduction) phase
  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
  {
    __syncthreads();
    int index = (threadIdx.x + 1) * 2 * stride - 1;
    if (index < blockDim.x)
      XY[index] += XY[index - stride];
  }

  // down-sweep (distribution) phase
  for (int stride = VIENNACL_SECTION_SIZE / 4; stride > 0; stride /= 2)
  {
    __syncthreads();
    int index = (threadIdx.x + 1) * 2 * stride - 1;
    if (index + stride < blockDim.x)
      XY[index + stride] += XY[index];
  }
  __syncthreads();
  if (i < InputSize)   // fix: the unguarded write ran past the end of Y in the last block
    Y[i * incY + startY] = XY[threadIdx.x];
  __syncthreads();
  if (threadIdx.x == 0)
  {
    // section total (valid because out-of-range slots were zero-padded)
    S[blockIdx.x * incS + startS] = XY[VIENNACL_SECTION_SIZE - 1];
  }
}
1865 
1866 
// Phase 1 of the multi-block exclusive scan: identical to
// inclusive_scan_kernel_1 except that the input is shifted right by one
// element (Y[i] accumulates X[0..i-1]; the first slot starts at zero).
// Section totals go to S[blockIdx.x] for the cross-block fix-up.
// Assumes blockDim.x == VIENNACL_SECTION_SIZE -- TODO confirm at launch site.
template <typename T>
__global__ void exclusive_scan_kernel_1(
        T * X,
        unsigned int startX,
        unsigned int incX,
        unsigned int InputSize,

        T * Y,
        unsigned int startY,
        unsigned int incY,

        T * S,
        unsigned int startS,
        unsigned int incS)
{
  __shared__ T XY[VIENNACL_SECTION_SIZE];
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < InputSize + 1 && i != 0)
    XY[threadIdx.x] = X[(i - 1) * incX + startX];   // shift right by one for the exclusive scan
  else
    XY[threadIdx.x] = 0;   // fix: covers i == 0 AND pads out-of-range slots, which were previously read uninitialized

  // up-sweep (reduction) phase
  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
  {
    __syncthreads();
    int index = (threadIdx.x + 1) * 2 * stride - 1;
    if (index < blockDim.x)
      XY[index] += XY[index - stride];
  }

  // down-sweep (distribution) phase
  for (int stride = VIENNACL_SECTION_SIZE / 4; stride > 0; stride /= 2)
  {
    __syncthreads();
    int index = (threadIdx.x + 1) * 2 * stride - 1;
    if (index + stride < blockDim.x)
      XY[index + stride] += XY[index];
  }
  __syncthreads();

  // fix: the unguarded write ran past the end of Y; assumes Y holds
  // InputSize entries -- TODO confirm against the host wrapper
  if (i < InputSize)
    Y[i * incY + startY] = XY[threadIdx.x];

  __syncthreads();
  if (threadIdx.x == 0)
  {
    // section total (valid because out-of-range slots were zero-padded)
    S[blockIdx.x * incS + startS] = XY[VIENNACL_SECTION_SIZE - 1];
  }
}
1916 
// Phase 2 of the multi-block scan: performs an in-place Brent-Kung inclusive
// scan over the per-section totals S produced by phase 1, and mirrors the
// scanned values into S_ref for use by scan_kernel_3.
// Assumes blockDim.x == VIENNACL_SECTION_SIZE -- TODO confirm at launch site.
template <typename T>
__global__ void scan_kernel_2(
        T * S_ref,
        unsigned int startS_ref,
        unsigned int incS_ref,

        T * S,
        unsigned int startS,
        unsigned int incS,
        unsigned int InputSize)

{
  __shared__ T XY[VIENNACL_SECTION_SIZE];

  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < InputSize)
    XY[threadIdx.x] = S[i * incS + startS];
  else
    XY[threadIdx.x] = 0;   // fix: pad out-of-range slots so the tree reduction never reads uninitialized shared memory

  // up-sweep (reduction) phase
  for (unsigned int stride = 1; stride < blockDim.x; stride *= 2)
  {
    __syncthreads();
    int index = (threadIdx.x + 1) * 2 * stride - 1;
    if (index < blockDim.x)
      XY[index] += XY[index - stride];
  }

  // down-sweep (distribution) phase
  for (int stride = VIENNACL_SECTION_SIZE / 4; stride > 0; stride /= 2)
  {
    __syncthreads();
    int index = (threadIdx.x + 1) * 2 * stride - 1;
    if (index + stride < blockDim.x)
      XY[index + stride] += XY[index];
  }
  __syncthreads();
  if (i < InputSize)
  {
    S[i * incS + startS] = XY[threadIdx.x];
    S_ref[i * incS_ref + startS_ref] = XY[threadIdx.x];
  }
}
1958 
// Phase 3 of the multi-block scan: adds the totals of all preceding sections
// of S_ref (recorded by scan_kernel_2) to each entry of S, turning the
// per-section partial scans into one global scan of the section sums.
// NOTE(review): there is no size parameter, so no bounds check is possible --
// assumes the launch covers exactly the valid entries of S; confirm at the
// launch site.
template <typename T>
__global__ void scan_kernel_3(
        T * S_ref,
        unsigned int startS_ref,
        unsigned int incS_ref,

        T * S,
        unsigned int startS,
        unsigned int incS)

{
  unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
  // accumulate the carry from every section preceding this block, then
  // apply it with a single global-memory update
  T carry = 0;
  for (unsigned int b = 1; b <= blockIdx.x; ++b)
    carry += S_ref[(b * blockDim.x - 1) * incS_ref + startS_ref];
  S[idx * incS + startS] += carry;
}
1976 
1977 
// Phase 4 of the multi-block scan: final fix-up.  Adds the scanned section
// total S[blockIdx.x] to every element of section blockIdx.x + 1 of Y;
// section 0 is already correct and has no block assigned to it (hence the
// "+ 1" in the index).
template <typename T>
__global__ void scan_kernel_4(
        T * S,
        unsigned int startS,
        unsigned int incS,

        T * Y,
        unsigned int startY,
        unsigned int incY,
        unsigned int OutputSize)

{
  // fix: removed a stray leading __syncthreads() -- the kernel uses no
  // shared memory, so the barrier guarded nothing
  unsigned int i = (blockIdx.x + 1) * blockDim.x + threadIdx.x;

  if (i < OutputSize)
    Y[i * incY + startY] += S[blockIdx.x * incS + startS];
}
1997 #undef VIENNACL_SECTION_SIZE
1998 
1999 
2000 
2001 } // namespace cuda
2002 } //namespace linalg
2003 } //namespace viennacl
2004 
2005 
2006 #endif
__global__ void matrix_col_element_fabs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
__global__ void house_update_A_right_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_tanh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void scan_kernel_2(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS, unsigned int InputSize)
__global__ void copy_row_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
vcl_size_t size1(MatrixType const &mat)
Generic routine for obtaining the number of rows of a matrix (ViennaCL, uBLAS, etc.)
Definition: size.hpp:216
__global__ void house_update_A_left_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void matrix_col_element_ceil_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
result_of::size_type< viennacl::vector_base< T > >::type stride(viennacl::vector_base< T > const &s)
Definition: stride.hpp:45
result_of::size_type< T >::type start1(T const &obj)
Definition: start.hpp:65
__global__ void matrix_col_element_cos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void bidiag_pack_column_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
endcode *Final step
__global__ void house_update_A_left_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
void dot_prod(MatrixT const &A, unsigned int beg_ind, NumericT &res)
Dot product of a particular column of matrix A with itself, starting at a certain index beg_ind...
Definition: qr.hpp:182
__global__ void matrix_col_element_acos_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
result_of::size_type< MatrixType >::type size2(MatrixType const &mat)
Generic routine for obtaining the number of columns of a matrix (ViennaCL, uBLAS, etc...
Definition: size.hpp:245
__global__ void matrix_col_element_sin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
__global__ void scan_kernel_4(T *S, unsigned int startS, unsigned int incS, T *Y, unsigned int startY, unsigned int incY, unsigned int OutputSize)
__global__ void house_update_QL_row_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void scaled_rank1_update_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT val, unsigned int options2, const NumericT *vec1, unsigned int start1, unsigned int inc1, unsigned int size1, const NumericT *vec2, unsigned int start2, unsigned int inc2, unsigned int size2)
__global__ void matrix_col_element_tan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void ambm_m_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, NumericT fac3, unsigned int options3, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2)
vcl_size_t size(VectorType const &vec)
Generic routine for obtaining the size of a vector (ViennaCL, uBLAS, etc.)
Definition: size.hpp:144
result_of::size_type< T >::type start2(T const &obj)
Definition: start.hpp:84
__global__ void matrix_col_element_floor_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void am_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT fac2, unsigned int options2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_row_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void matrix_col_element_sqrt_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_atan_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
#define VIENNACL_SECTION_SIZE
__global__ void matrix_col_element_abs_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_log10_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void inclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
__global__ void matrix_col_diagonal_assign_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, NumericT alpha)
vector_expression< const matrix_base< NumericT, F >, const unsigned int, op_row > row(const matrix_base< NumericT, F > &A, unsigned int i)
Definition: matrix.hpp:853
__global__ void scan_kernel_3(T *S_ref, unsigned int startS_ref, unsigned int incS_ref, T *S, unsigned int startS, unsigned int incS)
__device__ void col_reduce_lcl_array(T *sums, unsigned int th_Idx, unsigned int bl_Dim)
__global__ void matrix_col_element_cosh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void copy_col_column_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size, unsigned int stride)
__global__ void element_op_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void house_update_A_right_row_major_kernel(T *A, T *V, unsigned int row_start, unsigned int col_start, unsigned int size1, unsigned int size2, unsigned int stride)
__global__ void givens_next_row_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void matrix_col_element_log_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void element_op_int_col_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2, const NumericT *C, unsigned int C_start1, unsigned int C_start2, unsigned int C_inc1, unsigned int C_inc2, unsigned int C_internal_size1, unsigned int C_internal_size2, unsigned int op_type)
__global__ void matrix_col_element_exp_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void trans_vec_mul_col_kernel(const NumericT *A, unsigned int A_row_start, unsigned int A_col_start, unsigned int A_row_inc, unsigned int A_col_inc, unsigned int A_row_size, unsigned int A_col_size, unsigned int A_internal_rows, unsigned int A_internal_cols, const NumericT *v, unsigned int v_start, unsigned int v_inc, unsigned int v_size, NumericT *result, unsigned int result_start, unsigned int result_inc, unsigned int result_size)
__global__ void matrix_col_element_asin_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void matrix_col_element_sinh_kernel(NumericT *A, unsigned int A_start1, unsigned int A_start2, unsigned int A_inc1, unsigned int A_inc2, unsigned int A_size1, unsigned int A_size2, unsigned int A_internal_size1, unsigned int A_internal_size2, const NumericT *B, unsigned int B_start1, unsigned int B_start2, unsigned int B_inc1, unsigned int B_inc2, unsigned int B_internal_size1, unsigned int B_internal_size2)
__global__ void house_update_QL_column_major_kernel(T *QL, T *V, unsigned int size1, unsigned int strideQ)
__global__ void exclusive_scan_kernel_1(T *X, unsigned int startX, unsigned int incX, unsigned int InputSize, T *Y, unsigned int startY, unsigned int incY, T *S, unsigned int startS, unsigned int incS)
__global__ void givens_next_column_major_kernel(T *matr, T *cs, T *ss, unsigned int size, unsigned int stride, unsigned int start_i, unsigned int end_i)
__global__ void bidiag_pack_row_major_kernel(T *A, T *D, T *S, unsigned int size1, unsigned int size2, unsigned int stride)
NumericT min(std::vector< NumericT > const &v1)
Definition: maxmin.hpp:91