fft_operations.hpp
#ifndef VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_
#define VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

/** @file viennacl/linalg/cuda/fft_operations.hpp
    @brief Implementations of Fast Fourier Transformation using CUDA
*/

#include <cmath>
#include <viennacl/matrix.hpp>
#include <viennacl/vector.hpp>

#include "viennacl/forwards.h"
#include "viennacl/scalar.hpp"
#include "viennacl/tools/tools.hpp"
#include "viennacl/linalg/cuda/common.hpp"
#include "viennacl/linalg/host_based/fft_operations.hpp"

namespace viennacl
{
namespace linalg
{
namespace cuda
{
namespace detail
{
  namespace fft
  {
    const vcl_size_t MAX_LOCAL_POINTS_NUM = 512;

    inline vcl_size_t num_bits(vcl_size_t size)
    {
      vcl_size_t bits_datasize = 0;
      vcl_size_t ds = 1;

      while (ds < size)
      {
        ds = ds << 1;
        bits_datasize++;
      }

      return bits_datasize;
    }

    inline vcl_size_t next_power_2(vcl_size_t n)
    {
      n = n - 1;

      vcl_size_t power = 1;

      while (power < sizeof(vcl_size_t) * 8)
      {
        n = n | (n >> power);
        power *= 2;
      }

      return n + 1;
    }
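    // Examples: num_bits(16) == 4 (four radix-2 stages for a 16-point FFT);
    // next_power_2(5) == 8 and next_power_2(8) == 8.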

  } //namespace fft
} //namespace detail

// addition
inline __host__ __device__ float2 operator+(float2 a, float2 b)
{
  return make_float2(a.x + b.x, a.y + b.y);
}

// subtraction
inline __host__ __device__ float2 operator-(float2 a, float2 b)
{
  return make_float2(a.x - b.x, a.y - b.y);
}

// division
template<typename SCALARTYPE>
inline __device__ float2 operator/(float2 a, SCALARTYPE b)
{
  return make_float2(a.x/b, a.y/b);
}

// multiplication
inline __device__ float2 operator*(float2 in1, float2 in2)
{
  return make_float2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}

// addition
inline __host__ __device__ double2 operator+(double2 a, double2 b)
{
  return make_double2(a.x + b.x, a.y + b.y);
}

// subtraction
inline __host__ __device__ double2 operator-(double2 a, double2 b)
{
  return make_double2(a.x - b.x, a.y - b.y);
}

// division
template<typename SCALARTYPE>
inline __host__ __device__ double2 operator/(double2 a, SCALARTYPE b)
{
  return make_double2(a.x/b, a.y/b);
}

// multiplication
inline __host__ __device__ double2 operator*(double2 in1, double2 in2)
{
  return make_double2(in1.x * in2.x - in1.y * in2.y, in1.x * in2.y + in1.y * in2.x);
}
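// These overloads implement complex arithmetic on CUDA's built-in pair
// types: (x, y) encodes x + iy, so operator* above is the usual
// (a + ib)(c + id) = (ac - bd) + i(ad + bc).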

inline __device__ unsigned int get_reorder_num(unsigned int v, unsigned int bit_size)
{
  v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1);
  v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2);
  v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4);
  v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8);
  v = (v >> 16) | (v << 16);
  v = v >> (32 - bit_size);
  return v;
}
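// Example: get_reorder_num(0x3, 4) == 0xC -- the 32-bit word is bit-reversed
// and then shifted right so that only the low bit_size bits survive.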

template<typename Numeric2T, typename NumericT>
__global__ void fft_direct(
    const Numeric2T * input,
    Numeric2T * output,
    unsigned int size,
    unsigned int stride,
    unsigned int batch_num,
    NumericT sign,
    bool is_row_major)
{
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int k = blockIdx.x * blockDim.x + threadIdx.x; k < size; k += gridDim.x * blockDim.x)
    {
      Numeric2T f;
      f.x = 0;
      f.y = 0;

      for (unsigned int n = 0; n < size; n++)
      {
        Numeric2T in;
        if (!is_row_major)
          in = input[batch_id * stride + n]; // input index here
        else
          in = input[n * stride + batch_id]; // input index here

        NumericT sn, cs;
        NumericT arg = sign * 2 * NUM_PI * k / size * n;
        sn = sin(arg);
        cs = cos(arg);

        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;
        Numeric2T tmp;
        tmp.x = in.x * ex.x - in.y * ex.y;
        tmp.y = in.x * ex.y + in.y * ex.x;
        f = f + tmp;
      }

      if (!is_row_major)
        output[batch_id * stride + k] = f; // output index here
      else
        output[k * stride + batch_id] = f; // output index here
    }
  }
}
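// fft_direct evaluates the DFT by definition,
//   f[k] = sum_n input[n] * exp(sign * 2*pi*i * k*n / size),
// which costs O(size^2) per batch but places no power-of-two restriction
// on the transform size.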

/** @brief Direct 1D algorithm for computing Fourier transformation.
 *
 * Works on any size of data.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::vector<NumericT, AlignmentV> const & in,
            viennacl::vector<NumericT, AlignmentV> & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                          reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}
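// Illustrative call (hypothetical sizes; complex values are stored
// interleaved, so a vector of 2*N scalars holds N complex entries):
//   viennacl::vector<float> in(2 * 16), out(2 * 16);
//   viennacl::linalg::cuda::direct(in, out, 16, 16, 1);  // forward 1D FFT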

/** @brief Direct 2D algorithm for computing Fourier transformation.
 */
template<typename NumericT, unsigned int AlignmentV>
void direct(viennacl::matrix<NumericT, row_major, AlignmentV> const & in,
            viennacl::matrix<NumericT, row_major, AlignmentV> & out,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num,
            NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  fft_direct<<<128,128>>>(reinterpret_cast<const numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                          reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(out)),
                          static_cast<unsigned int>(size),
                          static_cast<unsigned int>(stride),
                          static_cast<unsigned int>(batch_num),
                          sign,
                          static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_direct");
}

template<typename NumericT>
__global__ void fft_reorder(NumericT * input,
                            unsigned int bit_size,
                            unsigned int size,
                            unsigned int stride,
                            unsigned int batch_num,
                            bool is_row_major)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int i = glb_id; i < size; i += glb_sz)
    {
      unsigned int v = get_reorder_num(i, bit_size);

      if (i < v)
      {
        if (!is_row_major)
        {
          NumericT tmp = input[batch_id * stride + i]; // index
          input[batch_id * stride + i] = input[batch_id * stride + v]; // index
          input[batch_id * stride + v] = tmp; // index
        }
        else
        {
          NumericT tmp = input[i * stride + batch_id];
          input[i * stride + batch_id] = input[v * stride + batch_id];
          input[v * stride + batch_id] = tmp;
        }
      }
    }
  }
}
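// The `i < v` guard swaps each index pair exactly once; indices whose bit
// pattern is its own reversal stay in place.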

/***
 * This function reorders the input data so that indices appear in
 * bit-reversal order. Such reordering must be done before an in-place
 * radix-2 FFT.
 */
template<typename NumericT, unsigned int AlignmentV>
void reorder(viennacl::vector<NumericT, AlignmentV> & in,
             vcl_size_t size, vcl_size_t stride, vcl_size_t bits_datasize, vcl_size_t batch_num,
             viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                           static_cast<unsigned int>(bits_datasize),
                           static_cast<unsigned int>(size),
                           static_cast<unsigned int>(stride),
                           static_cast<unsigned int>(batch_num),
                           static_cast<bool>(data_order));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
}

template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2_local(Numeric2T * input,
                                 unsigned int bit_size,
                                 unsigned int size,
                                 unsigned int stride,
                                 unsigned int batch_num,
                                 NumericT sign,
                                 bool is_row_major)
{
  __shared__ Numeric2T lcl_input[1024];
  unsigned int grp_id = blockIdx.x;
  unsigned int grp_num = gridDim.x;

  unsigned int lcl_sz = blockDim.x;
  unsigned int lcl_id = threadIdx.x;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num)
  {
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      unsigned int v = get_reorder_num(p, bit_size);
      if (!is_row_major)
        lcl_input[v] = input[batch_id * stride + p];
      else
        lcl_input[v] = input[p * stride + batch_id];
    }

    __syncthreads();

    // performs Cooley-Tukey FFT on the local array
    for (unsigned int s = 0; s < bit_size; s++)
    {
      unsigned int ss = 1 << s;
      NumericT cs, sn;
      for (unsigned int tid = lcl_id; tid < size; tid += lcl_sz)
      {
        unsigned int group = (tid & (ss - 1));
        unsigned int pos = ((tid >> s) << (s + 1)) + group;

        Numeric2T in1 = lcl_input[pos];
        Numeric2T in2 = lcl_input[pos + ss];

        NumericT arg = group * sign * NUM_PI / ss;

        sn = sin(arg);
        cs = cos(arg);
        Numeric2T ex;
        ex.x = cs;
        ex.y = sn;

        Numeric2T tmp;
        tmp.x = in2.x * ex.x - in2.y * ex.y;
        tmp.y = in2.x * ex.y + in2.y * ex.x;

        lcl_input[pos + ss] = in1 - tmp;
        lcl_input[pos] = in1 + tmp;
      }
      __syncthreads();
    }

    // copy local array back to global memory
    for (unsigned int p = lcl_id; p < size; p += lcl_sz)
    {
      if (!is_row_major)
        input[batch_id * stride + p] = lcl_input[p]; // index
      else
        input[p * stride + batch_id] = lcl_input[p];
    }
  }
}
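// fft_radix2_local runs an entire transform per batch inside shared memory:
// a bit-reversed load, bit_size butterfly stages separated by
// __syncthreads(), then a contiguous store back to global memory. The fixed
// lcl_input[1024] buffer is what limits this kernel to small transforms
// (see MAX_LOCAL_POINTS_NUM above).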

template<typename Numeric2T, typename NumericT>
__global__ void fft_radix2(Numeric2T * input,
                           unsigned int s,
                           unsigned int bit_size,
                           unsigned int size,
                           unsigned int stride,
                           unsigned int batch_num,
                           NumericT sign,
                           bool is_row_major)
{
  unsigned int ss = 1 << s;
  unsigned int half_size = size >> 1;

  NumericT cs, sn;
  const NumericT NUM_PI(3.14159265358979323846);

  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  for (unsigned int batch_id = 0; batch_id < batch_num; batch_id++)
  {
    for (unsigned int tid = glb_id; tid < half_size; tid += glb_sz)
    {
      unsigned int group = (tid & (ss - 1));
      unsigned int pos = ((tid >> s) << (s + 1)) + group;
      Numeric2T in1;
      Numeric2T in2;
      unsigned int offset;
      if (!is_row_major)
      {
        offset = batch_id * stride + pos;
        in1 = input[offset]; // index
        in2 = input[offset + ss]; // index
      }
      else
      {
        offset = pos * stride + batch_id;
        in1 = input[offset]; // index
        in2 = input[offset + ss * stride]; // index
      }

      NumericT arg = group * sign * NUM_PI / ss;

      sn = sin(arg);
      cs = cos(arg);

      Numeric2T ex;
      ex.x = cs;
      ex.y = sn;

      Numeric2T tmp;
      tmp.x = in2.x * ex.x - in2.y * ex.y;
      tmp.y = in2.x * ex.y + in2.y * ex.x;

      if (!is_row_major)
        input[offset + ss] = in1 - tmp; // index
      else
        input[offset + ss * stride] = in1 - tmp; // index
      input[offset] = in1 + tmp; // index
    }
  }
}
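// fft_radix2 performs a single butterfly stage s in global memory; the host
// wrappers below launch it bit_size times (after fft_reorder) whenever the
// transform is too large for the shared-memory kernel.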

/** @brief Radix-2 2D algorithm for computing Fourier transformation.
 *
 * Works only on power-of-two sizes of data. This is a Cooley-Tukey algorithm.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::matrix<NumericT, row_major, AlignmentV> & in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  static_cast<NumericT>(sign),
                                  static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");

    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              static_cast<bool>(data_order));
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}

/** @brief Radix-2 1D algorithm for computing Fourier transformation.
 *
 * Works only on power-of-two sizes of data. This is a Cooley-Tukey algorithm.
 */
template<typename NumericT, unsigned int AlignmentV>
void radix2(viennacl::vector<NumericT, AlignmentV> & in,
            vcl_size_t size, vcl_size_t stride, vcl_size_t batch_num, NumericT sign = NumericT(-1),
            viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::DATA_ORDER data_order = viennacl::linalg::host_based::detail::fft::FFT_DATA_ORDER::ROW_MAJOR)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  unsigned int bit_size = viennacl::linalg::cuda::detail::fft::num_bits(size);

  if (size <= viennacl::linalg::cuda::detail::fft::MAX_LOCAL_POINTS_NUM)
  {
    fft_radix2_local<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                                  static_cast<unsigned int>(bit_size),
                                  static_cast<unsigned int>(size),
                                  static_cast<unsigned int>(stride),
                                  static_cast<unsigned int>(batch_num),
                                  sign,
                                  static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2_local");
  }
  else
  {
    fft_reorder<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                             static_cast<unsigned int>(bit_size),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(stride),
                             static_cast<unsigned int>(batch_num),
                             static_cast<bool>(data_order));
    VIENNACL_CUDA_LAST_ERROR_CHECK("fft_reorder");
    for (vcl_size_t step = 0; step < bit_size; step++)
    {
      fft_radix2<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                              static_cast<unsigned int>(step),
                              static_cast<unsigned int>(bit_size),
                              static_cast<unsigned int>(size),
                              static_cast<unsigned int>(stride),
                              static_cast<unsigned int>(batch_num),
                              sign,
                              static_cast<bool>(data_order));
      VIENNACL_CUDA_LAST_ERROR_CHECK("fft_radix2");
    }
  }
}
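// Illustrative call (hypothetical sizes; radix-2 requires a power-of-two
// transform length):
//   viennacl::vector<float> v(2 * 64);             // 64 complex values
//   viennacl::linalg::cuda::radix2(v, 64, 64, 1);  // in-place forward FFT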

template<typename Numeric2T, typename NumericT>
__global__ void bluestein_post(Numeric2T * Z, Numeric2T * out, unsigned int size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;
  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * (-NUM_PI);

    sn_a = sin(angle);
    cs_a = cos(angle);

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = sn_a;
    out[i].x = Z[i].x * b_i.x - Z[i].y * b_i.y;
    out[i].y = Z[i].x * b_i.y + Z[i].y * b_i.x;
  }
}

template<typename Numeric2T, typename NumericT>
__global__ void bluestein_pre(Numeric2T * input, Numeric2T * A, Numeric2T * B,
                              unsigned int size, unsigned int ext_size, NumericT sign)
{
  unsigned int glb_id = blockIdx.x * blockDim.x + threadIdx.x;
  unsigned int glb_sz = gridDim.x * blockDim.x;

  unsigned int double_size = size << 1;

  NumericT sn_a, cs_a;
  const NumericT NUM_PI(3.14159265358979323846);

  for (unsigned int i = glb_id; i < size; i += glb_sz)
  {
    unsigned int rm = i * i % (double_size);
    NumericT angle = (NumericT)rm / size * NUM_PI;

    sn_a = sin(-angle);
    cs_a = cos(-angle);

    Numeric2T a_i;
    a_i.x = cs_a;
    a_i.y = sn_a;

    Numeric2T b_i;
    b_i.x = cs_a;
    b_i.y = -sn_a;

    A[i].x = input[i].x * a_i.x - input[i].y * a_i.y;
    A[i].y = input[i].x * a_i.y + input[i].y * a_i.x;
    B[i] = b_i;

    // very bad instruction, to be fixed
    if (i)
      B[ext_size - i] = b_i;
  }
}
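// bluestein_pre multiplies the input by the chirp a_n = exp(-i*pi*n^2/N)
// and stores the conjugate chirp in B, mirrored into B[ext_size - n] so the
// chirp is periodic over the padded length; bluestein_post applies the same
// chirp to the convolution result Z.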

template<typename NumericT>
__global__ void zero2(NumericT * input1, NumericT * input2, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    input1[i].x = 0;
    input1[i].y = 0;

    input2[i].x = 0;
    input2[i].y = 0;
  }
}

/** @brief Bluestein's algorithm for computing Fourier transformation.
 */
template<typename NumericT, unsigned int AlignmentV>
void bluestein(viennacl::vector<NumericT, AlignmentV> & in, viennacl::vector<NumericT, AlignmentV> & out, vcl_size_t /*batch_num*/)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  vcl_size_t size = in.size() >> 1;
  vcl_size_t ext_size = viennacl::linalg::cuda::detail::fft::next_power_2(2 * size - 1);

  viennacl::vector<NumericT, AlignmentV> A(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> B(ext_size << 1);
  viennacl::vector<NumericT, AlignmentV> Z(ext_size << 1);

  zero2<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(A)),
                     reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(B)),
                     static_cast<unsigned int>(ext_size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("zero2");

  bluestein_pre<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                             reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(A)),
                             reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(B)),
                             static_cast<unsigned int>(size),
                             static_cast<unsigned int>(ext_size),
                             NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_pre");

  viennacl::linalg::convolve_i(A, B, Z);

  bluestein_post<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(Z)),
                              reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(out)),
                              static_cast<unsigned int>(size),
                              NumericT(1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("bluestein_post");
}
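// Bluestein turns an arbitrary-length DFT into a convolution: the chirped
// input A is convolved with the conjugate chirp B at the padded power-of-two
// length ext_size via convolve_i, so the transform length need not be a
// power of two.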

template<typename NumericT>
__global__ void fft_mult_vec(const NumericT * input1,
                             const NumericT * input2,
                             NumericT * output,
                             unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    NumericT in1 = input1[i];
    NumericT in2 = input2[i];
    output[i] = in1 * in2;
  }
}

/** @brief Multiply two complex vectors and store result in output.
 */
template<typename NumericT, unsigned int AlignmentV>
void multiply_complex(viennacl::vector<NumericT, AlignmentV> const & input1,
                      viennacl::vector<NumericT, AlignmentV> const & input2,
                      viennacl::vector<NumericT, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  vcl_size_t size = input1.size() / 2;

  fft_mult_vec<<<128,128>>>(reinterpret_cast<const numeric2_type *>(detail::cuda_arg<NumericT>(input1)),
                            reinterpret_cast<const numeric2_type *>(detail::cuda_arg<NumericT>(input2)),
                            reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(output)),
                            static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_mult_vec");
}

template<typename Numeric2T, typename NumericT>
__global__ void fft_div_vec_scalar(Numeric2T * input1, unsigned int size, NumericT factor)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    input1[i] = input1[i] / factor;
}

/** @brief Normalize the vector with respect to its own size, i.e. divide each entry by the number of complex elements.
 */
template<typename NumericT, unsigned int AlignmentV>
void normalize(viennacl::vector<NumericT, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  vcl_size_t size = input.size() >> 1;
  NumericT norm_factor = static_cast<NumericT>(size);
  fft_div_vec_scalar<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(input)),
                                  static_cast<unsigned int>(size),
                                  norm_factor);
  VIENNACL_CUDA_LAST_ERROR_CHECK("fft_div_vec_scalar");
}

template<typename NumericT>
__global__ void transpose(const NumericT * input,
                          NumericT * output,
                          unsigned int row_num,
                          unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;
    output[new_pos] = input[i];
  }
}

/** @brief Transpose matrix.
 */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, row_major, AlignmentV> const & input,
               viennacl::matrix<NumericT, row_major, AlignmentV> & output)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  transpose<<<128,128>>>(reinterpret_cast<const numeric2_type *>(detail::cuda_arg<NumericT>(input)),
                         reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(output)),
                         static_cast<unsigned int>(input.internal_size1() >> 1),
                         static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose");
}

template<typename NumericT>
__global__ void transpose_inplace(
    NumericT * input,
    unsigned int row_num,
    unsigned int col_num)
{
  unsigned int size = row_num * col_num;
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    unsigned int row = i / col_num;
    unsigned int col = i - row * col_num;
    unsigned int new_pos = col * row_num + row;
    if (i < new_pos)
    {
      NumericT val = input[i];
      input[i] = input[new_pos];
      input[new_pos] = val;
    }
  }
}
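// Both transpose kernels map row-major index i = row*col_num + col to
// new_pos = col*row_num + row, one element per thread iteration (no tiling).
// The in-place variant swaps only when i < new_pos, so each off-diagonal
// pair is exchanged exactly once; note that this pairwise swap is only a
// valid transposition when row_num == col_num.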

/** @brief Inplace transpose of matrix.
 */
template<typename NumericT, unsigned int AlignmentV>
void transpose(viennacl::matrix<NumericT, row_major, AlignmentV> & input)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  transpose_inplace<<<128,128>>>(reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(input)),
                                 static_cast<unsigned int>(input.internal_size1() >> 1),
                                 static_cast<unsigned int>(input.internal_size2() >> 1));
  VIENNACL_CUDA_LAST_ERROR_CHECK("transpose_inplace");
}

template<typename RealT, typename ComplexT>
__global__ void real_to_complex(const RealT * in, ComplexT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
  {
    ComplexT val;
    val.x = in[i];
    val.y = 0;
    out[i] = val;
  }
}

/** @brief Create a complex vector from a real vector (imaginary parts set to zero).
 */
template<typename NumericT>
void real_to_complex(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  real_to_complex<<<128,128>>>(detail::cuda_arg<NumericT>(in),
                               reinterpret_cast<numeric2_type *>(detail::cuda_arg<NumericT>(out)),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("real_to_complex");
}

template<typename ComplexT, typename RealT>
__global__ void complex_to_real(const ComplexT * in, RealT * out, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < size; i += gridDim.x * blockDim.x)
    out[i] = in[i].x;
}

/** @brief Create a real vector from a complex vector (real parts only).
 */
template<typename NumericT>
void complex_to_real(viennacl::vector_base<NumericT> const & in,
                     viennacl::vector_base<NumericT> & out, vcl_size_t size)
{
  typedef typename viennacl::linalg::cuda::detail::type_to_type2<NumericT>::type numeric2_type;

  complex_to_real<<<128,128>>>(reinterpret_cast<const numeric2_type *>(detail::cuda_arg<NumericT>(in)),
                               detail::cuda_arg<NumericT>(out),
                               static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("complex_to_real");
}

template<typename NumericT>
__global__ void reverse_inplace(NumericT * vec, unsigned int size)
{
  for (unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; i < (size >> 1); i += gridDim.x * blockDim.x)
  {
    NumericT val1 = vec[i];
    NumericT val2 = vec[size - i - 1];
    vec[i] = val2;
    vec[size - i - 1] = val1;
  }
}

/** @brief Reverse the vector to the opposite order and store it in the input vector.
 */
template<typename NumericT>
void reverse(viennacl::vector_base<NumericT> & in)
{
  vcl_size_t size = in.size();
  reverse_inplace<<<128,128>>>(detail::cuda_arg<NumericT>(in), static_cast<unsigned int>(size));
  VIENNACL_CUDA_LAST_ERROR_CHECK("reverse_inplace");
}

} //namespace cuda
} //namespace linalg
} //namespace viennacl

#endif /* VIENNACL_LINALG_CUDA_FFT_OPERATIONS_HPP_ */