1 #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
2 #define VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
////////////////////////////////////////////////////////////////////////////
//! Write the compacted interval data from shared memory to global memory.
//!
//! Intervals containing exactly one eigenvalue occupy the first
//! offset_mult_lambda entries of the shared arrays; intervals containing
//! multiple eigenvalues follow. Each thread writes up to two entries
//! (tid and tid_2 = tid + blockDim.x).
//!
//! @param tid                  thread index (first half)
//! @param tid_2                thread index (second half, tid + blockDim.x)
//! @param num_threads_active   number of active (valid) interval entries
//! @param num_blocks_mult      number of blocks of multi-eigenvalue intervals
//! @param g_left_one / g_right_one / g_pos_one      output: single-eigenvalue
//!                             interval bounds and position index
//! @param g_left_mult / g_right_mult / g_left_count_mult / g_right_count_mult
//!                             output: multi-eigenvalue interval data
//! @param s_left / s_right / s_left_count / s_right_count
//!                             shared-memory interval data (already compacted)
//! @param g_blocks_mult / g_blocks_mult_sum  output: block list and per-block
//!                             eigenvalue-count scan
//! @param s_compaction_list / s_cl_helper    shared-memory block lists
//! @param offset_mult_lambda   first index of multi-eigenvalue intervals
template<typename NumericT>
__device__
void writeToGmem(const unsigned int tid, const unsigned int tid_2,
                 const unsigned int num_threads_active,
                 const unsigned int num_blocks_mult,
                 NumericT *g_left_one, NumericT *g_right_one,
                 unsigned int *g_pos_one,
                 NumericT *g_left_mult, NumericT *g_right_mult,
                 unsigned int *g_left_count_mult,
                 unsigned int *g_right_count_mult,
                 NumericT *s_left, NumericT *s_right,
                 unsigned short *s_left_count,
                 unsigned short *s_right_count,
                 unsigned int *g_blocks_mult,
                 unsigned int *g_blocks_mult_sum,
                 unsigned short *s_compaction_list,
                 unsigned short *s_cl_helper,
                 unsigned int offset_mult_lambda)
{
  if (tid < offset_mult_lambda)
  {
    // interval containing exactly one eigenvalue
    g_left_one[tid]  = s_left[tid];
    g_right_one[tid] = s_right[tid];
    // the right count can be used to order the eigenvalues without sorting
    g_pos_one[tid] = s_right_count[tid];
  }
  else
  {
    // interval containing multiple eigenvalues
    g_left_mult[tid - offset_mult_lambda]        = s_left[tid];
    g_right_mult[tid - offset_mult_lambda]       = s_right[tid];
    g_left_count_mult[tid - offset_mult_lambda]  = s_left_count[tid];
    g_right_count_mult[tid - offset_mult_lambda] = s_right_count[tid];
  }

  // second entry handled by this thread (if still within the active range)
  if (tid_2 < num_threads_active)
  {
    if (tid_2 < offset_mult_lambda)
    {
      g_left_one[tid_2]  = s_left[tid_2];
      g_right_one[tid_2] = s_right[tid_2];
      g_pos_one[tid_2] = s_right_count[tid_2];
    }
    else
    {
      g_left_mult[tid_2 - offset_mult_lambda]        = s_left[tid_2];
      g_right_mult[tid_2 - offset_mult_lambda]       = s_right[tid_2];
      g_left_count_mult[tid_2 - offset_mult_lambda]  = s_left_count[tid_2];
      g_right_count_mult[tid_2 - offset_mult_lambda] = s_right_count[tid_2];
    }
  }

  __syncthreads();

  // write out the block list; note that <= is used because there are
  // (num_blocks_mult + 1) entries (inclusive end markers of the scan)
  if (tid <= num_blocks_mult)
  {
    g_blocks_mult[tid]     = s_compaction_list[tid];
    g_blocks_mult_sum[tid] = s_cl_helper[tid];
  }

  if (tid_2 <= num_blocks_mult)
  {
    g_blocks_mult[tid_2]     = s_compaction_list[tid_2];
    g_blocks_mult_sum[tid_2] = s_cl_helper[tid_2];
  }
}
////////////////////////////////////////////////////////////////////////////
//! Perform final stream compaction before writing data to global memory.
//!
//! Compacts the interval data so that single-eigenvalue intervals occupy
//! the first offset_mult_lambda shared-memory entries and multi-eigenvalue
//! intervals follow, and records the block end markers / per-block sums.
//! The interval data for this thread's two entries is cached in the
//! reference parameters before the shared arrays are overwritten.
template<typename NumericT>
__device__
void compactStreamsFinal(const unsigned int tid, const unsigned int tid_2,
                         const unsigned int num_threads_active,
                         unsigned int &offset_mult_lambda,
                         NumericT *s_left, NumericT *s_right,
                         unsigned short *s_left_count,
                         unsigned short *s_right_count,
                         unsigned short *s_cl_one,
                         unsigned short *s_cl_mult,
                         unsigned short *s_cl_blocking,
                         unsigned short *s_cl_helper,
                         unsigned int is_one_lambda,
                         unsigned int is_one_lambda_2,
                         NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2,
                         unsigned int &left_count,
                         unsigned int &right_count,
                         unsigned int &left_count_2,
                         unsigned int &right_count_2,
                         unsigned int c_block_iend,
                         unsigned int c_sum_block,
                         unsigned int c_block_iend_2,
                         unsigned int c_sum_block_2)
{
  // cache data before performing the compaction (the shared arrays are
  // overwritten below)
  left  = s_left[tid];
  right = s_right[tid];

  if (tid_2 < num_threads_active)
  {
    left_2  = s_left[tid_2];
    right_2 = s_right[tid_2];
  }

  __syncthreads();

  // determine destination addresses: intervals with one eigenvalue are
  // compacted to the front, intervals with multiple eigenvalues are placed
  // behind them (shifted by offset_mult_lambda)
  unsigned int ptr_w = 0;
  unsigned int ptr_w_2 = 0;
  unsigned int ptr_blocking_w = 0;
  unsigned int ptr_blocking_w_2 = 0;

  ptr_w = (1 == is_one_lambda) ? s_cl_one[tid]
                               : s_cl_mult[tid] + offset_mult_lambda;

  if (0 != c_block_iend)
    ptr_blocking_w = s_cl_blocking[tid];

  if (tid_2 < num_threads_active)
  {
    ptr_w_2 = (1 == is_one_lambda_2) ? s_cl_one[tid_2]
                                     : s_cl_mult[tid_2] + offset_mult_lambda;

    if (0 != c_block_iend_2)
      ptr_blocking_w_2 = s_cl_blocking[tid_2];
  }

  __syncthreads();

  // store the cached interval data compactly in shared memory
  s_left[ptr_w] = left;
  s_right[ptr_w] = right;
  s_left_count[ptr_w] = left_count;
  s_right_count[ptr_w] = right_count;

  __syncthreads();

  // NOTE(review): the original source repeats this store (lines 204-207 of
  // the upstream file) after a second barrier -- kept as a faithful
  // reproduction of the upstream workaround; verify against upstream before
  // removing.
  s_left[ptr_w] = left;
  s_right[ptr_w] = right;
  s_left_count[ptr_w] = left_count;
  s_right_count[ptr_w] = right_count;

  if (0 != c_block_iend)
  {
    // write out block end marker and the eigenvalue sum of the block;
    // entry 0 is reserved, hence the +1 offset
    s_cl_blocking[ptr_blocking_w + 1] = c_block_iend - 1;
    s_cl_helper[ptr_blocking_w + 1] = c_sum_block;
  }

  if (tid_2 < num_threads_active)
  {
    // second entry handled by this thread
    s_left[ptr_w_2] = left_2;
    s_right[ptr_w_2] = right_2;
    s_left_count[ptr_w_2] = left_count_2;
    s_right_count[ptr_w_2] = right_count_2;

    if (0 != c_block_iend_2)
    {
      s_cl_blocking[ptr_blocking_w_2 + 1] = c_block_iend_2 - 1;
      s_cl_helper[ptr_blocking_w_2 + 1] = c_sum_block_2;
    }
  }
}
////////////////////////////////////////////////////////////////////////////
//! Compute addresses to obtain a compact list of block start addresses.
//!
//! Runs a work-efficient prefix scan over the block-boundary markers in
//! s_cl_helper (copied into s_cl_blocking first) so that each marked block
//! end obtains its compact output position. num_threads_compaction must be
//! a power of two; all threads of the block must reach the barriers.
__device__
void scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2,
                                   const unsigned int num_threads_compaction,
                                   unsigned short *s_cl_blocking,
                                   unsigned short *s_cl_helper)
{
  // prepare the second step of block generation: compaction of the block
  // list itself so it can be written out efficiently
  s_cl_blocking[tid] = s_cl_helper[tid];

  if (tid_2 < num_threads_compaction)
    s_cl_blocking[tid_2] = s_cl_helper[tid_2];

  __syncthreads();

  unsigned int offset = 1;

  // build scan tree (up-sweep): accumulate partial sums in internal nodes
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }

    offset <<= 1;
  }

  // traverse down the tree: first down to level 2 across
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);
      s_cl_blocking[bi] = s_cl_blocking[bi] + s_cl_blocking[ai];
    }
  }
}
////////////////////////////////////////////////////////////////////////////
//! Perform a scan to obtain the number of eigenvalues before a specific
//! block of intervals.
//!
//! Work-efficient tree scan over s_cl_blocking; num_threads_compaction must
//! be a power of two (>= num_threads_active), so the last valid result is
//! moved back to index (num_threads_active - 1) at the end.
__device__
void scanSumBlocks(const unsigned int tid, const unsigned int tid_2,
                   const unsigned int num_threads_active,
                   const unsigned int num_threads_compaction,
                   unsigned short *s_cl_blocking,
                   unsigned short *s_cl_helper)
{
  unsigned int offset = 1;

  // build scan tree (up-sweep)
  for (int d = num_threads_compaction >> 1; d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1)-1;
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }

    offset <<= 1;
  }

  // down-sweep: traverse the tree, first down to level 2 across
  // (note the loop bound excludes the last element -- it is fixed up below)
  for (int d = 2; d < (num_threads_compaction - 1); d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_blocking[bi] += s_cl_blocking[ai];
    }
  }

  __syncthreads();

  if (0 == tid)
  {
    // move the last scan results to the last *active* entry: the scan ran
    // over a power-of-two element count which may exceed num_threads_active
    s_cl_helper[num_threads_active - 1] =
        s_cl_helper[num_threads_compaction - 1];
    s_cl_blocking[num_threads_active - 1] =
        s_cl_blocking[num_threads_compaction - 1];
  }
}
////////////////////////////////////////////////////////////////////////////
//! Perform the initial scan for the compaction of intervals containing one
//! and multiple eigenvalues, and construct blocks of multi-eigenvalue
//! intervals that each fit into one thread block of the second kernel stage.
//!
//! s_cl_one / s_cl_mult hold 0/1 flags per interval (one eigenvalue /
//! multiple eigenvalues); s_cl_blocking accumulates eigenvalue counts of
//! candidate blocks; s_cl_helper marks finished block boundaries.
//! num_threads_compaction must be a power of two.
__device__
void scanInitial(const unsigned int tid, const unsigned int tid_2,
                 const unsigned int num_threads_active,
                 const unsigned int num_threads_compaction,
                 unsigned short *s_cl_one,
                 unsigned short *s_cl_mult,
                 unsigned short *s_cl_blocking,
                 unsigned short *s_cl_helper)
{
  unsigned int offset = 1;

  // build scan tree (up-sweep)
  for (int d = (num_threads_compaction >> 1); d > 0; d >>= 1)
  {
    __syncthreads();

    if (tid < d)
    {
      unsigned int ai = offset*(2*tid+1);
      unsigned int bi = offset*(2*tid+2)-1;

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai - 1];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai - 1];

      // s_cl_helper marks whether an element is itself a finished block
      // boundary; only merge the two child subtrees if neither is finished
      if ((s_cl_helper[ai - 1] != 1) || (s_cl_helper[bi] != 1))
      {
        if (s_cl_helper[ai - 1] == 1)
        {
          // left subtree is a finished block: the right one becomes one too
          s_cl_helper[bi] = 1;
        }
        else if (s_cl_helper[bi] == 1)
        {
          // right subtree is a finished block: close the left one as well
          s_cl_helper[ai - 1] = 1;
        }
        else
        {
          // neither subtree is closed: total eigenvalue count if merged
          unsigned int temp = s_cl_blocking[bi] + s_cl_blocking[ai - 1];

          if (temp > VIENNACL_BISECT_MAX_THREADS_BLOCK)
          {
            // merged block would exceed one thread block's capacity:
            // the two subtrees have to form separate blocks
            s_cl_helper[ai - 1] = 1;
          }
          else
          {
            // merge the two subtrees into one larger candidate block
            s_cl_blocking[bi] = temp;
            s_cl_blocking[ai - 1] = 0;
          }
        }
      }
    }

    offset <<= 1;
  }

  // down-sweep: only needed for the stream compaction scans (s_cl_one,
  // s_cl_mult), not for the block construction
  for (int d = 2; d < num_threads_compaction; d <<= 1)
  {
    offset >>= 1;
    __syncthreads();

    if (tid < (d-1))
    {
      unsigned int ai = offset*(tid+1) - 1;
      unsigned int bi = ai + (offset >> 1);

      s_cl_one[bi] = s_cl_one[bi] + s_cl_one[ai];
      s_cl_mult[bi] = s_cl_mult[bi] + s_cl_mult[ai];
    }
  }
}
////////////////////////////////////////////////////////////////////////////
//! Store the child intervals of a subdivided interval if they contain
//! eigenvalues (large-matrix variant).
//!
//! The first (left) child is stored at addr; if the second (right) child is
//! also non-empty it is flagged for the subsequent stream compaction via
//! s_compaction_list / compact_second_chunk / is_active_second.
template<typename NumericT>
__device__
void storeNonEmptyIntervalsLarge(unsigned int addr,
                                 const unsigned int num_threads_active,
                                 NumericT *s_left, NumericT *s_right,
                                 unsigned short *s_left_count,
                                 unsigned short *s_right_count,
                                 NumericT left, NumericT mid, NumericT right,
                                 const unsigned short left_count,
                                 const unsigned short mid_count,
                                 const unsigned short right_count,
                                 NumericT epsilon,
                                 unsigned int &compact_second_chunk,
                                 unsigned short *s_compaction_list,
                                 unsigned int &is_active_second)
{
  // both child intervals contain eigenvalues
  if ((left_count != mid_count) && (mid_count != right_count))
  {
    // store the left child directly; the right child has to wait for the
    // stream compaction to find its storage location
    storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                  left, mid, left_count, mid_count, epsilon);

    is_active_second = 1;
    s_compaction_list[threadIdx.x] = 1;
    compact_second_chunk = 1;
  }
  else
  {
    // only one non-empty child interval: no compaction entry needed
    is_active_second = 0;
    s_compaction_list[threadIdx.x] = 0;

    // store the one non-empty child interval in place
    if (left_count != mid_count)
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    left, mid, left_count, mid_count, epsilon);
    }
    else
    {
      storeInterval(addr, s_left, s_right, s_left_count, s_right_count,
                    mid, right, mid_count, right_count, epsilon);
    }
  }
}
// NOTE(review): the remainder of this chunk is the __global__ kernel
// bisectKernelLarge (upstream lines 533-921). The extraction that produced
// this file dropped many interior lines (the kernel signature head,
// shared-memory array declarations, most braces, and several helper-call
// heads), so only fragments survive here. The code is left byte-identical;
// the comments below annotate the surviving fragments and are hedged
// wherever the dropped context would be needed for certainty.
533 template<
typename NumericT>
// kernel parameters (fragment): input Gerschgorin interval [lg, ug] and the
// eigenvalue counts at its borders (the g_d / g_s / n / epsilon parameters
// are among the dropped lines -- see the signature index appended to this
// chunk)
537 const NumericT lg,
const NumericT ug,
538 const unsigned int lg_eig_count,
539 const unsigned int ug_eig_count,
// output: number of single-eigenvalue intervals and number of blocks of
// multi-eigenvalue intervals
541 unsigned int *g_num_one,
542 unsigned int *g_num_blocks_mult,
// output: intervals containing exactly one eigenvalue (bounds + position)
543 NumericT *g_left_one, NumericT *g_right_one,
544 unsigned int *g_pos_one,
// output: intervals containing multiple eigenvalues plus their counts
545 NumericT *g_left_mult, NumericT *g_right_mult,
546 unsigned int *g_left_count_mult,
547 unsigned int *g_right_count_mult,
// output: block list of multi-eigenvalue intervals and the scan of
// eigenvalue counts over those blocks
548 unsigned int *g_blocks_mult,
549 unsigned int *g_blocks_mult_sum
// thread index within this block
552 const unsigned int tid = threadIdx.x;
// block-shared flags: whether a second-child compaction pass is needed and
// whether all active intervals have converged
570 __shared__
unsigned int compact_second_chunk;
572 __shared__
unsigned int all_threads_converged;
// number of currently active intervals/threads, and the power-of-two padded
// count used by the scan routines
575 __shared__
unsigned int num_threads_active;
578 __shared__
unsigned int num_threads_compaction;
// exclusive view of the compaction list (shifted by one element)
581 unsigned short *s_compaction_list_exc = s_compaction_list + 1;
// per-thread interval state: bounds and eigenvalue counts at the bounds
586 NumericT left = 0.0f;
587 NumericT right = 0.0f;
588 unsigned int left_count = 0;
589 unsigned int right_count = 0;
// eigenvalue count at the midpoint of the current interval
593 unsigned int mid_count = 0;
// flag: this thread owns an active second child interval
595 unsigned int is_active_second = 0;
// initialize shared compaction list and count arrays
598 s_compaction_list[tid] = 0;
601 s_left_count[tid] = 0;
602 s_right_count[tid] = 0;
// seed the root interval [lg, ug] (presumably inside an `if (0 == tid)`
// guard in the dropped lines -- TODO confirm)
612 s_left_count[0] = lg_eig_count;
613 s_right_count[0] = ug_eig_count;
615 compact_second_chunk = 0;
616 num_threads_active = 1;
618 num_threads_compaction = 1;
620 all_threads_converged = 1;
// main bisection loop: repeatedly subdivide all active intervals
// (fixed iteration cap of 15 in this fragment)
629 for(
unsigned int i = 0; i < 15; ++i )
631 s_compaction_list[tid] = 0;
// subdivide the active interval, updating mid and the shared convergence
// flag (call head dropped; presumably subdivideActiveInterval -- see index)
636 left, right, left_count, right_count,
637 mid, all_threads_converged);
// leave the bisection loop once every interval has converged
642 if (1 == all_threads_converged)
// active threads count eigenvalues below mid and store the non-empty child
// intervals (argument fragments of the dropped helper calls follow)
670 if (tid < num_threads_active)
679 s_left_count, s_right_count,
681 left_count, mid_count, right_count,
682 epsilon, compact_second_chunk,
683 s_compaction_list_exc,
// converged / inactive case: keep current counts, no second child interval
694 s_left_count[tid] = left_count;
695 s_right_count[tid] = right_count;
697 is_active_second = 0;
// if any thread produced a second child, build compaction indices and
// compact the second-child intervals into the active range
707 if (compact_second_chunk > 0)
715 if (compact_second_chunk > 0)
718 mid, right, mid_count, right_count,
719 s_compaction_list, num_threads_active,
// update the active-interval count and the padded scan size
730 num_threads_active += s_compaction_list[num_threads_active];
731 num_threads_compaction =
ceilPow2(num_threads_active);
733 compact_second_chunk = 0;
734 all_threads_converged = 1;
// stop subdividing once more intervals exist than threads in the block
739 if (num_threads_compaction > blockDim.x)
// ---- post-processing: split intervals into "one eigenvalue" and
// ---- "multiple eigenvalues" streams ----
754 unsigned int left_count_2;
755 unsigned int right_count_2;
757 unsigned int tid_2 = tid + blockDim.x;
// reload this thread's interval counts from shared memory
761 left_count = s_left_count[tid];
762 right_count = s_right_count[tid];
765 if (tid_2 < num_threads_active)
767 left_count_2 = s_left_count[tid_2];
768 right_count_2 = s_right_count[tid_2];
// reuse the shared count arrays as scan scratch space (shifted by one
// element so the scans become exclusive)
773 unsigned short *s_cl_one = s_left_count + 1;
774 unsigned short *s_cl_mult = s_right_count + 1;
778 unsigned short *s_cl_blocking = s_compaction_list_exc;
786 s_right_count[0] = 0;
// per-thread flags: interval contains exactly one eigenvalue
793 unsigned int is_one_lambda = 0;
794 unsigned int is_one_lambda_2 = 0;
// multiplicity = number of eigenvalues in this thread's interval
797 unsigned int multiplicity = right_count - left_count;
798 is_one_lambda = (1 == multiplicity);
800 s_cl_one[tid] = is_one_lambda;
801 s_cl_mult[tid] = (! is_one_lambda);
// blocking list counts eigenvalues only for multi-eigenvalue intervals
804 s_cl_blocking[tid] = (1 == is_one_lambda) ? 0 : multiplicity;
805 s_cl_helper[tid] = 0;
807 if (tid_2 < num_threads_active)
810 unsigned int multiplicity = right_count_2 - left_count_2;
811 is_one_lambda_2 = (1 == multiplicity);
813 s_cl_one[tid_2] = is_one_lambda_2;
814 s_cl_mult[tid_2] = (! is_one_lambda_2);
817 s_cl_blocking[tid_2] = (1 == is_one_lambda_2) ? 0 : multiplicity;
818 s_cl_helper[tid_2] = 0;
// inactive second-half entries zero their scan inputs
824 s_cl_blocking[tid_2] = 0;
825 s_cl_helper[tid_2] = 0;
// scan the one/mult flags and build blocks of multi-eigenvalue intervals
// that each fit one thread block
829 scanInitial(tid, tid_2, num_threads_active, num_threads_compaction,
830 s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper);
// scan of eigenvalue counts per block (call head dropped; presumably
// scanSumBlocks -- see index)
835 num_threads_compaction, s_cl_blocking, s_cl_helper);
// per-thread block-boundary info: end index and eigenvalue sum of the block
// this thread terminates (if any)
840 unsigned int c_block_iend = 0;
841 unsigned int c_block_iend_2 = 0;
842 unsigned int c_sum_block = 0;
843 unsigned int c_sum_block_2 = 0;
// threads flagged in s_cl_helper terminate a block
850 if (1 == s_cl_helper[tid])
853 c_block_iend = s_cl_mult[tid] + 1;
854 c_sum_block = s_cl_blocking[tid];
857 if (1 == s_cl_helper[tid_2])
860 c_block_iend_2 = s_cl_mult[tid_2] + 1;
861 c_sum_block_2 = s_cl_blocking[tid_2];
// compact the block start addresses (call head dropped; presumably
// scanCompactBlocksStartAddress -- see index)
865 s_cl_blocking, s_cl_helper);
// totals published to all threads via shared memory
872 __shared__
unsigned int num_blocks_mult;
873 __shared__
unsigned int num_mult;
874 __shared__
unsigned int offset_mult_lambda;
// the last scan entries hold the totals (presumably written by thread 0
// only -- TODO confirm the dropped guard)
879 num_blocks_mult = s_cl_blocking[num_threads_active - 1];
880 offset_mult_lambda = s_cl_one[num_threads_active - 1];
881 num_mult = s_cl_mult[num_threads_active - 1];
883 *g_num_one = offset_mult_lambda;
884 *g_num_blocks_mult = num_blocks_mult;
// final compaction of the interval streams (call head dropped; presumably
// compactStreamsFinal -- see index)
889 NumericT left_2, right_2;
896 s_left, s_right, s_left_count, s_right_count,
897 s_cl_one, s_cl_mult, s_cl_blocking, s_cl_helper,
898 is_one_lambda, is_one_lambda_2,
899 left, right, left_2, right_2,
900 left_count, right_count, left_count_2, right_count_2,
901 c_block_iend, c_sum_block, c_block_iend_2, c_sum_block_2
// append the total multiplicity count behind the last block entry
909 s_cl_blocking[num_blocks_mult] = num_mult;
// write all results to global memory
916 writeToGmem(tid, tid_2, num_threads_active, num_blocks_mult,
917 g_left_one, g_right_one, g_pos_one,
918 g_left_mult, g_right_mult, g_left_count_mult, g_right_count_mult,
919 s_left, s_right, s_left_count, s_right_count,
920 g_blocks_mult, g_blocks_mult_sum,
921 s_compaction_list, s_cl_helper, offset_mult_lambda);
927 #endif // #ifndef VIENNACL_LINALG_CUDA_BISECT_KERNEL_LARGE_HPP_
__device__ void writeToGmem(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_blocks_mult, NumericT *g_left_one, NumericT *g_right_one, unsigned int *g_pos_one, NumericT *g_left_mult, NumericT *g_right_mult, unsigned int *g_left_count_mult, unsigned int *g_right_count_mult, NumericT *s_left, NumericT *s_right, unsigned short *s_left_count, unsigned short *s_right_count, unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum, unsigned short *s_compaction_list, unsigned short *s_cl_helper, unsigned int offset_mult_lambda)
Write data to global memory.
__device__ void createIndicesCompaction(T *s_compaction_list_exc, unsigned int num_threads_compaction)
#define VIENNACL_BISECT_MAX_THREADS_BLOCK
__device__ void scanInitial(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, unsigned short *s_cl_one, unsigned short *s_cl_mult, unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
__device__ int ceilPow2(int n)
__global__ void bisectKernelLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT lg, const NumericT ug, const unsigned int lg_eig_count, const unsigned int ug_eig_count, NumericT epsilon, unsigned int *g_num_one, unsigned int *g_num_blocks_mult, NumericT *g_left_one, NumericT *g_right_one, unsigned int *g_pos_one, NumericT *g_left_mult, NumericT *g_right_mult, unsigned int *g_left_count_mult, unsigned int *g_right_count_mult, unsigned int *g_blocks_mult, unsigned int *g_blocks_mult_sum)
Bisection to find eigenvalues of a real, symmetric, and tridiagonal matrix g_d diagonal elements in g...
__device__ void scanCompactBlocksStartAddress(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_compaction, unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
Compute addresses to obtain compact list of block start addresses.
Global configuration parameters.
__device__ void storeNonEmptyIntervalsLarge(unsigned int addr, const unsigned int num_threads_active, NumericT *s_left, NumericT *s_right, unsigned short *s_left_count, unsigned short *s_right_count, NumericT left, NumericT mid, NumericT right, const unsigned short left_count, const unsigned short mid_count, const unsigned short right_count, NumericT epsilon, unsigned int &compact_second_chunk, unsigned short *s_compaction_list, unsigned int &is_active_second)
__device__ void compactStreamsFinal(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, unsigned int &offset_mult_lambda, NumericT *s_left, NumericT *s_right, unsigned short *s_left_count, unsigned short *s_right_count, unsigned short *s_cl_one, unsigned short *s_cl_mult, unsigned short *s_cl_blocking, unsigned short *s_cl_helper, unsigned int is_one_lambda, unsigned int is_one_lambda_2, NumericT &left, NumericT &right, NumericT &left_2, NumericT &right_2, unsigned int &left_count, unsigned int &right_count, unsigned int &left_count_2, unsigned int &right_count_2, unsigned int c_block_iend, unsigned int c_sum_block, unsigned int c_block_iend_2, unsigned int c_sum_block_2)
Perform final stream compaction before writing data to global memory.
__device__ void compactIntervals(NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT mid, NumericT right, unsigned int mid_count, unsigned int right_count, T *s_compaction_list, unsigned int num_threads_active, unsigned int is_active_second)
Perform stream compaction for second child intervals.
__device__ void storeInterval(unsigned int addr, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, NumericT left, NumericT right, S left_count, S right_count, NumericT precision)
__device__ void scanSumBlocks(const unsigned int tid, const unsigned int tid_2, const unsigned int num_threads_active, const unsigned int num_threads_compaction, unsigned short *s_cl_blocking, unsigned short *s_cl_helper)
Perform scan to obtain number of eigenvalues before a specific block.
__device__ void subdivideActiveInterval(const unsigned int tid, NumericT *s_left, NumericT *s_right, T *s_left_count, T *s_right_count, const unsigned int num_threads_active, NumericT &left, NumericT &right, unsigned int &left_count, unsigned int &right_count, NumericT &mid, unsigned int &all_threads_converged)
Subdivide interval if active and not already converged.
__device__ unsigned int computeNumSmallerEigenvalsLarge(const NumericT *g_d, const NumericT *g_s, const unsigned int n, const NumericT x, const unsigned int tid, const unsigned int num_intervals_active, NumericT *s_d, NumericT *s_s, unsigned int converged)