1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_ITERATIVE_HPP
// Appends the OpenCL source of the `cg_vector_update` kernel to `source`.
// `numeric_string` is spliced into the text wherever the kernel needs its
// scalar type name.
//
// Emitted kernel, for every index i < size visited by a work item:
//   result[i] += alpha * p[i];
//   r[i]      -= alpha * Ap[i];
//   p[i]       = r[i] + beta * p[i];      (uses the updated r[i])
// Each work item accumulates r[i] * r[i]; the work-group sum of those
// contributions is stored in inner_prod_buffer[get_group_id(0)].
31 template<
typename StringT>
// --- kernel signature ---
34 source.append(
"__kernel void cg_vector_update( \n");
35 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
36 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
37 source.append(
" __global "); source.append(numeric_string); source.append(
" * p, \n");
38 source.append(
" __global "); source.append(numeric_string); source.append(
" * r, \n");
39 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
40 source.append(
" "); source.append(numeric_string); source.append(
" beta, \n");
41 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
42 source.append(
" unsigned int size, \n");
43 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
44 source.append(
"{ \n");
// --- vector updates; each work item strides over i by get_global_size(0) ---
45 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
46 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
47 source.append(
" "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
48 source.append(
" "); source.append(numeric_string); source.append(
" value_r = r[i]; \n");
50 source.append(
" result[i] += alpha * value_p; \n");
51 source.append(
" value_r -= alpha * Ap[i]; \n");
52 source.append(
" value_p = value_r + beta * value_p; \n");
54 source.append(
" p[i] = value_p; \n");
55 source.append(
" r[i] = value_r; \n");
56 source.append(
" inner_prod_contrib += value_r * value_r; \n");
57 source.append(
" } \n");
// --- reduction of the per-work-item contributions inside the work group ---
// The stride is halved each pass, so every slot is folded into slot 0 only
// when get_local_size(0) is a power of two.
60 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
61 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
62 source.append(
" { \n");
63 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
64 source.append(
" if (get_local_id(0) < stride) \n");
65 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
// Close the reduction loop. Without this brace the emitted kernel text has
// one more '{' than '}' and cannot be built; the other reduction loops in
// this file emit the same closing text.
66 source.append(
" } ");
// --- work item 0 publishes the work-group sum ---
69 source.append(
" if (get_local_id(0) == 0) \n ");
70 source.append(
" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
72 source.append(
"} \n");
// Appends the OpenCL source of the `cg_csr_prod` kernel to `source`.
// `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel: for every row handled, Ap[row] receives the sum of
// elements[i] * p[column_indices[i]] over the entries
// row_indices[row] .. row_indices[row + 1]. While doing so each work item
// accumulates Ap[row] * Ap[row] and p[row] * Ap[row]; the work-group sums are
// written to inner_prod_buffer[buffer_size + group] and
// inner_prod_buffer[2 * buffer_size + group].
// The `size` argument is declared but not referenced in the kernel body.
76 template<
typename StringT>
// --- kernel signature ---
79 source.append(
"__kernel void cg_csr_prod( \n");
80 source.append(
" __global const unsigned int * row_indices, \n");
81 source.append(
" __global const unsigned int * column_indices, \n");
82 source.append(
" __global const unsigned int * row_blocks, \n");
83 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
84 source.append(
" unsigned int num_blocks, \n");
85 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
86 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
87 source.append(
" unsigned int size, \n");
88 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
89 source.append(
" unsigned int buffer_size, \n");
90 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
91 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
92 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_elements) \n");
93 source.append(
"{ \n");
// Per-work-item accumulators for the two inner products.
95 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
96 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
// Each work group walks the row blocks in steps of get_num_groups(0);
// row_blocks[block_id] and row_blocks[block_id + 1] delimit the block's rows.
98 source.append(
" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
99 source.append(
" unsigned int row_start = row_blocks[block_id]; \n");
100 source.append(
" unsigned int row_stop = row_blocks[block_id + 1]; \n");
101 source.append(
" unsigned int rows_to_process = row_stop - row_start; \n");
102 source.append(
" unsigned int element_start = row_indices[row_start]; \n");
103 source.append(
" unsigned int element_stop = row_indices[row_stop]; \n");
// Block with more than one row: all products of the block are staged in
// shared_elements first, then each work item sums the slice of one row.
// NOTE(review): this indexes shared_elements up to
// element_stop - element_start - 1; the buffer size is chosen by the caller
// and is not visible here -- confirm it is large enough.
105 source.append(
" if (rows_to_process > 1) { \n");
107 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
108 source.append(
" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
110 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
113 source.append(
" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
114 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
115 source.append(
" unsigned int thread_row_start = row_indices[row] - element_start; \n");
116 source.append(
" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
117 source.append(
" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
118 source.append(
" dot_prod += shared_elements[i]; \n");
119 source.append(
" Ap[row] = dot_prod; \n");
120 source.append(
" inner_prod_ApAp += dot_prod * dot_prod; \n");
121 source.append(
" inner_prod_pAp += p[row] * dot_prod; \n");
122 source.append(
" } \n");
123 source.append(
" } \n");
// Otherwise: every work item accumulates a strided share of the entries
// between element_start and element_stop into its own shared_elements slot,
// the slots are folded by halving strides, and work item 0 stores the result
// for row_start.
125 source.append(
" else \n");
126 source.append(
" { \n");
128 source.append(
" shared_elements[get_local_id(0)] = 0; \n");
129 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
130 source.append(
" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
133 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
134 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
135 source.append(
" if (get_local_id(0) < stride) \n");
136 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
137 source.append(
" } \n");
139 source.append(
" if (get_local_id(0) == 0) { \n");
140 source.append(
" Ap[row_start] = shared_elements[0]; \n");
141 source.append(
" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
142 source.append(
" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
143 source.append(
" } \n");
144 source.append(
" } \n");
// Barrier before the next block iteration overwrites shared_elements.
145 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
146 source.append(
" } \n");
// --- work-group reduction of both accumulators ---
// The stride is halved each pass, so all slots reach slot 0 only when
// get_local_size(0) is a power of two.
149 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
150 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
151 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
152 source.append(
" { \n");
153 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
154 source.append(
" if (get_local_id(0) < stride) { \n");
155 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
156 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
157 source.append(
" } ");
158 source.append(
" } ");
// --- work item 0 publishes the two work-group sums ---
161 source.append(
" if (get_local_id(0) == 0) { \n ");
162 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
163 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
164 source.append(
" } \n");
166 source.append(
"} \n");
// Appends the OpenCL source of the `cg_coo_prod` kernel to `source`.
// `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel: entry k contributes elements[k] * p[coords[k].y] to
// Ap[coords[k].x]. A work group processes the entries
// group_boundaries[group] .. group_boundaries[group + 1] in chunks of
// get_local_size(0), combining neighbouring entries that share the same
// coords.x through local memory before storing to Ap.
// The work-group sums of Ap*Ap and p*Ap over the stored rows go to
// inner_prod_buffer[buffer_size + group] and
// inner_prod_buffer[2 * buffer_size + group].
// The `size` argument is declared but not referenced in the kernel body.
171 template<
typename StringT>
// --- kernel signature ---
174 source.append(
"__kernel void cg_coo_prod( \n");
175 source.append(
" __global const uint2 * coords, \n");
176 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
177 source.append(
" __global const uint * group_boundaries, \n");
178 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
179 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
180 source.append(
" unsigned int size, \n");
181 source.append(
" __local unsigned int * shared_rows, \n");
182 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
183 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
184 source.append(
" unsigned int buffer_size, \n");
185 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
186 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
187 source.append(
"{ \n");
// Per-work-item accumulators for the two inner products.
188 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
189 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
// Entry range of this work group; k_end is the number of chunks of
// get_local_size(0) entries needed to cover it (0 for an empty range).
192 source.append(
" uint2 tmp; \n");
193 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
194 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
195 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
196 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
198 source.append(
" uint local_index = 0; \n");
200 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
201 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
// Work items past group_end load a zero coordinate pair and a zero value.
203 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
204 source.append(
" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
// Carry from the previous chunk: the last local slot still holds that
// chunk's final row and partial sum. Work item 0 either continues the same
// row or stores the finished row to Ap.
207 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
208 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
209 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
210 source.append(
" else {\n");
211 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
212 source.append(
" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
213 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
214 source.append(
" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
215 source.append(
" } \n");
216 source.append(
" } \n");
// Publish this chunk's rows and values to local memory.
219 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
220 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
221 source.append(
" inter_results[get_local_id(0)] = val; \n");
222 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
223 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Doubling-stride pass: each slot adds the value `stride` slots to its left
// when that slot carries the same row. The read and the write are separated
// by a barrier.
225 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
226 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
227 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
228 source.append(
" inter_results[get_local_id(0)] += left; \n");
229 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
230 source.append(
" } \n");
// A slot whose right neighbour carries a different row holds a finished row
// sum and stores it. The last local slot is excluded here; it is handled by
// the carry logic of the next chunk or by the final store after the loop.
233 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
234 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
235 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
236 source.append(
" Ap[tmp.x] = Ap_entry; \n");
237 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
238 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
239 source.append(
" } \n");
241 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
242 source.append(
" } \n");
// The work item that handled the very last entry of the range stores it.
244 source.append(
" if (local_index + 1 == group_end) {\n");
245 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
246 source.append(
" Ap[tmp.x] = Ap_entry; \n");
247 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
248 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
249 source.append(
" } \n");
// --- work-group reduction of both accumulators ---
// The stride is halved each pass, so all slots reach slot 0 only when
// get_local_size(0) is a power of two.
252 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
253 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
254 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
255 source.append(
" { \n");
256 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
257 source.append(
" if (get_local_id(0) < stride) { \n");
258 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
259 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
260 source.append(
" } ");
261 source.append(
" } ");
// --- work item 0 publishes the two work-group sums ---
264 source.append(
" if (get_local_id(0) == 0) { \n ");
265 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
266 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
267 source.append(
" } \n");
269 source.append(
"} \n \n");
// Appends the OpenCL source of the `cg_ell_prod` kernel to `source`.
// `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel: for each row < size, Ap[row] is the sum over
// item_id < items_per_row of elements[offset] * p[coords[offset]], where
// offset = row + item_id * internal_row_num. Entries whose value compares
// equal to zero are skipped without reading p.
// The work-group sums of Ap*Ap and p*Ap go to
// inner_prod_buffer[buffer_size + group] and
// inner_prod_buffer[2 * buffer_size + group].
// The `aligned_items_per_row` argument is declared but not referenced in the
// kernel body.
274 template<
typename StringT>
// --- kernel signature ---
277 source.append(
"__kernel void cg_ell_prod( \n");
278 source.append(
" __global const unsigned int * coords, \n");
279 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
280 source.append(
" unsigned int internal_row_num, \n");
281 source.append(
" unsigned int items_per_row, \n");
282 source.append(
" unsigned int aligned_items_per_row, \n");
283 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
284 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
285 source.append(
" unsigned int size, \n");
286 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
287 source.append(
" unsigned int buffer_size, \n");
288 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
289 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
290 source.append(
"{ \n");
// Per-work-item accumulators and cached global id / global size.
291 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
292 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
293 source.append(
" uint glb_id = get_global_id(0); \n");
294 source.append(
" uint glb_sz = get_global_size(0); \n");
// One row per work item per pass; rows are visited with stride glb_sz.
296 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
297 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
// Consecutive items of the same row are internal_row_num apart.
299 source.append(
" uint offset = row; \n");
300 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
301 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
302 source.append(
" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
303 source.append(
" } \n");
305 source.append(
" Ap[row] = sum; \n");
306 source.append(
" inner_prod_ApAp += sum * sum; \n");
307 source.append(
" inner_prod_pAp += p[row] * sum; \n");
308 source.append(
" } \n");
// --- work-group reduction of both accumulators ---
// The stride is halved each pass, so all slots reach slot 0 only when
// get_local_size(0) is a power of two.
311 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
312 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
313 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
314 source.append(
" { \n");
315 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
316 source.append(
" if (get_local_id(0) < stride) { \n");
317 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
318 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
319 source.append(
" } ");
320 source.append(
" } ");
// --- work item 0 publishes the two work-group sums ---
323 source.append(
" if (get_local_id(0) == 0) { \n ");
324 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
325 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
326 source.append(
" } \n");
327 source.append(
"} \n \n");
// Appends the OpenCL source of the `cg_sliced_ell_prod` kernel to `source`.
// `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel: rows are grouped into blocks of get_local_size(0) rows;
// work item local_id of the group handling block_idx owns row
// block_idx * local_size + local_id. Its entries are found at
// block_start[block_idx] + item_id * local_size + local_id for
// item_id < columns_per_block[block_idx]. Entries whose value compares equal
// to zero are skipped without reading p.
// The work-group sums of Ap*Ap and p*Ap go to
// inner_prod_buffer[buffer_size + group] and
// inner_prod_buffer[2 * buffer_size + group].
330 template<
typename StringT>
// --- kernel signature ---
333 source.append(
"__kernel void cg_sliced_ell_prod( \n");
334 source.append(
" __global const unsigned int * columns_per_block, \n");
335 source.append(
" __global const unsigned int * column_indices, \n");
336 source.append(
" __global const unsigned int * block_start, \n");
337 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
338 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
339 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
340 source.append(
" unsigned int size, \n");
341 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
342 source.append(
" unsigned int buffer_size, \n");
343 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
344 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
345 source.append(
"{ \n");
// Per-work-item accumulators and cached local id / local size.
346 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
347 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
348 source.append(
" uint local_id = get_local_id(0); \n");
349 source.append(
" uint local_size = get_local_size(0); \n");
// The loop bound is inclusive (block_idx <= size / local_size), so block
// index size / local_size is visited as well.
// NOTE(review): block_start / columns_per_block are read at that index and
// elements / column_indices are read before the `row < size` check below --
// presumably the arrays are padded accordingly; confirm against the caller.
351 source.append(
" for (uint block_idx = get_group_id(0); block_idx <= size / local_size; block_idx += get_num_groups(0)) { \n");
352 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
354 source.append(
" uint row = block_idx * local_size + local_id; \n");
355 source.append(
" uint offset = block_start[block_idx]; \n");
356 source.append(
" uint num_columns = columns_per_block[block_idx]; \n");
357 source.append(
" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
358 source.append(
" uint index = offset + item_id * local_size + local_id; \n");
359 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
360 source.append(
" sum += val ? (p[column_indices[index]] * val) : 0; \n");
361 source.append(
" } \n");
// Only rows inside the vector are stored and counted.
363 source.append(
" if (row < size) {\n");
364 source.append(
" Ap[row] = sum; \n");
365 source.append(
" inner_prod_ApAp += sum * sum; \n");
366 source.append(
" inner_prod_pAp += p[row] * sum; \n");
367 source.append(
" } \n");
368 source.append(
" } \n");
// --- work-group reduction of both accumulators ---
// The stride is halved each pass, so all slots reach slot 0 only when
// get_local_size(0) is a power of two.
371 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
372 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
373 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
374 source.append(
" { \n");
375 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
376 source.append(
" if (get_local_id(0) < stride) { \n");
377 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
378 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
379 source.append(
" } ");
380 source.append(
" } ");
// --- work item 0 publishes the two work-group sums ---
383 source.append(
" if (get_local_id(0) == 0) { \n ");
384 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
385 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
386 source.append(
" } \n");
387 source.append(
"} \n \n");
// Appends the OpenCL source of the `cg_hyb_prod` kernel to `source`.
// `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel: for each row < size, Ap[row] is the sum of two parts:
//   1. ell_elements[offset] * p[ell_coords[offset]] for
//      item_id < items_per_row, offset = row + item_id * internal_row_num
//      (entries comparing equal to zero are skipped without reading p);
//   2. csr_elements[k] * p[csr_cols[k]] for k in csr_rows[row] ..
//      csr_rows[row + 1].
// The work-group sums of Ap*Ap and p*Ap go to
// inner_prod_buffer[buffer_size + group] and
// inner_prod_buffer[2 * buffer_size + group].
// The `aligned_items_per_row` argument is declared but not referenced in the
// kernel body.
390 template<
typename StringT>
// --- kernel signature ---
393 source.append(
"__kernel void cg_hyb_prod( \n");
// NOTE(review): ell_coords is declared as signed int here, while the other
// index arrays in this kernel are uint -- confirm this is intended.
394 source.append(
" const __global int* ell_coords, \n");
395 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
396 source.append(
" const __global uint* csr_rows, \n");
397 source.append(
" const __global uint* csr_cols, \n");
398 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
399 source.append(
" unsigned int internal_row_num, \n");
400 source.append(
" unsigned int items_per_row, \n");
401 source.append(
" unsigned int aligned_items_per_row, \n");
402 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
403 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
404 source.append(
" unsigned int size, \n");
405 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
406 source.append(
" unsigned int buffer_size, \n");
407 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
408 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
409 source.append(
"{ \n");
// Per-work-item accumulators and cached global id / global size.
410 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
411 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
412 source.append(
" uint glb_id = get_global_id(0); \n");
413 source.append(
" uint glb_sz = get_global_size(0); \n");
// One row per work item per pass; rows are visited with stride glb_sz.
415 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
416 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
// Part 1: fixed number of items per row, internal_row_num apart.
418 source.append(
" uint offset = row; \n");
419 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
420 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
421 source.append(
" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
422 source.append(
" } \n");
// Part 2: remaining entries of the row, delimited by csr_rows.
424 source.append(
" uint col_begin = csr_rows[row]; \n");
425 source.append(
" uint col_end = csr_rows[row + 1]; \n");
427 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
428 source.append(
" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
429 source.append(
" } \n");
431 source.append(
" Ap[row] = sum; \n");
432 source.append(
" inner_prod_ApAp += sum * sum; \n");
433 source.append(
" inner_prod_pAp += p[row] * sum; \n");
434 source.append(
" } \n");
// --- work-group reduction of both accumulators ---
// The stride is halved each pass, so all slots reach slot 0 only when
// get_local_size(0) is a power of two.
437 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
438 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
439 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
440 source.append(
" { \n");
441 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
442 source.append(
" if (get_local_id(0) < stride) { \n");
443 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
444 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
445 source.append(
" } ");
446 source.append(
" } ");
// --- work item 0 publishes the two work-group sums ---
449 source.append(
" if (get_local_id(0) == 0) { \n ");
450 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
451 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
452 source.append(
" } \n");
453 source.append(
"} \n \n");
// Appends the OpenCL source of the `bicgstab_update_s` kernel to `source`.
// `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel, in three stages:
//   1. Every work group sums inner_prod_buffer[0 .. local_size) and
//      inner_prod_buffer[3 * chunk_size .. 3 * chunk_size + local_size) and
//      sets alpha to the quotient of the two sums.
//   2. s[i] = r[i] - alpha * Ap[i] for all i < size, accumulating s[i] * s[i].
//   3. The work-group sum of those contributions is written to
//      inner_prod_buffer[get_group_id(0) + chunk_offset].
460 template<
typename StringT>
// --- kernel signature ---
463 source.append(
"__kernel void bicgstab_update_s( \n");
464 source.append(
" __global "); source.append(numeric_string); source.append(
" * s, \n");
465 source.append(
" __global "); source.append(numeric_string); source.append(
" const * r, \n");
466 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
467 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
468 source.append(
" unsigned int chunk_size, \n");
469 source.append(
" unsigned int chunk_offset, \n");
470 source.append(
" unsigned int size, \n");
471 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array, \n");
472 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_Ap_in_r0) \n");
473 source.append(
"{ \n");
475 source.append(
" "); source.append(numeric_string); source.append(
" alpha = 0; \n");
// --- stage 1: reduce the two buffer sections and form alpha ---
// Each work item loads one entry per section, indexed by its local id.
// NOTE(review): this covers a whole section only if get_local_size(0)
// matches the number of valid entries per section -- presumably chunk_size;
// confirm against the launch configuration.
// The halving stride folds all slots into slot 0 only when
// get_local_size(0) is a power of two.
478 source.append(
" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0)]; \n");
479 source.append(
" shared_array_Ap_in_r0[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + 3 * chunk_size]; \n");
480 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
481 source.append(
" { \n");
482 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
483 source.append(
" if (get_local_id(0) < stride) { \n");
484 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
485 source.append(
" shared_array_Ap_in_r0[get_local_id(0)] += shared_array_Ap_in_r0[get_local_id(0) + stride]; \n");
486 source.append(
" } ");
487 source.append(
" } ");
// Barrier so that every work item sees the reduced slot 0 values.
490 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
491 source.append(
" alpha = shared_array[0] / shared_array_Ap_in_r0[0]; ");
// --- stage 2: update s and accumulate s[i] * s[i] ---
493 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
494 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
// NOTE(review): the value loaded from s[i] is overwritten by the next
// emitted statement before it is used.
495 source.append(
" "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
496 source.append(
" \n");
497 source.append(
" value_s = r[i] - alpha * Ap[i]; \n");
498 source.append(
" inner_prod_contrib += value_s * value_s; \n");
499 source.append(
" \n");
500 source.append(
" s[i] = value_s; \n");
501 source.append(
" } \n");
// Barrier before shared_array is reused for the second reduction.
502 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// --- stage 3: work-group reduction of the s*s contributions ---
505 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
506 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
507 source.append(
" { \n");
508 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
509 source.append(
" if (get_local_id(0) < stride) \n");
510 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
511 source.append(
" } ");
// Work item 0 publishes the work-group sum at offset chunk_offset.
514 source.append(
" if (get_local_id(0) == 0) \n ");
515 source.append(
" inner_prod_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
517 source.append(
"} \n");
// Appends the OpenCL source of the `bicgstab_vector_update` kernel to
// `source`. `numeric_string` is spliced in as the kernel's scalar type name.
//
// Emitted kernel, for every index i < size visited by a work item:
//   result[i]  += alpha * p[i] + omega * s[i];
//   residual[i] = s[i] - omega * As[i];
//   p[i]        = residual[i] + beta * (p[i] - omega * Ap[i]);
// Each work item accumulates residual[i] * r0star[i]; the work-group sum is
// written to inner_prod_buffer[get_group_id(0)].
523 template<
typename StringT>
// --- kernel signature ---
526 source.append(
"__kernel void bicgstab_vector_update( \n");
527 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
528 source.append(
" "); source.append(numeric_string); source.append(
" alpha, \n");
529 source.append(
" __global "); source.append(numeric_string); source.append(
" * p, \n");
530 source.append(
" "); source.append(numeric_string); source.append(
" omega, \n");
531 source.append(
" __global "); source.append(numeric_string); source.append(
" const * s, \n");
532 source.append(
" __global "); source.append(numeric_string); source.append(
" * residual, \n");
533 source.append(
" __global "); source.append(numeric_string); source.append(
" const * As, \n");
534 source.append(
" "); source.append(numeric_string); source.append(
" beta, \n");
535 source.append(
" __global "); source.append(numeric_string); source.append(
" const * Ap, \n");
536 source.append(
" __global "); source.append(numeric_string); source.append(
" const * r0star, \n");
537 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
538 source.append(
" unsigned int size, \n");
539 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
540 source.append(
"{ \n");
// --- vector updates; each work item strides over i by get_global_size(0) ---
541 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r_r0star = 0; \n");
542 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
// All operands are loaded into private variables before any store.
// The value loaded from residual[i] is replaced below before it is read.
543 source.append(
" "); source.append(numeric_string); source.append(
" value_result = result[i]; \n");
544 source.append(
" "); source.append(numeric_string); source.append(
" value_p = p[i]; \n");
545 source.append(
" "); source.append(numeric_string); source.append(
" value_s = s[i]; \n");
546 source.append(
" "); source.append(numeric_string); source.append(
" value_residual = residual[i]; \n");
547 source.append(
" "); source.append(numeric_string); source.append(
" value_As = As[i]; \n");
548 source.append(
" "); source.append(numeric_string); source.append(
" value_Ap = Ap[i]; \n");
549 source.append(
" "); source.append(numeric_string); source.append(
" value_r0star = r0star[i]; \n");
550 source.append(
" \n");
// Order matters: result uses the old p, and the new p uses the new residual.
551 source.append(
" value_result += alpha * value_p + omega * value_s; \n");
552 source.append(
" value_residual = value_s - omega * value_As; \n");
553 source.append(
" value_p = value_residual + beta * (value_p - omega * value_Ap); \n");
554 source.append(
" \n");
555 source.append(
" result[i] = value_result; \n");
556 source.append(
" residual[i] = value_residual; \n");
557 source.append(
" p[i] = value_p; \n");
558 source.append(
" inner_prod_r_r0star += value_residual * value_r0star; \n");
559 source.append(
" } \n");
// --- work-group reduction of the per-work-item contributions ---
// The stride is halved each pass, so all slots reach slot 0 only when
// get_local_size(0) is a power of two.
562 source.append(
" shared_array[get_local_id(0)] = inner_prod_r_r0star; \n");
563 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
564 source.append(
" { \n");
565 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
566 source.append(
" if (get_local_id(0) < stride) \n");
567 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
568 source.append(
" } ");
// --- work item 0 publishes the work-group sum ---
571 source.append(
" if (get_local_id(0) == 0) \n ");
572 source.append(
" inner_prod_buffer[get_group_id(0)] = shared_array[0]; ");
574 source.append(
"} \n");
578 template<
typename StringT>
581 source.append(
"__kernel void bicgstab_csr_prod( \n");
582 source.append(
" __global const unsigned int * row_indices, \n");
583 source.append(
" __global const unsigned int * column_indices, \n");
584 source.append(
" __global const unsigned int * row_blocks, \n");
585 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
586 source.append(
" unsigned int num_blocks, \n");
587 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
588 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
589 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
590 source.append(
" unsigned int size, \n");
591 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
592 source.append(
" unsigned int buffer_size, \n");
593 source.append(
" unsigned int buffer_offset, \n");
594 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
595 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
596 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
597 source.append(
"{ \n");
598 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_elements[1024]; \n");
599 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
600 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
601 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
603 source.append(
" for (unsigned int block_id = get_group_id(0); block_id < num_blocks; block_id += get_num_groups(0)) { \n");
604 source.append(
" unsigned int row_start = row_blocks[block_id]; \n");
605 source.append(
" unsigned int row_stop = row_blocks[block_id + 1]; \n");
606 source.append(
" unsigned int rows_to_process = row_stop - row_start; \n");
607 source.append(
" unsigned int element_start = row_indices[row_start]; \n");
608 source.append(
" unsigned int element_stop = row_indices[row_stop]; \n");
610 source.append(
" if (rows_to_process > 1) { \n");
612 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
613 source.append(
" shared_elements[i - element_start] = elements[i] * p[column_indices[i]]; \n");
615 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
618 source.append(
" for (unsigned int row = row_start + get_local_id(0); row < row_stop; row += get_local_size(0)) { \n");
619 source.append(
" "); source.append(numeric_string); source.append(
" dot_prod = 0; \n");
620 source.append(
" unsigned int thread_row_start = row_indices[row] - element_start; \n");
621 source.append(
" unsigned int thread_row_stop = row_indices[row + 1] - element_start; \n");
622 source.append(
" for (unsigned int i = thread_row_start; i < thread_row_stop; ++i) \n");
623 source.append(
" dot_prod += shared_elements[i]; \n");
624 source.append(
" Ap[row] = dot_prod; \n");
625 source.append(
" inner_prod_ApAp += dot_prod * dot_prod; \n");
626 source.append(
" inner_prod_pAp += p[row] * dot_prod; \n");
627 source.append(
" inner_prod_r0Ap += r0star[row] * dot_prod; \n");
628 source.append(
" } \n");
629 source.append(
" } \n");
631 source.append(
" else \n");
632 source.append(
" { \n");
634 source.append(
" shared_elements[get_local_id(0)] = 0; \n");
635 source.append(
" for (unsigned int i = element_start + get_local_id(0); i < element_stop; i += get_local_size(0)) \n");
636 source.append(
" shared_elements[get_local_id(0)] += elements[i] * p[column_indices[i]]; \n");
639 source.append(
" for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) { \n");
640 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
641 source.append(
" if (get_local_id(0) < stride) \n");
642 source.append(
" shared_elements[get_local_id(0)] += shared_elements[get_local_id(0) + stride]; \n");
643 source.append(
" } \n");
645 source.append(
" if (get_local_id(0) == 0) { \n");
646 source.append(
" Ap[row_start] = shared_elements[0]; \n");
647 source.append(
" inner_prod_ApAp += shared_elements[0] * shared_elements[0]; \n");
648 source.append(
" inner_prod_pAp += p[row_start] * shared_elements[0]; \n");
649 source.append(
" inner_prod_r0Ap += r0star[row_start] * shared_elements[0]; \n");
650 source.append(
" } \n");
651 source.append(
" } \n");
652 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
653 source.append(
" } \n");
656 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
657 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
658 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
659 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
660 source.append(
" { \n");
661 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
662 source.append(
" if (get_local_id(0) < stride) { \n");
663 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
664 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
665 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
666 source.append(
" } ");
667 source.append(
" } ");
670 source.append(
" if (get_local_id(0) == 0) { \n ");
671 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
672 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
673 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
674 source.append(
" } \n");
676 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel `bicgstab_coo_prod` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel forms Ap = A * p for a matrix given as coordinate entries and, in the same pass,
// accumulates per-work-group partial sums of Ap*Ap, p*Ap and r0star*Ap.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing.
680 template<
typename StringT>
// --- kernel signature ---
// coords[i].x is used below as the index into Ap, coords[i].y as the index into p.
683 source.append(
"__kernel void bicgstab_coo_prod( \n");
684 source.append(
" __global const uint2 * coords, \n");
685 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
686 source.append(
" __global const uint * group_boundaries, \n");
687 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
688 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
689 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
// `size` is declared but not referenced in the kernel body emitted below.
690 source.append(
" unsigned int size, \n");
691 source.append(
" __local unsigned int * shared_rows, \n");
692 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
693 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
694 source.append(
" unsigned int buffer_size, \n");
695 source.append(
" unsigned int buffer_offset, \n");
696 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
697 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
698 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
699 source.append(
"{ \n");
// Per-work-item accumulators for the three inner products.
700 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
701 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
702 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
705 source.append(
" uint2 tmp; \n");
706 source.append(
" "); source.append(numeric_string); source.append(
" val; \n");
// Each work group handles the entry range [group_start, group_end) read from group_boundaries.
707 source.append(
" uint group_start = group_boundaries[get_group_id(0)]; \n");
708 source.append(
" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
// k_end = number of passes of get_local_size(0) entries needed to cover the range (rounded up; 0 if empty).
709 source.append(
" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n");
711 source.append(
" uint local_index = 0; \n");
713 source.append(
" for (uint k = 0; k < k_end; ++k) { \n");
714 source.append(
" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
// Work-items past the end of the range load a zero coordinate and a zero contribution.
716 source.append(
" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
717 source.append(
" val = (local_index < group_end) ? elements[local_index] * p[tmp.y] : 0; \n");
// Carry handling between passes (work-item 0 only): the last slot of the previous pass either
// continues the same row (add its running sum to val) or closes a row (write it to Ap and
// add it to the inner-product accumulators).
720 source.append(
" if (get_local_id(0) == 0 && k > 0) { \n");
721 source.append(
" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
722 source.append(
" val += inter_results[get_local_size(0)-1]; \n");
723 source.append(
" else {\n");
724 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_size(0)-1]; \n");
725 source.append(
" Ap[shared_rows[get_local_size(0)-1]] = Ap_entry; \n");
726 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
727 source.append(
" inner_prod_pAp += p[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
728 source.append(
" inner_prod_r0Ap += r0star[shared_rows[get_local_size(0)-1]] * Ap_entry; \n");
729 source.append(
" } \n");
730 source.append(
" } \n");
// Publish this pass's row indices and contributions to local memory.
733 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
734 source.append(
" shared_rows[get_local_id(0)] = tmp.x; \n");
735 source.append(
" inter_results[get_local_id(0)] = val; \n");
736 source.append(
" "); source.append(numeric_string); source.append(
" left = 0; \n");
737 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Doubling-stride scan: a work-item adds the value `stride` slots to its left only when that
// slot holds the same row index, so sums do not cross row boundaries.
739 source.append(
" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
740 source.append(
" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
741 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
742 source.append(
" inter_results[get_local_id(0)] += left; \n");
743 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
744 source.append(
" } \n");
// A work-item whose right neighbour holds a different row index writes its running sum to Ap.
// The last work-item of the group is excluded here; its value is handled by the carry logic
// of the next pass or by the block after the loop.
747 source.append(
" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
748 source.append(
" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
749 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
750 source.append(
" Ap[tmp.x] = Ap_entry; \n");
751 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
752 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
753 source.append(
" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
754 source.append(
" } \n");
756 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
757 source.append(
" } \n");
// The work-item that processed the final entry of the group's range writes the last row result.
759 source.append(
" if (local_index + 1 == group_end) {\n");
760 source.append(
" "); source.append(numeric_string); source.append(
" Ap_entry = inter_results[get_local_id(0)]; \n");
761 source.append(
" Ap[tmp.x] = Ap_entry; \n");
762 source.append(
" inner_prod_ApAp += Ap_entry * Ap_entry; \n");
763 source.append(
" inner_prod_pAp += p[tmp.x] * Ap_entry; \n");
764 source.append(
" inner_prod_r0Ap += r0star[tmp.x] * Ap_entry; \n");
765 source.append(
" } \n");
// Halving-stride reduction of the three accumulators in local memory; slot 0 ends up holding
// the work-group totals (all slots are covered when the local size is a power of two).
768 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
769 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
770 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
771 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
772 source.append(
" { \n");
773 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
774 source.append(
" if (get_local_id(0) < stride) { \n");
775 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
776 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
777 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
778 source.append(
" } ");
779 source.append(
" } ");
// Work-item 0 stores one partial per work group: Ap*Ap at offset buffer_size, p*Ap at offset
// 2*buffer_size, and r0star*Ap at offset buffer_offset of inner_prod_buffer.
782 source.append(
" if (get_local_id(0) == 0) { \n ");
783 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
784 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
785 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
786 source.append(
" } \n");
788 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel `bicgstab_ell_prod` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel forms Ap = A * p from a fixed-width-per-row layout (`items_per_row` entries per row,
// consecutive entries of one row spaced `internal_row_num` apart) and accumulates per-work-group
// partial sums of Ap*Ap, p*Ap and r0star*Ap.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing.
793 template<
typename StringT>
// --- kernel signature ---
796 source.append(
"__kernel void bicgstab_ell_prod( \n");
797 source.append(
" __global const unsigned int * coords, \n");
798 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
799 source.append(
" unsigned int internal_row_num, \n");
800 source.append(
" unsigned int items_per_row, \n");
// `aligned_items_per_row` is declared but not referenced in the kernel body emitted below.
801 source.append(
" unsigned int aligned_items_per_row, \n");
802 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
803 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
804 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
805 source.append(
" unsigned int size, \n");
806 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
807 source.append(
" unsigned int buffer_size, \n");
808 source.append(
" unsigned int buffer_offset, \n");
809 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
810 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
811 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
812 source.append(
"{ \n");
// Per-work-item accumulators for the three inner products.
813 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
814 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
815 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
816 source.append(
" uint glb_id = get_global_id(0); \n");
817 source.append(
" uint glb_sz = get_global_size(0); \n");
// One row per work-item per iteration; rows are visited with a step of the global work size.
819 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
820 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
822 source.append(
" uint offset = row; \n");
823 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
824 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[offset]; \n");
// Entries whose stored value is zero contribute nothing and do not read p.
825 source.append(
" sum += val ? p[coords[offset]] * val : ("); source.append(numeric_string); source.append(
")0; \n");
826 source.append(
" } \n");
// Store the row result and fold it into the three inner products.
828 source.append(
" Ap[row] = sum; \n");
829 source.append(
" inner_prod_ApAp += sum * sum; \n");
830 source.append(
" inner_prod_pAp += p[row] * sum; \n");
831 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
832 source.append(
" } \n");
// Halving-stride reduction of the three accumulators in local memory; slot 0 ends up holding
// the work-group totals (all slots are covered when the local size is a power of two).
835 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
836 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
837 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
838 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
839 source.append(
" { \n");
840 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
841 source.append(
" if (get_local_id(0) < stride) { \n");
842 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
843 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
844 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
845 source.append(
" } ");
846 source.append(
" } ");
// Work-item 0 stores one partial per work group: Ap*Ap at offset buffer_size, p*Ap at offset
// 2*buffer_size, and r0star*Ap at offset buffer_offset of inner_prod_buffer.
849 source.append(
" if (get_local_id(0) == 0) { \n ");
850 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
851 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
852 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
853 source.append(
" } \n");
854 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel `bicgstab_sliced_ell_prod` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel forms Ap = A * p for a matrix stored in blocks of `get_local_size(0)` rows, each block
// with its own column count (`columns_per_block`) and start offset (`block_start`), and accumulates
// per-work-group partial sums of Ap*Ap, p*Ap and r0star*Ap.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing.
857 template<
typename StringT>
// --- kernel signature ---
860 source.append(
"__kernel void bicgstab_sliced_ell_prod( \n");
861 source.append(
" __global const unsigned int * columns_per_block, \n");
862 source.append(
" __global const unsigned int * column_indices, \n");
863 source.append(
" __global const unsigned int * block_start, \n");
864 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
865 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
866 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
867 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
868 source.append(
" unsigned int size, \n");
869 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
870 source.append(
" unsigned int buffer_size, \n");
871 source.append(
" unsigned int buffer_offset, \n");
872 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
873 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
874 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
875 source.append(
"{ \n");
// Per-work-item accumulators for the three inner products.
876 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
877 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
878 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
879 source.append(
" uint local_id = get_local_id(0); \n");
880 source.append(
" uint local_size = get_local_size(0); \n");
// One block of local_size rows per work group per iteration.
// NOTE(review): the bound is `<=`, so block index size / local_size is visited even when size is an
// exact multiple of local_size; block_start and columns_per_block must then have an entry for that
// index -- confirm against the host-side buffer sizes.
882 source.append(
" for (uint block_idx = get_group_id(0); block_idx <= size / local_size; block_idx += get_num_groups(0)) { \n");
883 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
885 source.append(
" uint row = block_idx * local_size + local_id; \n");
886 source.append(
" uint offset = block_start[block_idx]; \n");
887 source.append(
" uint num_columns = columns_per_block[block_idx]; \n");
// Within a block, the entries of one column slot are stored local_size apart per item_id step.
888 source.append(
" for (uint item_id = 0; item_id < num_columns; item_id++) { \n");
889 source.append(
" uint index = offset + item_id * local_size + local_id; \n");
890 source.append(
" "); source.append(numeric_string); source.append(
" val = elements[index]; \n");
// Entries whose stored value is zero contribute nothing and do not read p.
891 source.append(
" sum += val ? (p[column_indices[index]] * val) : 0; \n");
892 source.append(
" } \n");
// Rows at or beyond `size` (padding of the last block) are computed above but not stored.
894 source.append(
" if (row < size) {\n");
895 source.append(
" Ap[row] = sum; \n");
896 source.append(
" inner_prod_ApAp += sum * sum; \n");
897 source.append(
" inner_prod_pAp += p[row] * sum; \n");
898 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
899 source.append(
" } \n");
900 source.append(
" } \n");
// Halving-stride reduction of the three accumulators in local memory; slot 0 ends up holding
// the work-group totals (all slots are covered when the local size is a power of two).
903 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
904 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
905 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
906 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
907 source.append(
" { \n");
908 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
909 source.append(
" if (get_local_id(0) < stride) { \n");
910 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
911 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
912 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
913 source.append(
" } ");
914 source.append(
" } ");
// Work-item 0 stores one partial per work group: Ap*Ap at offset buffer_size, p*Ap at offset
// 2*buffer_size, and r0star*Ap at offset buffer_offset of inner_prod_buffer.
917 source.append(
" if (get_local_id(0) == 0) { \n ");
918 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
919 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
920 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
921 source.append(
" } \n");
922 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel `bicgstab_hyb_prod` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel forms Ap = A * p where each row's result is the sum of a fixed-width part
// (ell_coords / ell_elements) and a row-pointer-indexed part (csr_rows / csr_cols / csr_elements),
// and accumulates per-work-group partial sums of Ap*Ap, p*Ap and r0star*Ap.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing.
925 template<
typename StringT>
// --- kernel signature ---
928 source.append(
"__kernel void bicgstab_hyb_prod( \n");
929 source.append(
" const __global int* ell_coords, \n");
930 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
931 source.append(
" const __global uint* csr_rows, \n");
932 source.append(
" const __global uint* csr_cols, \n");
933 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
934 source.append(
" unsigned int internal_row_num, \n");
935 source.append(
" unsigned int items_per_row, \n");
// `aligned_items_per_row` is declared but not referenced in the kernel body emitted below.
936 source.append(
" unsigned int aligned_items_per_row, \n");
937 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
938 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
939 source.append(
" __global const "); source.append(numeric_string); source.append(
" * r0star, \n");
940 source.append(
" unsigned int size, \n");
941 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
942 source.append(
" unsigned int buffer_size, \n");
943 source.append(
" unsigned int buffer_offset, \n");
944 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
945 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
946 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_r0Ap) \n");
947 source.append(
"{ \n");
// Per-work-item accumulators for the three inner products.
948 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_ApAp = 0; \n");
949 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_pAp = 0; \n");
950 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_r0Ap = 0; \n");
951 source.append(
" uint glb_id = get_global_id(0); \n");
952 source.append(
" uint glb_sz = get_global_size(0); \n");
// One row per work-item per iteration; rows are visited with a step of the global work size.
954 source.append(
" for (uint row = glb_id; row < size; row += glb_sz) { \n");
955 source.append(
" "); source.append(numeric_string); source.append(
" sum = 0; \n");
// Fixed-width part: items_per_row entries per row, spaced internal_row_num apart.
957 source.append(
" uint offset = row; \n");
958 source.append(
" for (uint item_id = 0; item_id < items_per_row; item_id++, offset += internal_row_num) { \n");
959 source.append(
" "); source.append(numeric_string); source.append(
" val = ell_elements[offset]; \n");
// Entries whose stored value is zero contribute nothing and do not read p.
960 source.append(
" sum += val ? (p[ell_coords[offset]] * val) : 0; \n");
961 source.append(
" } \n");
// Row-pointer part: entries csr_rows[row] .. csr_rows[row + 1] - 1 belong to this row.
963 source.append(
" uint col_begin = csr_rows[row]; \n");
964 source.append(
" uint col_end = csr_rows[row + 1]; \n");
966 source.append(
" for (uint item_id = col_begin; item_id < col_end; item_id++) { \n");
967 source.append(
" sum += (p[csr_cols[item_id]] * csr_elements[item_id]); \n");
968 source.append(
" } \n");
// Store the row result and fold it into the three inner products.
970 source.append(
" Ap[row] = sum; \n");
971 source.append(
" inner_prod_ApAp += sum * sum; \n");
972 source.append(
" inner_prod_pAp += p[row] * sum; \n");
973 source.append(
" inner_prod_r0Ap += r0star[row] * sum; \n");
974 source.append(
" } \n");
// Halving-stride reduction of the three accumulators in local memory; slot 0 ends up holding
// the work-group totals (all slots are covered when the local size is a power of two).
977 source.append(
" shared_array_ApAp[get_local_id(0)] = inner_prod_ApAp; \n");
978 source.append(
" shared_array_pAp[get_local_id(0)] = inner_prod_pAp; \n");
979 source.append(
" shared_array_r0Ap[get_local_id(0)] = inner_prod_r0Ap; \n");
980 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
981 source.append(
" { \n");
982 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
983 source.append(
" if (get_local_id(0) < stride) { \n");
984 source.append(
" shared_array_ApAp[get_local_id(0)] += shared_array_ApAp[get_local_id(0) + stride]; \n");
985 source.append(
" shared_array_pAp[get_local_id(0)] += shared_array_pAp[get_local_id(0) + stride]; \n");
986 source.append(
" shared_array_r0Ap[get_local_id(0)] += shared_array_r0Ap[get_local_id(0) + stride]; \n");
987 source.append(
" } ");
988 source.append(
" } ");
// Work-item 0 stores one partial per work group: Ap*Ap at offset buffer_size, p*Ap at offset
// 2*buffer_size, and r0star*Ap at offset buffer_offset of inner_prod_buffer.
991 source.append(
" if (get_local_id(0) == 0) { \n ");
992 source.append(
" inner_prod_buffer[ buffer_size + get_group_id(0)] = shared_array_ApAp[0]; \n");
993 source.append(
" inner_prod_buffer[2*buffer_size + get_group_id(0)] = shared_array_pAp[0]; \n");
994 source.append(
" inner_prod_buffer[buffer_offset + get_group_id(0)] = shared_array_r0Ap[0]; \n");
995 source.append(
" } \n");
996 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel `gmres_gram_schmidt_1` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel computes, per work group, partial inner products between vector k of `krylov_basis`
// (stored at offset k * internal_size) and each earlier vector 0..k-1, handling the earlier vectors
// in batches of at most 7, and writes the partials to `vi_in_vk_buffer`.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing,
// and the embedded line numbers have gaps, so host-side control flow may be missing from view.
1002 template <
typename StringType>
// --- kernel signature ---
1005 source.append(
"__kernel void gmres_gram_schmidt_1( \n");
1006 source.append(
" __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
1007 source.append(
" unsigned int size, \n");
1008 source.append(
" unsigned int internal_size, \n");
1009 source.append(
" unsigned int k, \n");
1010 source.append(
" __global "); source.append(numeric_string); source.append(
" * vi_in_vk_buffer, \n");
1011 source.append(
" unsigned int chunk_size) \n");
1012 source.append(
"{ \n");
// Fixed-size local scratch, indexed below as [local id + j * chunk_size] for j < 7.
// NOTE(review): this presumably requires chunk_size <= 128 and a local size <= chunk_size -- confirm.
1014 source.append(
" __local "); source.append(numeric_string); source.append(
" shared_array[7*128]; \n");
// Per-work-item accumulators, one per vector of the current batch.
1017 source.append(
" "); source.append(numeric_string); source.append(
" vi_in_vk[7]; \n");
1019 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
// Batch loop: k_base is the index of the first earlier vector handled in this batch.
1021 source.append(
" unsigned int k_base = 0; \n");
1022 source.append(
" while (k_base < k) { \n");
1023 source.append(
" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
// Reset the local-memory slots and the private accumulators for this batch.
1027 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1028 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = 0; \n");
1032 source.append(
" vi_in_vk[0] = 0;\n");
1033 source.append(
" vi_in_vk[1] = 0;\n");
1034 source.append(
" vi_in_vk[2] = 0;\n");
1035 source.append(
" vi_in_vk[3] = 0;\n");
1036 source.append(
" vi_in_vk[4] = 0;\n");
1037 source.append(
" vi_in_vk[5] = 0;\n");
1038 source.append(
" vi_in_vk[6] = 0;\n");
// Accumulate value_vk * (entry i of vector k_base + j) over the entries owned by this work-item.
1040 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1041 source.append(
" value_vk = krylov_basis[i + k * internal_size]; \n");
1042 source.append(
" \n");
1043 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
// NOTE(review): as listed, two accumulation statements follow one brace-less `for` header, which
// would leave the second statement outside the loop. The embedded numbers jump 1043 -> 1045 -> 1047,
// so a host-side choice between the two variants is probably hidden -- confirm against the full file.
1045 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1047 source.append(
" vi_in_vk[j] += value_vk * krylov_basis[i + (k_base + j) * internal_size]; \n");
1048 source.append(
" } \n");
// Copy the private accumulators into local memory ahead of the work-group reduction.
1053 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1054 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk[j]; \n");
// Halving-stride reduction, performed for every vector of the batch at once.
1056 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1057 source.append(
" { \n");
1058 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1059 source.append(
" if (get_local_id(0) < stride) { \n");
1060 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1061 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1062 source.append(
" } ");
1063 source.append(
" } ");
// Work-item 0 writes this group's partial for vector (k_base + j) to slot
// [group id + (k_base + j) * chunk_size] of vi_in_vk_buffer.
1066 source.append(
" if (get_local_id(0) == 0) \n ");
1067 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1068 source.append(
" vi_in_vk_buffer[get_group_id(0) + (k_base + j) * chunk_size] = shared_array[j*chunk_size]; ");
1070 source.append(
" k_base += vecs_in_iteration; \n");
1071 source.append(
" } \n");
1073 source.append(
"} \n");
// Appends the OpenCL C source of the kernel `gmres_gram_schmidt_2` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel sums the per-group partials stored in `vi_in_vk_buffer`, subtracts the resulting
// multiples of the earlier vectors from vector k of `krylov_basis` (in batches of at most 7),
// records the coefficients in `R_buffer`, and writes per-work-group partial sums of the updated
// vector's squared entries to `inner_prod_buffer`.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing.
1077 template <
typename StringType>
// --- kernel signature ---
1080 source.append(
"__kernel void gmres_gram_schmidt_2( \n");
1081 source.append(
" __global "); source.append(numeric_string); source.append(
" * krylov_basis, \n");
1082 source.append(
" unsigned int size, \n");
1083 source.append(
" unsigned int internal_size, \n");
1084 source.append(
" unsigned int k, \n");
1085 source.append(
" __global "); source.append(numeric_string); source.append(
" const * vi_in_vk_buffer, \n");
1086 source.append(
" unsigned int chunk_size, \n");
1087 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
1088 source.append(
" unsigned int krylov_dim, \n");
1089 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1090 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
1091 source.append(
"{ \n");
1093 source.append(
" "); source.append(numeric_string); source.append(
" vk_dot_vk = 0; \n");
1094 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = 0; \n");
// Batch loop: k_base is the index of the first earlier vector handled in this batch.
1096 source.append(
" unsigned int k_base = 0; \n");
1097 source.append(
" while (k_base < k) { \n");
1098 source.append(
" unsigned int vecs_in_iteration = (k - k_base > 7) ? 7 : (k - k_base); \n");
// Load the stored partials for each vector of the batch (one per work-item) and reduce them,
// so that shared_array[j*chunk_size] holds the summed coefficient for vector k_base + j.
1101 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1102 source.append(
" shared_array[get_local_id(0) + j*chunk_size] = vi_in_vk_buffer[get_local_id(0) + (k_base + j) * chunk_size]; \n");
1103 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1104 source.append(
" { \n");
1105 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1106 source.append(
" if (get_local_id(0) < stride) { \n");
1107 source.append(
" for (uint j=0; j<vecs_in_iteration; ++j) \n");
1108 source.append(
" shared_array[get_local_id(0) + j*chunk_size] += shared_array[get_local_id(0) + j*chunk_size + stride]; \n");
1109 source.append(
" } ");
1110 source.append(
" } ");
// Make the reduced coefficients visible to every work-item before they are read below.
1111 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Subtract coefficient * (earlier vector) from vector k, entry by entry, and store the result back.
1114 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1115 source.append(
" value_vk = krylov_basis[i + k * internal_size]; \n");
1116 source.append(
" \n");
1117 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1118 source.append(
" value_vk -= shared_array[j*chunk_size] * krylov_basis[i + (k_base + j) * internal_size]; \n");
// The squared entry is accumulated only in the final batch, i.e. once all earlier vectors
// have been subtracted.
1119 source.append(
" vk_dot_vk += (k_base + vecs_in_iteration == k) ? (value_vk * value_vk) : 0; \n");
1120 source.append(
" krylov_basis[i + k * internal_size] = value_vk; \n");
1121 source.append(
" } \n");
// Every work-item of group 0 stores the batch coefficients at R_buffer[(k_base + j) + k*krylov_dim].
1124 source.append(
" if (get_group_id(0) == 0) \n");
1125 source.append(
" for (unsigned int j=0; j<vecs_in_iteration; ++j) \n");
1126 source.append(
" R_buffer[(k_base + j) + k*krylov_dim] = shared_array[j*chunk_size]; ");
// Barrier before the next batch overwrites shared_array.
1127 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1129 source.append(
" k_base += vecs_in_iteration; \n");
1130 source.append(
" } \n");
// Halving-stride reduction of vk_dot_vk in local memory; slot 0 ends up holding the
// work-group total (all slots are covered when the local size is a power of two).
1133 source.append(
" shared_array[get_local_id(0)] = vk_dot_vk; \n");
1134 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1135 source.append(
" { \n");
1136 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1137 source.append(
" if (get_local_id(0) < stride) \n");
1138 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1139 source.append(
" } ");
// Work-item 0 stores the group's partial at inner_prod_buffer[chunk_size + group id].
1142 source.append(
" if (get_local_id(0) == 0) \n ");
1143 source.append(
" inner_prod_buffer[chunk_size+get_group_id(0)] = shared_array[0]; ");
1145 source.append(
"} \n");
// Appends the OpenCL C source of the kernel `gmres_normalize_vk` to `source`.
// `numeric_string` is spliced in wherever the kernel needs its scalar type.
// The kernel sums the partials stored in `inner_prod_buffer` starting at offset `chunk_size`,
// takes the square root as the norm, divides the `size` entries of `vk` (starting at `vk_offset`)
// by it, writes per-work-group partial sums of residual * vk to `r_dot_vk_buffer`, and stores the
// norm at `R_buffer[R_offset]`.
// NOTE(review): the enclosing C++ function signature and braces are not visible in this listing.
1148 template <
typename StringType>
// --- kernel signature ---
1151 source.append(
"__kernel void gmres_normalize_vk( \n");
1152 source.append(
" __global "); source.append(numeric_string); source.append(
" * vk, \n");
1153 source.append(
" unsigned int vk_offset, \n");
1154 source.append(
" __global "); source.append(numeric_string); source.append(
" const * residual, \n");
1155 source.append(
" __global "); source.append(numeric_string); source.append(
" * R_buffer, \n");
1156 source.append(
" unsigned int R_offset, \n");
1157 source.append(
" __global "); source.append(numeric_string); source.append(
" const * inner_prod_buffer, \n");
1158 source.append(
" unsigned int chunk_size, \n");
1159 source.append(
" __global "); source.append(numeric_string); source.append(
" * r_dot_vk_buffer, \n");
1160 source.append(
" unsigned int chunk_offset, \n");
1161 source.append(
" unsigned int size, \n");
1162 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array) \n");
1163 source.append(
"{ \n");
1165 source.append(
" "); source.append(numeric_string); source.append(
" norm_vk = 0; \n");
// Each work-item loads one stored partial; the halving-stride reduction leaves the sum in slot 0.
// Every work group repeats this reduction so that each has the norm available.
1168 source.append(
" shared_array[get_local_id(0)] = inner_prod_buffer[get_local_id(0) + chunk_size]; \n");
1169 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1170 source.append(
" { \n");
1171 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1172 source.append(
" if (get_local_id(0) < stride) \n");
1173 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1174 source.append(
" } ");
// Barrier so that all work-items read the finished sum from slot 0.
1177 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1178 source.append(
" norm_vk = sqrt(shared_array[0]); \n");
// Scale vk by 1 / norm_vk in place and accumulate residual[i] * (scaled vk entry).
1180 source.append(
" "); source.append(numeric_string); source.append(
" inner_prod_contrib = 0; \n");
1181 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1182 source.append(
" "); source.append(numeric_string); source.append(
" value_vk = vk[i + vk_offset] / norm_vk; \n");
1183 source.append(
" \n");
1184 source.append(
" inner_prod_contrib += residual[i] * value_vk; \n");
1185 source.append(
" \n");
1186 source.append(
" vk[i + vk_offset] = value_vk; \n");
1187 source.append(
" } \n");
// Barrier before shared_array is reused: some work-items may still be reading slot 0 above.
1188 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
// Halving-stride reduction of the residual * vk contributions.
1191 source.append(
" shared_array[get_local_id(0)] = inner_prod_contrib; \n");
1192 source.append(
" for (uint stride=get_local_size(0)/2; stride > 0; stride /= 2) \n");
1193 source.append(
" { \n");
1194 source.append(
" barrier(CLK_LOCAL_MEM_FENCE); \n");
1195 source.append(
" if (get_local_id(0) < stride) \n");
1196 source.append(
" shared_array[get_local_id(0)] += shared_array[get_local_id(0) + stride]; \n");
1197 source.append(
" } ");
// Work-item 0 of each group stores its partial at r_dot_vk_buffer[group id + chunk_offset];
// the single work-item with global id 0 stores the norm at R_buffer[R_offset].
1200 source.append(
" if (get_local_id(0) == 0) \n ");
1201 source.append(
" r_dot_vk_buffer[get_group_id(0) + chunk_offset] = shared_array[0]; ");
1202 source.append(
" if (get_global_id(0) == 0) \n ");
1203 source.append(
" R_buffer[R_offset] = norm_vk; \n");
1205 source.append(
"} \n");
// Appends the OpenCL C source of the kernel gmres_update_result to source.
// NOTE(review): the enclosing function signature and braces are missing from this
// listing; presumably this is the body of
// generate_pipelined_gmres_update_result(StringType &source, std::string const &numeric_string)
// - confirm against the full header.
// numeric_string is spliced in wherever the kernel needs its scalar type.
//
// Generated kernel, for each index i (a work item starts at get_global_id(0) and
// advances by get_global_size(0) while i < size):
//   result[i] += coefficients[0] * residual[i]
//              + sum over j in [1, k) of coefficients[j] * krylov_basis[i + (j-1)*internal_size]
1209 template <
typename StringType>
// Kernel signature: three vectors, two sizes, the coefficient array and the term count k.
1212 source.append(
"__kernel void gmres_update_result( \n");
1213 source.append(
" __global "); source.append(numeric_string); source.append(
" * result, \n");
1214 source.append(
" __global "); source.append(numeric_string); source.append(
" const * residual, \n");
1215 source.append(
" __global "); source.append(numeric_string); source.append(
" const * krylov_basis, \n");
1216 source.append(
" unsigned int size, \n");
1217 source.append(
" unsigned int internal_size, \n");
1218 source.append(
" __global "); source.append(numeric_string); source.append(
" const * coefficients, \n");
1219 source.append(
" unsigned int k) \n");
1220 source.append(
"{ \n");
// Kernel body: the running sum is kept in a private variable and written back once per index.
1222 source.append(
" for (unsigned int i = get_global_id(0); i < size; i += get_global_size(0)) { \n");
1223 source.append(
" "); source.append(numeric_string); source.append(
" value_result = result[i] + coefficients[0] * residual[i]; \n");
1224 source.append(
" \n");
// coefficients[j] for j >= 1 scales the vector stored at offset (j-1)*internal_size in krylov_basis.
1225 source.append(
" for (unsigned int j = 1; j < k; ++j) \n");
1226 source.append(
" value_result += coefficients[j] * krylov_basis[i + (j-1)*internal_size]; \n");
1227 source.append(
" \n");
1228 source.append(
" result[i] = value_result; \n");
1229 source.append(
" } \n");
1231 source.append(
"} \n");
// Appends the OpenCL C source of the kernel gmres_csr_prod to source.
// NOTE(review): the enclosing function signature and braces are missing from this
// listing; presumably this is the body of
// generate_compressed_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
// - confirm against the full header.
// numeric_string is spliced in wherever the kernel needs its scalar type.
//
// The generated kernel does no computation of its own: it forwards all arguments to
// cg_csr_prod, replacing p and Ap by p + offset_p and Ap + offset_Ap. cg_csr_prod is
// not defined by this block and has to be provided elsewhere in the same program source.
1235 template <
typename StringType>
1238 source.append(
"__kernel void gmres_csr_prod( \n");
1239 source.append(
" __global const unsigned int * row_indices, \n");
1240 source.append(
" __global const unsigned int * column_indices, \n");
1241 source.append(
" __global const unsigned int * row_blocks, \n");
1242 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1243 source.append(
" unsigned int num_blocks, \n");
1244 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1245 source.append(
" unsigned int offset_p, \n");
1246 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1247 source.append(
" unsigned int offset_Ap, \n");
1248 source.append(
" unsigned int size, \n");
1249 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1250 source.append(
" unsigned int buffer_size, \n");
1251 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1252 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp, \n");
1253 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_elements) \n");
1254 source.append(
"{ \n");
// Single forwarding call; only the two vector pointers are shifted by their offsets.
1255 source.append(
" cg_csr_prod(row_indices, column_indices, row_blocks, elements, num_blocks, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp, shared_elements); \n");
1256 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel gmres_coo_prod to source.
// NOTE(review): the enclosing function signature and braces are missing from this
// listing; presumably this is the body of
// generate_coordinate_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
// - confirm against the full header.
// numeric_string is spliced in wherever the kernel needs its scalar type.
//
// The generated kernel does no computation of its own: it forwards all arguments to
// cg_coo_prod, replacing p and Ap by p + offset_p and Ap + offset_Ap. cg_coo_prod is
// not defined by this block and has to be provided elsewhere in the same program source.
1260 template <
typename StringType>
1263 source.append(
"__kernel void gmres_coo_prod( \n");
1264 source.append(
" __global const uint2 * coords, \n");
1265 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1266 source.append(
" __global const uint * group_boundaries, \n");
1267 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1268 source.append(
" unsigned int offset_p, \n");
1269 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1270 source.append(
" unsigned int offset_Ap, \n");
1271 source.append(
" unsigned int size, \n");
1272 source.append(
" __local unsigned int * shared_rows, \n");
1273 source.append(
" __local "); source.append(numeric_string); source.append(
" * inter_results, \n");
1274 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1275 source.append(
" unsigned int buffer_size, \n");
1276 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1277 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1278 source.append(
"{ \n");
// Single forwarding call; only the two vector pointers are shifted by their offsets.
1279 source.append(
" cg_coo_prod(coords, elements, group_boundaries, p + offset_p, Ap + offset_Ap, size, shared_rows, inter_results, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1280 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel gmres_ell_prod to source.
// NOTE(review): the enclosing function signature and braces are missing from this
// listing; presumably this is the body of
// generate_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
// - confirm against the full header.
// numeric_string is spliced in wherever the kernel needs its scalar type.
//
// The generated kernel does no computation of its own: it forwards all arguments to
// cg_ell_prod, replacing p and Ap by p + offset_p and Ap + offset_Ap. cg_ell_prod is
// not defined by this block and has to be provided elsewhere in the same program source.
1285 template <
typename StringType>
1288 source.append(
"__kernel void gmres_ell_prod( \n");
1289 source.append(
" __global const unsigned int * coords, \n");
1290 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1291 source.append(
" unsigned int internal_row_num, \n");
1292 source.append(
" unsigned int items_per_row, \n");
1293 source.append(
" unsigned int aligned_items_per_row, \n");
1294 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1295 source.append(
" unsigned int offset_p, \n");
1296 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1297 source.append(
" unsigned int offset_Ap, \n");
1298 source.append(
" unsigned int size, \n");
1299 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1300 source.append(
" unsigned int buffer_size, \n");
1301 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1302 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1303 source.append(
"{ \n");
// Single forwarding call; only the two vector pointers are shifted by their offsets.
1304 source.append(
" cg_ell_prod(coords, elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1305 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel gmres_sliced_ell_prod to source.
// NOTE(review): the enclosing function signature and braces are missing from this
// listing; presumably this is the body of
// generate_sliced_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
// - confirm against the full header.
// numeric_string is spliced in wherever the kernel needs its scalar type.
//
// The generated kernel does no computation of its own: it forwards all arguments to
// cg_sliced_ell_prod, replacing p and Ap by p + offset_p and Ap + offset_Ap.
// cg_sliced_ell_prod is not defined by this block and has to be provided elsewhere in
// the same program source.
1308 template <
typename StringType>
1311 source.append(
"__kernel void gmres_sliced_ell_prod( \n");
1312 source.append(
" __global const unsigned int * columns_per_block, \n");
1313 source.append(
" __global const unsigned int * column_indices, \n");
1314 source.append(
" __global const unsigned int * block_start, \n");
1315 source.append(
" __global const "); source.append(numeric_string); source.append(
" * elements, \n");
1316 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1317 source.append(
" unsigned int offset_p, \n");
1318 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1319 source.append(
" unsigned int offset_Ap, \n");
1320 source.append(
" unsigned int size, \n");
1321 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1322 source.append(
" unsigned int buffer_size, \n");
1323 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1324 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1325 source.append(
"{ \n");
// Single forwarding call; only the two vector pointers are shifted by their offsets.
1326 source.append(
" cg_sliced_ell_prod(columns_per_block, column_indices, block_start, elements, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1327 source.append(
"} \n \n");
// Appends the OpenCL C source of the kernel gmres_hyb_prod to source.
// NOTE(review): the enclosing function signature and braces are missing from this
// listing; presumably this is the body of
// generate_hyb_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
// - confirm against the full header.
// numeric_string is spliced in wherever the kernel needs its scalar type.
//
// The generated kernel does no computation of its own: it forwards all arguments to
// cg_hyb_prod, replacing p and Ap by p + offset_p and Ap + offset_Ap. cg_hyb_prod is
// not defined by this block and has to be provided elsewhere in the same program source.
1330 template <
typename StringType>
1333 source.append(
"__kernel void gmres_hyb_prod( \n");
1334 source.append(
" const __global int* ell_coords, \n");
1335 source.append(
" const __global "); source.append(numeric_string); source.append(
"* ell_elements, \n");
1336 source.append(
" const __global uint* csr_rows, \n");
1337 source.append(
" const __global uint* csr_cols, \n");
1338 source.append(
" const __global "); source.append(numeric_string); source.append(
"* csr_elements, \n");
1339 source.append(
" unsigned int internal_row_num, \n");
1340 source.append(
" unsigned int items_per_row, \n");
1341 source.append(
" unsigned int aligned_items_per_row, \n");
1342 source.append(
" __global const "); source.append(numeric_string); source.append(
" * p, \n");
1343 source.append(
" unsigned int offset_p, \n");
1344 source.append(
" __global "); source.append(numeric_string); source.append(
" * Ap, \n");
1345 source.append(
" unsigned int offset_Ap, \n");
1346 source.append(
" unsigned int size, \n");
1347 source.append(
" __global "); source.append(numeric_string); source.append(
" * inner_prod_buffer, \n");
1348 source.append(
" unsigned int buffer_size, \n");
1349 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_ApAp, \n");
1350 source.append(
" __local "); source.append(numeric_string); source.append(
" * shared_array_pAp) \n");
1351 source.append(
"{ \n");
// Single forwarding call; only the two vector pointers are shifted by their offsets.
1352 source.append(
" cg_hyb_prod(ell_coords, ell_elements, csr_rows, csr_cols, csr_elements, internal_row_num, items_per_row, aligned_items_per_row, p + offset_p, Ap + offset_Ap, size, inner_prod_buffer, buffer_size, shared_array_ApAp, shared_array_pAp); \n");
1353 source.append(
"} \n \n");
// Fragment of the routine that assembles the kernel source string and registers it
// as a program with an OpenCL context.
// NOTE(review): most lines of this definition (struct header, function signature,
// the calls that fill source, the closing #endif and braces) are missing from this
// listing; presumably this is static void init(viennacl::ocl::context &ctx) - confirm
// against the full header before relying on these notes.
1363 template<
typename NumericT>
// One flag per cl_context handle; it is set to true at the end of this fragment.
1373 static std::map<cl_context, bool> init_done;
// Pre-allocate room in the source string before text is appended to it.
1380 source.reserve(1024);
// Helper is defined elsewhere; judging by its name it adds the double-precision
// pragma when NumericT requires it - TODO confirm.
1382 viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
// Diagnostic output is compiled in only when VIENNACL_BUILD_INFO is defined.
1410 #ifdef VIENNACL_BUILD_INFO
1411 std::cout <<
"Creating program " << prog_name << std::endl;
// Hand the assembled source to the context under prog_name, then record that this
// context handle has been processed.
1413 ctx.add_program(source, prog_name);
1414 init_done[ctx.handle().get()] =
true;
Main kernel class for generating specialized OpenCL kernels for fast iterative solvers.
void generate_sliced_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
static void init(viennacl::ocl::context &ctx)
Some helper routines for reading/writing/printing scheduler expressions.
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Provides OpenCL-related utilities.
void generate_sliced_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
static std::string program_name()
void generate_pipelined_gmres_gram_schmidt_stage1(StringType &source, std::string const &numeric_string, bool is_nvidia)
void generate_ell_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
void generate_hyb_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_normalize_vk(StringType &source, std::string const &numeric_string)
void generate_pipelined_gmres_gram_schmidt_stage2(StringType &source, std::string const &numeric_string)
void generate_compressed_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
static void apply(viennacl::ocl::context const &)
void generate_ell_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
const OCL_TYPE & get() const
void generate_compressed_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_vector_update(StringT &source, std::string const &numeric_string)
void generate_coordinate_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_coordinate_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Provides the datastructures for dealing with a single statement such as 'x = y + z;'.
Proxy classes for vectors.
void generate_hyb_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
void generate_hyb_matrix_pipelined_cg_prod(StringT &source, std::string const &numeric_string)
void generate_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_coordinate_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_pipelined_bicgstab_update_s(StringT &source, std::string const &numeric_string)
void generate_pipelined_cg_vector_update(StringT &source, std::string const &numeric_string)
void generate_pipelined_gmres_update_result(StringType &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
void generate_compressed_matrix_pipelined_bicgstab_prod(StringT &source, std::string const &numeric_string)
void generate_sliced_ell_matrix_pipelined_gmres_prod(StringType &source, std::string const &numeric_string)