ViennaCL - The Vienna Computing Library  1.6.2
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
coordinate_matrix.hpp
Go to the documentation of this file.
1 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
2 #define VIENNACL_LINALG_OPENCL_KERNELS_COORDINATE_MATRIX_HPP
3 
7 #include "viennacl/ocl/utils.hpp"
8 
10 
13 namespace viennacl
14 {
15 namespace linalg
16 {
17 namespace opencl
18 {
19 namespace kernels
20 {
21 
23 
/** @brief Appends the OpenCL source of the kernel `vec_mul` (sparse matrix in coordinate format times vector) to `source`.
 *
 * The generated kernel receives the nonzeros as `coords` (pairs of (row_index, column_index))
 * and `elements`. Work group g processes the entries in
 * [group_boundaries[g], group_boundaries[g+1]) in chunks of get_local_size(0) entries.
 * Within each chunk the products `elements[i] * x[...]` belonging to the same row are summed by a
 * segmented reduction in local memory (`shared_rows`, `inter_results`); a partial sum of the last
 * work item is carried over to the first work item of the next chunk.
 *
 * The vectors are addressed as `x[col * layout_x.y + layout_x.x]` and
 * `result[row * layout_result.y + layout_result.x]`.
 *
 * @param source          String-like object (must provide append()) the kernel source is appended to
 * @param numeric_string  OpenCL name of the scalar type used for elements, x and result (e.g. "float")
 */
template<typename StringT>
void generate_coordinate_matrix_vec_mul(StringT & source, std::string const & numeric_string)
{
  // kernel signature
  source.append("__kernel void vec_mul( \n");
  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global const "); source.append(numeric_string); source.append(" * x, \n");
  source.append(" uint4 layout_x, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" uint4 layout_result, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
  source.append("{ \n");
  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  // work items past the end of the group contribute a zero entry
  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end) ? elements[local_index] * x[tmp.y * layout_x.y + layout_x.x] : 0; \n");

  //check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
  source.append(" val += inter_results[get_local_size(0)-1]; \n");
  source.append(" else \n");
  source.append(" result[shared_rows[get_local_size(0)-1] * layout_result.y + layout_result.x] = inter_results[get_local_size(0)-1]; \n");
  source.append(" } \n");

  //segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  //segmented parallel reduction end

  // a work item whose right neighbor holds a different row owns the finished sum of its row
  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
  source.append(" result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  source.append(" if (local_index + 1 == group_end) \n"); //write results of last active entry (this may not necessarily be the case already)
  source.append(" result[tmp.x * layout_result.y + layout_result.x] = inter_results[get_local_id(0)]; \n");
  source.append("} \n");

}
88 
89 namespace detail
90 {
92  template<typename StringT>
93  void generate_coordinate_matrix_dense_matrix_mul(StringT & source, std::string const & numeric_string,
94  bool B_transposed, bool B_row_major, bool C_row_major)
95  {
96  source.append("__kernel void ");
97  source.append(viennacl::linalg::opencl::detail::sparse_dense_matmult_kernel_name(B_transposed, B_row_major, C_row_major));
98  source.append("( \n");
99  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
100  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
101  source.append(" __global const uint * group_boundaries, \n");
102  source.append(" __global const "); source.append(numeric_string); source.append(" * d_mat, \n");
103  source.append(" unsigned int d_mat_row_start, \n");
104  source.append(" unsigned int d_mat_col_start, \n");
105  source.append(" unsigned int d_mat_row_inc, \n");
106  source.append(" unsigned int d_mat_col_inc, \n");
107  source.append(" unsigned int d_mat_row_size, \n");
108  source.append(" unsigned int d_mat_col_size, \n");
109  source.append(" unsigned int d_mat_internal_rows, \n");
110  source.append(" unsigned int d_mat_internal_cols, \n");
111  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
112  source.append(" unsigned int result_row_start, \n");
113  source.append(" unsigned int result_col_start, \n");
114  source.append(" unsigned int result_row_inc, \n");
115  source.append(" unsigned int result_col_inc, \n");
116  source.append(" unsigned int result_row_size, \n");
117  source.append(" unsigned int result_col_size, \n");
118  source.append(" unsigned int result_internal_rows, \n");
119  source.append(" unsigned int result_internal_cols, \n");
120  source.append(" __local unsigned int * shared_rows, \n");
121  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
122  source.append("{ \n");
123  source.append(" uint2 tmp; \n");
124  source.append(" "); source.append(numeric_string); source.append(" val; \n");
125  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
126  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
127  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)
128 
129  source.append(" uint local_index = 0; \n");
130 
131  source.append(" for (uint result_col = 0; result_col < result_col_size; ++result_col) { \n");
132  source.append(" for (uint k = 0; k < k_end; ++k) { \n");
133  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");
134 
135  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
136  if (B_transposed && B_row_major)
137  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + tmp.y * d_mat_col_inc ] : 0; \n");
138  if (B_transposed && !B_row_major)
139  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + result_col * d_mat_row_inc) + (d_mat_col_start + tmp.y * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
140  else if (!B_transposed && B_row_major)
141  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) * d_mat_internal_cols + d_mat_col_start + result_col * d_mat_col_inc ] : 0; \n");
142  else
143  source.append(" val = (local_index < group_end) ? elements[local_index] * d_mat[ (d_mat_row_start + tmp.y * d_mat_row_inc) + (d_mat_col_start + result_col * d_mat_col_inc) * d_mat_internal_rows ] : 0; \n");
144 
145  //check for carry from previous loop run:
146  source.append(" if (get_local_id(0) == 0 && k > 0) { \n");
147  source.append(" if (tmp.x == shared_rows[get_local_size(0)-1]) \n");
148  source.append(" val += inter_results[get_local_size(0)-1]; \n");
149  source.append(" else \n");
150  if (C_row_major)
151  source.append(" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_size(0)-1]; \n");
152  else
153  source.append(" result[(shared_rows[get_local_size(0)-1] * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_size(0)-1]; \n");
154  source.append(" } \n");
155 
156  //segmented parallel reduction begin
157  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
158  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
159  source.append(" inter_results[get_local_id(0)] = val; \n");
160  source.append(" "); source.append(numeric_string); source.append(" left = 0; \n");
161  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
162 
163  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) { \n");
164  source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : 0; \n");
165  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
166  source.append(" inter_results[get_local_id(0)] += left; \n");
167  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
168  source.append(" } \n");
169  //segmented parallel reduction end
170 
171  source.append(" if (local_index < group_end && get_local_id(0) < get_local_size(0) - 1 && \n");
172  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1]) { \n");
173  if (C_row_major)
174  source.append(" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
175  else
176  source.append(" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
177  source.append(" } \n");
178 
179  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
180  source.append(" } \n"); //for k
181 
182  source.append(" if (local_index + 1 == group_end) \n"); //write results of last active entry (this may not necessarily be the case already)
183  if (C_row_major)
184  source.append(" result[(tmp.x * result_row_inc + result_row_start) * result_internal_cols + result_col_start + result_col * result_col_inc ] = inter_results[get_local_id(0)]; \n");
185  else
186  source.append(" result[(tmp.x * result_row_inc + result_row_start) + (result_col_start + result_col * result_col_inc) * result_internal_rows ] = inter_results[get_local_id(0)]; \n");
187  source.append(" } \n"); //for result_col
188  source.append("} \n");
189 
190  }
191 }
192 
193 template<typename StringT>
194 void generate_coordinate_matrix_dense_matrix_multiplication(StringT & source, std::string const & numeric_string)
195 {
196  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, false);
197  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, false, true);
198  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, false);
199  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, false, true, true);
200 
201  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, false);
202  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, false, true);
203  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, false);
204  detail::generate_coordinate_matrix_dense_matrix_mul(source, numeric_string, true, true, true);
205 }
206 
/** @brief Appends the OpenCL source of the kernel `row_info_extractor` to `source`.
 *
 * The generated kernel computes one value per matrix row from the nonzeros given as
 * `coords` ((row_index, column_index) pairs) and `elements`, selected by the kernel argument `option`:
 *   - 0: inf-norm
 *   - 1: 1-norm
 *   - 2: 2-norm (squares are accumulated, sqrt is applied when the row result is written)
 *   - 3: diagonal entry (only entries with row_index == column_index contribute)
 *
 * As in the other kernels of this file, work group g processes the entries in
 * [group_boundaries[g], group_boundaries[g+1]) in chunks of get_local_size(0), combines entries
 * of equal row by a segmented reduction in local memory, and carries the partial result of the
 * last work item over to the next chunk. Row results equal to zero are not written.
 *
 * The number of chunks `k_end` is a uint; its zero branch is emitted as a plain `0` so that the
 * conditional expression is not promoted to the floating point type `numeric_string`.
 *
 * @param source          String-like object (must provide append()) the kernel source is appended to
 * @param numeric_string  OpenCL name of the scalar type of the matrix entries (e.g. "float")
 */
template<typename StringT>
void generate_coordinate_matrix_row_info_extractor(StringT & source, std::string const & numeric_string)
{
  // kernel signature
  source.append("__kernel void row_info_extractor( \n");
  source.append(" __global const uint2 * coords, \n");//(row_index, column_index)
  source.append(" __global const "); source.append(numeric_string); source.append(" * elements, \n");
  source.append(" __global const uint * group_boundaries, \n");
  source.append(" __global "); source.append(numeric_string); source.append(" * result, \n");
  source.append(" unsigned int option, \n");
  source.append(" __local unsigned int * shared_rows, \n");
  source.append(" __local "); source.append(numeric_string); source.append(" * inter_results) \n");
  source.append("{ \n");
  source.append(" uint2 tmp; \n");
  source.append(" "); source.append(numeric_string); source.append(" val; \n");
  source.append(" uint last_index = get_local_size(0) - 1; \n");
  source.append(" uint group_start = group_boundaries[get_group_id(0)]; \n");
  source.append(" uint group_end = group_boundaries[get_group_id(0) + 1]; \n");
  source.append(" uint k_end = (group_end > group_start) ? 1 + (group_end - group_start - 1) / get_local_size(0) : 0; \n"); // -1 in order to have correct behavior if group_end - group_start == j * get_local_size(0)

  source.append(" uint local_index = 0; \n");

  source.append(" for (uint k = 0; k < k_end; ++k) \n");
  source.append(" { \n");
  source.append(" local_index = group_start + k * get_local_size(0) + get_local_id(0); \n");

  source.append(" tmp = (local_index < group_end) ? coords[local_index] : (uint2) 0; \n");
  source.append(" val = (local_index < group_end && (option != 3 || tmp.x == tmp.y) ) ? elements[local_index] : 0; \n");

  //check for carry from previous loop run:
  source.append(" if (get_local_id(0) == 0 && k > 0) \n");
  source.append(" { \n");
  source.append(" if (tmp.x == shared_rows[last_index]) \n");
  source.append(" { \n");
  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n"); //inf-norm
  source.append(" case 3: \n"); //diagonal entry
  source.append(" val = max(val, fabs(inter_results[last_index])); \n");
  source.append(" break; \n");

  source.append(" case 1: \n"); //1-norm
  source.append(" val = fabs(val) + inter_results[last_index]; \n");
  source.append(" break; \n");

  source.append(" case 2: \n"); //2-norm
  source.append(" val = sqrt(val * val + inter_results[last_index]); \n");
  source.append(" break; \n");

  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" else \n");
  source.append(" { \n");
  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n"); //inf-norm
  source.append(" case 1: \n"); //1-norm
  source.append(" case 3: \n"); //diagonal entry
  source.append(" result[shared_rows[last_index]] = inter_results[last_index]; \n");
  source.append(" break; \n");

  source.append(" case 2: \n"); //2-norm
  source.append(" result[shared_rows[last_index]] = sqrt(inter_results[last_index]); \n");
  // no 'break' is emitted here: control falls into 'default', which only breaks
  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" } \n");
  source.append(" } \n");

  //segmented parallel reduction begin
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" shared_rows[get_local_id(0)] = tmp.x; \n");
  // NOTE(review): for options 0 and 3 the signed value is stored and later combined with max();
  // fabs() is only applied to the carried value above. Presumably this is only exact for
  // non-negative contributions - confirm against the intended inf-norm/diagonal semantics.
  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n");
  source.append(" case 3: \n");
  source.append(" inter_results[get_local_id(0)] = val; \n");
  source.append(" break; \n");
  source.append(" case 1: \n");
  source.append(" inter_results[get_local_id(0)] = fabs(val); \n");
  source.append(" break; \n");
  source.append(" case 2: \n");
  source.append(" inter_results[get_local_id(0)] = val * val; \n");
  // no 'break' is emitted here: control falls into 'default', which only breaks
  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");

  source.append(" for (unsigned int stride = 1; stride < get_local_size(0); stride *= 2) \n");
  source.append(" { \n");
  source.append(" "); source.append(numeric_string); source.append(" left = (get_local_id(0) >= stride && tmp.x == shared_rows[get_local_id(0) - stride]) ? inter_results[get_local_id(0) - stride] : ("); source.append(numeric_string); source.append(")0; \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" switch (option) \n");
  source.append(" { \n");
  source.append(" case 0: \n"); //inf-norm
  source.append(" case 3: \n"); //diagonal entry
  source.append(" inter_results[get_local_id(0)] = max(inter_results[get_local_id(0)], left); \n");
  source.append(" break; \n");

  source.append(" case 1: \n"); //1-norm
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" break; \n");

  source.append(" case 2: \n"); //2-norm
  source.append(" inter_results[get_local_id(0)] += left; \n");
  source.append(" break; \n");

  source.append(" default: \n");
  source.append(" break; \n");
  source.append(" } \n");
  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n");
  //segmented parallel reduction end

  // a work item whose right neighbor holds a different row owns the finished value of its row
  source.append(" if (get_local_id(0) != last_index && \n");
  source.append(" shared_rows[get_local_id(0)] != shared_rows[get_local_id(0) + 1] && \n");
  source.append(" inter_results[get_local_id(0)] != 0) \n");
  source.append(" { \n");
  source.append(" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
  source.append(" } \n");

  source.append(" barrier(CLK_LOCAL_MEM_FENCE); \n");
  source.append(" } \n"); //for k

  // write the result of the last active entry of the group
  source.append(" if (local_index + 1 == group_end && inter_results[get_local_id(0)] != 0) \n");
  source.append(" result[tmp.x] = (option == 2) ? sqrt(inter_results[get_local_id(0)]) : inter_results[get_local_id(0)]; \n");
  source.append("} \n");
}
336 
338 
339 // main kernel class
341 template<typename NumericT>
343 {
344  static std::string program_name()
345  {
346  return viennacl::ocl::type_to_string<NumericT>::apply() + "_coordinate_matrix";
347  }
348 
349  static void init(viennacl::ocl::context & ctx)
350  {
351  static std::map<cl_context, bool> init_done;
352  if (!init_done[ctx.handle().get()])
353  {
355  std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
356 
357  std::string source;
358  source.reserve(1024);
359 
360  viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
361 
362  generate_coordinate_matrix_vec_mul(source, numeric_string);
364  generate_coordinate_matrix_row_info_extractor(source, numeric_string);
365 
366  std::string prog_name = program_name();
367  #ifdef VIENNACL_BUILD_INFO
368  std::cout << "Creating program " << prog_name << std::endl;
369  #endif
370  ctx.add_program(source, prog_name);
371  init_done[ctx.handle().get()] = true;
372  } //if
373  } //init
374 };
375 
376 } // namespace kernels
377 } // namespace opencl
378 } // namespace linalg
379 } // namespace viennacl
380 #endif
381 
static void init(viennacl::ocl::context &ctx)
Implements an OpenCL platform within ViennaCL.
Various little tools used here and there in ViennaCL.
std::string sparse_dense_matmult_kernel_name(bool B_transposed, bool B_row_major, bool C_row_major)
Returns the OpenCL kernel string for the operation C = A * B with A sparse, B, C dense matrices...
Definition: common.hpp:70
Manages an OpenCL context and provides the respective convenience functions for creating buffers...
Definition: context.hpp:54
Main kernel class for generating OpenCL kernels for coordinate_matrix.
Provides OpenCL-related utilities.
void generate_coordinate_matrix_dense_matrix_mul(StringT &source, std::string const &numeric_string, bool B_transposed, bool B_row_major, bool C_row_major)
Generate kernel for C = A * B with A being a coordinate_matrix, B and C dense.
const viennacl::ocl::handle< cl_context > & handle() const
Returns the context handle.
Definition: context.hpp:613
Common implementations shared by OpenCL-based operations.
static void apply(viennacl::ocl::context const &)
Definition: utils.hpp:40
const OCL_TYPE & get() const
Definition: handle.hpp:189
void generate_coordinate_matrix_dense_matrix_multiplication(StringT &source, std::string const &numeric_string)
Representation of an OpenCL kernel in ViennaCL.
void generate_coordinate_matrix_vec_mul(StringT &source, std::string const &numeric_string)
Helper class for converting a type to its string representation.
Definition: utils.hpp:57
void generate_coordinate_matrix_row_info_extractor(StringT &source, std::string const &numeric_string)