ViennaCL - The Vienna Computing Library  1.5.2
viennacl/linalg/opencl/kernels/vector.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
00002 #define VIENNACL_LINALG_OPENCL_KERNELS_VECTOR_HPP
00003 
00004 #include "viennacl/tools/tools.hpp"
00005 #include "viennacl/ocl/kernel.hpp"
00006 #include "viennacl/ocl/platform.hpp"
00007 #include "viennacl/ocl/utils.hpp"
00008 
00011 namespace viennacl
00012 {
00013   namespace linalg
00014   {
00015     namespace opencl
00016     {
00017       namespace kernels
00018       {
00019 
00021 
00023         enum avbv_scalar_type
00024         {
00025           VIENNACL_AVBV_NONE = 0, // vector does not exist/contribute
00026           VIENNACL_AVBV_CPU,
00027           VIENNACL_AVBV_GPU
00028         };
00029 
00031         struct avbv_config
00032         {
00033           avbv_config() : with_stride_and_range(true), a(VIENNACL_AVBV_CPU), b(VIENNACL_AVBV_NONE) {}
00034 
00035           bool with_stride_and_range;
00036           std::string      assign_op;
00037           avbv_scalar_type a;
00038           avbv_scalar_type b;
00039         };
00040 
00041         // just returns the for-loop
00042         template <typename StringType>
00043         void generate_avbv_impl2(StringType & source, std::string const & /*numeric_string*/, avbv_config const & cfg, bool mult_alpha, bool mult_beta)
00044         {
00045           source.append("    for (unsigned int i = get_global_id(0); i < size1.z; i += get_global_size(0)) \n");
00046           if (cfg.with_stride_and_range)
00047           {
00048             source.append("      vec1[i*size1.y+size1.x] "); source.append(cfg.assign_op); source.append(" vec2[i*size2.y+size2.x] ");
00049             if (mult_alpha)
00050               source.append("* alpha ");
00051             else
00052               source.append("/ alpha ");
00053             if (cfg.b != VIENNACL_AVBV_NONE)
00054             {
00055               source.append("+ vec3[i*size3.y+size3.x] ");
00056               if (mult_beta)
00057                 source.append("* beta");
00058               else
00059                 source.append("/ beta");
00060             }
00061           }
00062           else
00063           {
00064             source.append("    vec1[i] "); source.append(cfg.assign_op); source.append(" vec2[i] ");
00065             if (mult_alpha)
00066               source.append("* alpha ");
00067             else
00068               source.append("/ alpha ");
00069             if (cfg.b != VIENNACL_AVBV_NONE)
00070             {
00071               source.append("+ vec3[i] ");
00072               if (mult_beta)
00073                 source.append("* beta");
00074               else
00075                 source.append("/ beta");
00076             }
00077           }
00078           source.append("; \n");
00079         }
00080 
00081         template <typename StringType>
00082         void generate_avbv_impl(StringType & source, std::string const & numeric_string, avbv_config const & cfg)
00083         {
00084           source.append("__kernel void av");
00085           if (cfg.b != VIENNACL_AVBV_NONE)
00086             source.append("bv");
00087           if (cfg.assign_op != "=")
00088             source.append("_v");
00089 
00090           if (cfg.a == VIENNACL_AVBV_CPU)
00091             source.append("_cpu");
00092           else if (cfg.a == VIENNACL_AVBV_GPU)
00093             source.append("_gpu");
00094 
00095           if (cfg.b == VIENNACL_AVBV_CPU)
00096             source.append("_cpu");
00097           else if (cfg.b == VIENNACL_AVBV_GPU)
00098             source.append("_gpu");
00099           source.append("( \n");
00100           source.append("  __global "); source.append(numeric_string); source.append(" * vec1, \n");
00101           source.append("  uint4 size1, \n");
00102           source.append(" \n");
00103           if (cfg.a == VIENNACL_AVBV_CPU)
00104           {
00105             source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
00106           }
00107           else if (cfg.a == VIENNACL_AVBV_GPU)
00108           {
00109             source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
00110           }
00111           source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
00112           source.append("  __global const "); source.append(numeric_string); source.append(" * vec2, \n");
00113           source.append("  uint4 size2");
00114 
00115           if (cfg.b != VIENNACL_AVBV_NONE)
00116           {
00117             source.append(", \n\n");
00118             if (cfg.b == VIENNACL_AVBV_CPU)
00119             {
00120               source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
00121             }
00122             else if (cfg.b == VIENNACL_AVBV_GPU)
00123             {
00124               source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
00125             }
00126             source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
00127             source.append("  __global const "); source.append(numeric_string); source.append(" * vec3, \n");
00128             source.append("  uint4 size3 \n");
00129           }
00130           source.append(") { \n");
00131 
00132           if (cfg.a == VIENNACL_AVBV_CPU)
00133           {
00134             source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
00135           }
00136           else if (cfg.a == VIENNACL_AVBV_GPU)
00137           {
00138             source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
00139           }
00140           source.append("  if (options2 & (1 << 0)) \n");
00141           source.append("    alpha = -alpha; \n");
00142           source.append(" \n");
00143 
00144           if (cfg.b == VIENNACL_AVBV_CPU)
00145           {
00146             source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
00147           }
00148           else if (cfg.b == VIENNACL_AVBV_GPU)
00149           {
00150             source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
00151           }
00152           if (cfg.b != VIENNACL_AVBV_NONE)
00153           {
00154             source.append("  if (options3 & (1 << 0)) \n");
00155             source.append("    beta = -beta; \n");
00156             source.append(" \n");
00157           }
00158           source.append("  if (options2 & (1 << 1)) { \n");
00159           if (cfg.b != VIENNACL_AVBV_NONE)
00160           {
00161             source.append("    if (options3 & (1 << 1)) {\n");
00162             generate_avbv_impl2(source, numeric_string, cfg, false, false);
00163             source.append("    } else {\n");
00164             generate_avbv_impl2(source, numeric_string, cfg, false, true);
00165             source.append("    } \n");
00166           }
00167           else
00168             generate_avbv_impl2(source, numeric_string, cfg, false, true);
00169           source.append("  } else { \n");
00170           if (cfg.b != VIENNACL_AVBV_NONE)
00171           {
00172             source.append("    if (options3 & (1 << 1)) {\n");
00173             generate_avbv_impl2(source, numeric_string, cfg, true, false);
00174             source.append("    } else {\n");
00175             generate_avbv_impl2(source, numeric_string, cfg, true, true);
00176             source.append("    } \n");
00177           }
00178           else
00179             generate_avbv_impl2(source, numeric_string, cfg, true, true);
00180           source.append("  } \n");
00181           source.append("} \n");
00182         }
00183 
00184         template <typename StringType>
00185         void generate_avbv(StringType & source, std::string const & numeric_string)
00186         {
00187           avbv_config cfg;
00188           cfg.assign_op = "=";
00189           cfg.with_stride_and_range = true;
00190 
00191           // av
00192           cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
00193           cfg.b = VIENNACL_AVBV_NONE; cfg.a = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
00194 
00195           // avbv
00196           cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
00197           cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
00198           cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
00199           cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
00200 
00201           // avbv
00202           cfg.assign_op = "+=";
00203 
00204           cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
00205           cfg.a = VIENNACL_AVBV_CPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
00206           cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_CPU; generate_avbv_impl(source, numeric_string, cfg);
00207           cfg.a = VIENNACL_AVBV_GPU; cfg.b = VIENNACL_AVBV_GPU; generate_avbv_impl(source, numeric_string, cfg);
00208         }
00209 
00210         template <typename StringType>
00211         void generate_plane_rotation(StringType & source, std::string const & numeric_string)
00212         {
00213           source.append("__kernel void plane_rotation( \n");
00214           source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
00215           source.append("          unsigned int start1, \n");
00216           source.append("          unsigned int inc1, \n");
00217           source.append("          unsigned int size1, \n");
00218           source.append("          __global "); source.append(numeric_string); source.append(" * vec2, \n");
00219           source.append("          unsigned int start2, \n");
00220           source.append("          unsigned int inc2, \n");
00221           source.append("          unsigned int size2, \n");
00222           source.append("          "); source.append(numeric_string); source.append(" alpha, \n");
00223           source.append("          "); source.append(numeric_string); source.append(" beta) \n");
00224           source.append("{ \n");
00225           source.append("  "); source.append(numeric_string); source.append(" tmp1 = 0; \n");
00226           source.append("  "); source.append(numeric_string); source.append(" tmp2 = 0; \n");
00227           source.append(" \n");
00228           source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
00229           source.append(" { \n");
00230           source.append("    tmp1 = vec1[i*inc1+start1]; \n");
00231           source.append("    tmp2 = vec2[i*inc2+start2]; \n");
00232           source.append(" \n");
00233           source.append("    vec1[i*inc1+start1] = alpha * tmp1 + beta * tmp2; \n");
00234           source.append("    vec2[i*inc2+start2] = alpha * tmp2 - beta * tmp1; \n");
00235           source.append("  } \n");
00236           source.append(" \n");
00237           source.append("} \n");
00238         }
00239 
00240         template <typename StringType>
00241         void generate_vector_swap(StringType & source, std::string const & numeric_string)
00242         {
00243           source.append("__kernel void swap( \n");
00244           source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
00245           source.append("          unsigned int start1, \n");
00246           source.append("          unsigned int inc1, \n");
00247           source.append("          unsigned int size1, \n");
00248           source.append("          __global "); source.append(numeric_string); source.append(" * vec2, \n");
00249           source.append("          unsigned int start2, \n");
00250           source.append("          unsigned int inc2, \n");
00251           source.append("          unsigned int size2 \n");
00252           source.append("          ) \n");
00253           source.append("{ \n");
00254           source.append("  "); source.append(numeric_string); source.append(" tmp; \n");
00255           source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
00256           source.append("  { \n");
00257           source.append("    tmp = vec2[i*inc2+start2]; \n");
00258           source.append("    vec2[i*inc2+start2] = vec1[i*inc1+start1]; \n");
00259           source.append("    vec1[i*inc1+start1] = tmp; \n");
00260           source.append("  } \n");
00261           source.append("} \n");
00262         }
00263 
00264         template <typename StringType>
00265         void generate_assign_cpu(StringType & source, std::string const & numeric_string)
00266         {
00267           source.append("__kernel void assign_cpu( \n");
00268           source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
00269           source.append("          unsigned int start1, \n");
00270           source.append("          unsigned int inc1, \n");
00271           source.append("          unsigned int size1, \n");
00272           source.append("          unsigned int internal_size1, \n");
00273           source.append("          "); source.append(numeric_string); source.append(" alpha) \n");
00274           source.append("{ \n");
00275           source.append("  for (unsigned int i = get_global_id(0); i < internal_size1; i += get_global_size(0)) \n");
00276           source.append("    vec1[i*inc1+start1] = (i < size1) ? alpha : 0; \n");
00277           source.append("} \n");
00278 
00279         }
00280 
00281         template <typename StringType>
00282         void generate_inner_prod(StringType & source, std::string const & numeric_string, vcl_size_t vector_num)
00283         {
00284           std::stringstream ss;
00285           ss << vector_num;
00286           std::string vector_num_string = ss.str();
00287 
00288           source.append("__kernel void inner_prod"); source.append(vector_num_string); source.append("( \n");
00289           source.append("          __global const "); source.append(numeric_string); source.append(" * x, \n");
00290           source.append("          uint4 params_x, \n");
00291           for (vcl_size_t i=0; i<vector_num; ++i)
00292           {
00293             ss.str("");
00294             ss << i;
00295             source.append("          __global const "); source.append(numeric_string); source.append(" * y"); source.append(ss.str()); source.append(", \n");
00296             source.append("          uint4 params_y"); source.append(ss.str()); source.append(", \n");
00297           }
00298           source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
00299           source.append("          __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
00300           source.append("{ \n");
00301           source.append("  unsigned int entries_per_thread = (params_x.z - 1) / get_global_size(0) + 1; \n");
00302           source.append("  unsigned int vec_start_index = get_group_id(0) * get_local_size(0) * entries_per_thread; \n");
00303           source.append("  unsigned int vec_stop_index  = min((unsigned int)((get_group_id(0) + 1) * get_local_size(0) * entries_per_thread), params_x.z); \n");
00304 
00305           // compute partial results within group:
00306           for (vcl_size_t i=0; i<vector_num; ++i)
00307           {
00308             ss.str("");
00309             ss << i;
00310             source.append("  "); source.append(numeric_string); source.append(" tmp"); source.append(ss.str()); source.append(" = 0; \n");
00311           }
00312           source.append("  for (unsigned int i = vec_start_index + get_local_id(0); i < vec_stop_index; i += get_local_size(0)) { \n");
00313           source.append("    ");  source.append(numeric_string); source.append(" val_x = x[i*params_x.y + params_x.x]; \n");
00314           for (vcl_size_t i=0; i<vector_num; ++i)
00315           {
00316             ss.str("");
00317             ss << i;
00318             source.append("    tmp"); source.append(ss.str()); source.append(" += val_x * y"); source.append(ss.str()); source.append("[i * params_y"); source.append(ss.str()); source.append(".y + params_y"); source.append(ss.str()); source.append(".x]; \n");
00319           }
00320           source.append("  } \n");
00321           for (vcl_size_t i=0; i<vector_num; ++i)
00322           {
00323             ss.str("");
00324             ss << i;
00325             source.append("  tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] = tmp"); source.append(ss.str()); source.append("; \n");
00326           }
00327 
00328           // now run reduction:
00329           source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
00330           source.append("  { \n");
00331           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00332           source.append("    if (get_local_id(0) < stride) { \n");
00333           for (vcl_size_t i=0; i<vector_num; ++i)
00334           {
00335             ss.str("");
00336             ss << i;
00337             source.append("      tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0)] += tmp_buffer[get_local_id(0) + "); source.append(ss.str()); source.append(" * get_local_size(0) + stride]; \n");
00338           }
00339           source.append("    } \n");
00340           source.append("  } \n");
00341           source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
00342 
00343           source.append("  if (get_local_id(0) == 0) { \n");
00344           for (vcl_size_t i=0; i<vector_num; ++i)
00345           {
00346             ss.str("");
00347             ss << i;
00348             source.append("    group_buffer[get_group_id(0) + "); source.append(ss.str()); source.append(" * get_num_groups(0)] = tmp_buffer["); source.append(ss.str()); source.append(" * get_local_size(0)]; \n");
00349           }
00350           source.append("  } \n");
00351           source.append("} \n");
00352 
00353         }
00354 
00355         template <typename StringType>
00356         void generate_norm(StringType & source, std::string const & numeric_string)
00357         {
00358           bool is_float_or_double = (numeric_string == "float" || numeric_string == "double");
00359 
00360           source.append(numeric_string); source.append(" impl_norm( \n");
00361           source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
00362           source.append("          unsigned int start1, \n");
00363           source.append("          unsigned int inc1, \n");
00364           source.append("          unsigned int size1, \n");
00365           source.append("          unsigned int norm_selector, \n");
00366           source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer) \n");
00367           source.append("{ \n");
00368           source.append("  "); source.append(numeric_string); source.append(" tmp = 0; \n");
00369           source.append("  if (norm_selector == 1) \n"); //norm_1
00370           source.append("  { \n");
00371           source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
00372           if (is_float_or_double)
00373             source.append("      tmp += fabs(vec[i*inc1 + start1]); \n");
00374           else
00375             source.append("      tmp += abs(vec[i*inc1 + start1]); \n");
00376           source.append("  } \n");
00377           source.append("  else if (norm_selector == 2) \n"); //norm_2
00378           source.append("  { \n");
00379           source.append("    "); source.append(numeric_string); source.append(" vec_entry = 0; \n");
00380           source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
00381           source.append("    { \n");
00382           source.append("      vec_entry = vec[i*inc1 + start1]; \n");
00383           source.append("      tmp += vec_entry * vec_entry; \n");
00384           source.append("    } \n");
00385           source.append("  } \n");
00386           source.append("  else if (norm_selector == 0) \n"); //norm_inf
00387           source.append("  { \n");
00388           source.append("    for (unsigned int i = get_local_id(0); i < size1; i += get_local_size(0)) \n");
00389           if (is_float_or_double)
00390             source.append("      tmp = fmax(fabs(vec[i*inc1 + start1]), tmp); \n");
00391           else
00392           {
00393             source.append("      tmp = max(("); source.append(numeric_string); source.append(")abs(vec[i*inc1 + start1]), tmp); \n");
00394           }
00395           source.append("  } \n");
00396 
00397           source.append("  tmp_buffer[get_local_id(0)] = tmp; \n");
00398 
00399           source.append("  if (norm_selector > 0) \n"); //norm_1 or norm_2:
00400           source.append("  { \n");
00401           source.append("    for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
00402           source.append("    { \n");
00403           source.append("      barrier(CLK_LOCAL_MEM_FENCE); \n");
00404           source.append("      if (get_local_id(0) < stride) \n");
00405           source.append("        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0)+stride]; \n");
00406           source.append("    } \n");
00407           source.append("    return tmp_buffer[0]; \n");
00408           source.append("  } \n");
00409 
00410           //norm_inf:
00411           source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
00412           source.append("  { \n");
00413           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00414           source.append("    if (get_local_id(0) < stride) \n");
00415           if (is_float_or_double)
00416             source.append("      tmp_buffer[get_local_id(0)] = fmax(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
00417           else
00418             source.append("      tmp_buffer[get_local_id(0)] = max(tmp_buffer[get_local_id(0)], tmp_buffer[get_local_id(0)+stride]); \n");
00419           source.append("  } \n");
00420 
00421           source.append("  return tmp_buffer[0]; \n");
00422           source.append("}; \n");
00423 
00424           source.append("__kernel void norm( \n");
00425           source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
00426           source.append("          unsigned int start1, \n");
00427           source.append("          unsigned int inc1, \n");
00428           source.append("          unsigned int size1, \n");
00429           source.append("          unsigned int norm_selector, \n");
00430           source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
00431           source.append("          __global "); source.append(numeric_string); source.append(" * group_buffer) \n");
00432           source.append("{ \n");
00433           source.append("  "); source.append(numeric_string); source.append(" tmp = impl_norm(vec, \n");
00434           source.append("                        (        get_group_id(0)  * size1) / get_num_groups(0) * inc1 + start1, \n");
00435           source.append("                        inc1, \n");
00436           source.append("                        (   (1 + get_group_id(0)) * size1) / get_num_groups(0) \n");
00437           source.append("                      - (        get_group_id(0)  * size1) / get_num_groups(0), \n");
00438           source.append("                        norm_selector, \n");
00439           source.append("                        tmp_buffer); \n");
00440 
00441           source.append("  if (get_local_id(0) == 0) \n");
00442           source.append("    group_buffer[get_group_id(0)] = tmp; \n");
00443           source.append("} \n");
00444 
00445         }
00446 
00447         template <typename StringType>
00448         void generate_inner_prod_sum(StringType & source, std::string const & numeric_string)
00449         {
00450           // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
00451           source.append("__kernel void sum_inner_prod( \n");
00452           source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
00453           source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
00454           source.append("          __global "); source.append(numeric_string); source.append(" * result, \n");
00455           source.append("          unsigned int start_result, \n");
00456           source.append("          unsigned int inc_result) \n");
00457           source.append("{ \n");
00458           source.append("  tmp_buffer[get_local_id(0)] = vec1[get_global_id(0)]; \n");
00459 
00460           source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
00461           source.append("  { \n");
00462           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00463           source.append("    if (get_local_id(0) < stride) \n");
00464           source.append("      tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
00465           source.append("  } \n");
00466           source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
00467 
00468           source.append("  if (get_local_id(0) == 0) \n");
00469           source.append("    result[start_result + inc_result * get_group_id(0)] = tmp_buffer[0]; \n");
00470           source.append("} \n");
00471 
00472         }
00473 
00474         template <typename StringType>
00475         void generate_sum(StringType & source, std::string const & numeric_string)
00476         {
00477           // sums the array 'vec1' and writes to result. Makes use of a single work-group only.
00478           source.append("__kernel void sum( \n");
00479           source.append("          __global "); source.append(numeric_string); source.append(" * vec1, \n");
00480           source.append("          unsigned int start1, \n");
00481           source.append("          unsigned int inc1, \n");
00482           source.append("          unsigned int size1, \n");
00483           source.append("          unsigned int option,  \n"); //0: use fmax, 1: just sum, 2: sum and return sqrt of sum
00484           source.append("          __local "); source.append(numeric_string); source.append(" * tmp_buffer, \n");
00485           source.append("          __global "); source.append(numeric_string); source.append(" * result) \n");
00486           source.append("{ \n");
00487           source.append("  "); source.append(numeric_string); source.append(" thread_sum = 0; \n");
00488           source.append("  "); source.append(numeric_string); source.append(" tmp = 0; \n");
00489           source.append("  for (unsigned int i = get_local_id(0); i<size1; i += get_local_size(0)) \n");
00490           source.append("  { \n");
00491           source.append("    if (option > 0) \n");
00492           source.append("      thread_sum += vec1[i*inc1+start1]; \n");
00493           source.append("    else \n");
00494           source.append("    { \n");
00495           source.append("      tmp = vec1[i*inc1+start1]; \n");
00496           source.append("      tmp = (tmp < 0) ? -tmp : tmp; \n");
00497           source.append("      thread_sum = (thread_sum > tmp) ? thread_sum : tmp; \n");
00498           source.append("    } \n");
00499           source.append("  } \n");
00500 
00501           source.append("  tmp_buffer[get_local_id(0)] = thread_sum; \n");
00502 
00503           source.append("  for (unsigned int stride = get_local_size(0)/2; stride > 0; stride /= 2) \n");
00504           source.append("  { \n");
00505           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00506           source.append("    if (get_local_id(0) < stride) \n");
00507           source.append("    { \n");
00508           source.append("      if (option > 0) \n");
00509           source.append("        tmp_buffer[get_local_id(0)] += tmp_buffer[get_local_id(0) + stride]; \n");
00510           source.append("      else \n");
00511           source.append("        tmp_buffer[get_local_id(0)] = (tmp_buffer[get_local_id(0)] > tmp_buffer[get_local_id(0) + stride]) ? tmp_buffer[get_local_id(0)] : tmp_buffer[get_local_id(0) + stride]; \n");
00512           source.append("    } \n");
00513           source.append("  } \n");
00514           source.append("  barrier(CLK_LOCAL_MEM_FENCE); \n");
00515 
00516           source.append("  if (get_global_id(0) == 0) \n");
00517           source.append("  { \n");
00518           if (numeric_string == "float" || numeric_string == "double")
00519           {
00520             source.append("    if (option == 2) \n");
00521             source.append("      *result = sqrt(tmp_buffer[0]); \n");
00522             source.append("    else \n");
00523           }
00524           source.append("      *result = tmp_buffer[0]; \n");
00525           source.append("  } \n");
00526           source.append("} \n");
00527 
00528         }
00529 
00530         template <typename StringType>
00531         void generate_index_norm_inf(StringType & source, std::string const & numeric_string)
00532         {
00533           //index_norm_inf:
00534           source.append("unsigned int index_norm_inf_impl( \n");
00535           source.append("          __global const "); source.append(numeric_string); source.append(" * vec, \n");
00536           source.append("          unsigned int start1, \n");
00537           source.append("          unsigned int inc1, \n");
00538           source.append("          unsigned int size1, \n");
00539           source.append("          __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
00540           source.append("          __local unsigned int * index_buffer) \n");
00541           source.append("{ \n");
00542           //step 1: fill buffer:
00543           source.append("  "); source.append(numeric_string); source.append(" cur_max = 0; \n");
00544           source.append("  "); source.append(numeric_string); source.append(" tmp; \n");
00545           source.append("  for (unsigned int i = get_global_id(0); i < size1; i += get_global_size(0)) \n");
00546           source.append("  { \n");
00547           if (numeric_string == "float" || numeric_string == "double")
00548             source.append("    tmp = fabs(vec[i*inc1+start1]); \n");
00549           else
00550             source.append("    tmp = abs(vec[i*inc1+start1]); \n");
00551           source.append("    if (cur_max < tmp) \n");
00552           source.append("    { \n");
00553           source.append("      entry_buffer[get_global_id(0)] = tmp; \n");
00554           source.append("      index_buffer[get_global_id(0)] = i; \n");
00555           source.append("      cur_max = tmp; \n");
00556           source.append("    } \n");
00557           source.append("  } \n");
00558 
00559           //step 2: parallel reduction:
00560           source.append("  for (unsigned int stride = get_global_size(0)/2; stride > 0; stride /= 2) \n");
00561           source.append("  { \n");
00562           source.append("    barrier(CLK_LOCAL_MEM_FENCE); \n");
00563           source.append("    if (get_global_id(0) < stride) \n");
00564           source.append("   { \n");
00565           //find the first occurring index
00566           source.append("      if (entry_buffer[get_global_id(0)] < entry_buffer[get_global_id(0)+stride]) \n");
00567           source.append("      { \n");
00568           source.append("        index_buffer[get_global_id(0)] = index_buffer[get_global_id(0)+stride]; \n");
00569           source.append("        entry_buffer[get_global_id(0)] = entry_buffer[get_global_id(0)+stride]; \n");
00570           source.append("      } \n");
00571           source.append("    } \n");
00572           source.append("  } \n");
00573           source.append(" \n");
00574           source.append("  return index_buffer[0]; \n");
00575           source.append("} \n");
00576 
00577           source.append("__kernel void index_norm_inf( \n");
00578           source.append("          __global "); source.append(numeric_string); source.append(" * vec, \n");
00579           source.append("          unsigned int start1, \n");
00580           source.append("          unsigned int inc1, \n");
00581           source.append("          unsigned int size1, \n");
00582           source.append("          __local "); source.append(numeric_string); source.append(" * entry_buffer, \n");
00583           source.append("          __local unsigned int * index_buffer, \n");
00584           source.append("          __global unsigned int * result) \n");
00585           source.append("{ \n");
00586           source.append("  entry_buffer[get_global_id(0)] = 0; \n");
00587           source.append("  index_buffer[get_global_id(0)] = 0; \n");
00588           source.append("  unsigned int tmp = index_norm_inf_impl(vec, start1, inc1, size1, entry_buffer, index_buffer); \n");
00589           source.append("  if (get_global_id(0) == 0) *result = tmp; \n");
00590           source.append("} \n");
00591 
00592         }
00593 
00594 
00596 
00597         // main kernel class
00599         template <class TYPE>
00600         struct vector
00601         {
00602           static std::string program_name()
00603           {
00604             return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector";
00605           }
00606 
00607           static void init(viennacl::ocl::context & ctx)
00608           {
00609             viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx);
00610             std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
00611 
00612             static std::map<cl_context, bool> init_done;
00613             if (!init_done[ctx.handle().get()])
00614             {
00615               std::string source;
00616               source.reserve(8192);
00617 
00618               viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
00619 
00620               // fully parametrized kernels:
00621               generate_avbv(source, numeric_string);
00622 
00623               // kernels with mostly predetermined skeleton:
00624               generate_plane_rotation(source, numeric_string);
00625               generate_vector_swap(source, numeric_string);
00626               generate_assign_cpu(source, numeric_string);
00627 
00628               generate_inner_prod(source, numeric_string, 1);
00629               generate_norm(source, numeric_string);
00630               generate_sum(source, numeric_string);
00631               generate_index_norm_inf(source, numeric_string);
00632 
00633               std::string prog_name = program_name();
00634               #ifdef VIENNACL_BUILD_INFO
00635               std::cout << "Creating program " << prog_name << std::endl;
00636               #endif
00637               ctx.add_program(source, prog_name);
00638               init_done[ctx.handle().get()] = true;
00639             } //if
00640           } //init
00641         };
00642 
00643         // class with kernels for multiple inner products.
00645         template <class TYPE>
00646         struct vector_multi_inner_prod
00647         {
00648           static std::string program_name()
00649           {
00650             return viennacl::ocl::type_to_string<TYPE>::apply() + "_vector_multi";
00651           }
00652 
00653           static void init(viennacl::ocl::context & ctx)
00654           {
00655             viennacl::ocl::DOUBLE_PRECISION_CHECKER<TYPE>::apply(ctx);
00656             std::string numeric_string = viennacl::ocl::type_to_string<TYPE>::apply();
00657 
00658             static std::map<cl_context, bool> init_done;
00659             if (!init_done[ctx.handle().get()])
00660             {
00661               std::string source;
00662               source.reserve(8192);
00663 
00664               viennacl::ocl::append_double_precision_pragma<TYPE>(ctx, source);
00665 
00666               generate_inner_prod(source, numeric_string, 2);
00667               generate_inner_prod(source, numeric_string, 3);
00668               generate_inner_prod(source, numeric_string, 4);
00669               generate_inner_prod(source, numeric_string, 8);
00670 
00671               generate_inner_prod_sum(source, numeric_string);
00672 
00673               std::string prog_name = program_name();
00674               #ifdef VIENNACL_BUILD_INFO
00675               std::cout << "Creating program " << prog_name << std::endl;
00676               #endif
00677               ctx.add_program(source, prog_name);
00678               init_done[ctx.handle().get()] = true;
00679             } //if
00680           } //init
00681         };
00682 
00683       }  // namespace kernels
00684     }  // namespace opencl
00685   }  // namespace linalg
00686 }  // namespace viennacl
00687 #endif
00688