ViennaCL - The Vienna Computing Library  1.5.2
viennacl/linalg/opencl/kernels/matrix.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
00002 #define VIENNACL_LINALG_OPENCL_KERNELS_MATRIX_HPP
00003 
00004 #include "viennacl/tools/tools.hpp"
00005 #include "viennacl/ocl/kernel.hpp"
00006 #include "viennacl/ocl/platform.hpp"
00007 #include "viennacl/ocl/utils.hpp"
00008 
00011 namespace viennacl
00012 {
00013   namespace linalg
00014   {
00015     namespace opencl
00016     {
00017       namespace kernels
00018       {
00019 
00021 
00023         enum ambm_scalar_type
00024         {
00025           VIENNACL_AMBM_NONE = 0, // vector does not exist/contribute
00026           VIENNACL_AMBM_CPU,
00027           VIENNACL_AMBM_GPU
00028         };
00029 
00031         struct ambm_config
00032         {
00033           ambm_config() : with_stride_and_range(true), is_row_major(true), a(VIENNACL_AMBM_CPU), b(VIENNACL_AMBM_NONE) {}
00034 
00035           bool with_stride_and_range;
00036           bool is_row_major;
00037           std::string      assign_op;
00038           ambm_scalar_type a;
00039           ambm_scalar_type b;
00040         };
00041 
00042         // just returns the for-loop
00043         template <typename StringType>
00044         void generate_ambm_impl2(StringType & source, ambm_config const & cfg, bool mult_alpha, bool mult_beta)
00045         {
00046           if (cfg.is_row_major)
00047           {
00048             source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
00049             source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
00050             source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00051             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00052           }
00053           else
00054           {
00055             source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
00056             source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
00057             source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00058             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00059           }
00060 
00061           if (cfg.with_stride_and_range)
00062           {
00063             if (cfg.is_row_major)
00064               source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] ");
00065             else
00066               source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] ");
00067             source.append(cfg.assign_op);
00068             if (cfg.is_row_major)
00069               source.append(" B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] ");
00070             else
00071               source.append(" B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) * B_internal_size1] ");
00072 
00073             if (mult_alpha)
00074               source.append("* alpha ");
00075             else
00076               source.append("/ alpha ");
00077             if (cfg.b != VIENNACL_AMBM_NONE)
00078             {
00079               if (cfg.is_row_major)
00080                 source.append("+ C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)] ");
00081               else
00082                 source.append("+ C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) * C_internal_size1] ");
00083               if (mult_beta)
00084                 source.append("* beta");
00085               else
00086                 source.append("/ beta");
00087             }
00088           }
00089           else
00090           {
00091             if (cfg.is_row_major)
00092               source.append("    A[row * A_internal_size2 + col] ");
00093             else
00094               source.append("    A[row + col * A_internal_size1] ");
00095             source.append(cfg.assign_op);
00096             if (cfg.is_row_major)
00097               source.append(" B[row * B_internal_size2 + col] ");
00098             else
00099               source.append(" B[row + col * B_internal_size1] ");
00100 
00101             if (mult_alpha)
00102               source.append("* alpha ");
00103             else
00104               source.append("/ alpha ");
00105             if (cfg.b != VIENNACL_AMBM_NONE)
00106             {
00107               if (cfg.is_row_major)
00108                 source.append("+ C[row * C_internal_size2 + col] ");
00109               else
00110                 source.append("+ C[row + col * C_internal_size2] ");
00111               if (mult_beta)
00112                 source.append("* beta");
00113               else
00114                 source.append("/ beta");
00115             }
00116           }
00117           source.append("; \n");
00118         }
00119 
00120         template <typename StringType>
00121         void generate_ambm_impl(StringType & source, std::string const & numeric_string, ambm_config const & cfg)
00122         {
00123           source.append("__kernel void am");
00124           if (cfg.b != VIENNACL_AMBM_NONE)
00125             source.append("bm");
00126           if (cfg.assign_op != "=")
00127             source.append("_m");
00128 
00129           if (cfg.a == VIENNACL_AMBM_CPU)
00130             source.append("_cpu");
00131           else if (cfg.a == VIENNACL_AMBM_GPU)
00132             source.append("_gpu");
00133 
00134           if (cfg.b == VIENNACL_AMBM_CPU)
00135             source.append("_cpu");
00136           else if (cfg.b == VIENNACL_AMBM_GPU)
00137             source.append("_gpu");
00138           source.append("( \n");
00139           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00140           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00141           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00142           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00143           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00144           if (cfg.a == VIENNACL_AMBM_CPU)
00145           {
00146             source.append("  "); source.append(numeric_string); source.append(" fac2, \n");
00147           }
00148           else if (cfg.a == VIENNACL_AMBM_GPU)
00149           {
00150             source.append("  __global "); source.append(numeric_string); source.append(" * fac2, \n");
00151           }
00152           source.append("  unsigned int options2, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
00153           source.append("  __global const "); source.append(numeric_string); source.append(" * B, \n");
00154           source.append("  unsigned int B_start1, unsigned int B_start2, \n");
00155           source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
00156           source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2");
00157 
00158           if (cfg.b != VIENNACL_AMBM_NONE)
00159           {
00160             source.append(", \n\n");
00161             if (cfg.b == VIENNACL_AMBM_CPU)
00162             {
00163               source.append("  "); source.append(numeric_string); source.append(" fac3, \n");
00164             }
00165             else if (cfg.b == VIENNACL_AMBM_GPU)
00166             {
00167               source.append("  __global "); source.append(numeric_string); source.append(" * fac3, \n");
00168             }
00169             source.append("  unsigned int options3, \n");  // 0: no action, 1: flip sign, 2: take inverse, 3: flip sign and take inverse
00170             source.append("  __global const "); source.append(numeric_string); source.append(" * C, \n");
00171             source.append("  unsigned int C_start1, unsigned int C_start2, \n");
00172             source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
00173             source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2 \n");
00174           }
00175           source.append(") { \n");
00176 
00177           if (cfg.a == VIENNACL_AMBM_CPU)
00178           {
00179             source.append("  "); source.append(numeric_string); source.append(" alpha = fac2; \n");
00180           }
00181           else if (cfg.a == VIENNACL_AMBM_GPU)
00182           {
00183             source.append("  "); source.append(numeric_string); source.append(" alpha = fac2[0]; \n");
00184           }
00185           source.append("  if (options2 & (1 << 0)) \n");
00186           source.append("    alpha = -alpha; \n");
00187           source.append(" \n");
00188 
00189           if (cfg.b == VIENNACL_AMBM_CPU)
00190           {
00191             source.append("  "); source.append(numeric_string); source.append(" beta = fac3; \n");
00192           }
00193           else if (cfg.b == VIENNACL_AMBM_GPU)
00194           {
00195             source.append("  "); source.append(numeric_string); source.append(" beta = fac3[0]; \n");
00196           }
00197           if (cfg.b != VIENNACL_AMBM_NONE)
00198           {
00199             source.append("  if (options3 & (1 << 0)) \n");
00200             source.append("    beta = -beta; \n");
00201             source.append(" \n");
00202           }
00203           source.append("  if (options2 & (1 << 1)) { \n");
00204           if (cfg.b != VIENNACL_AMBM_NONE)
00205           {
00206             source.append("    if (options3 & (1 << 1)) {\n");
00207             generate_ambm_impl2(source, cfg, false, false);
00208             source.append("    } else {\n");
00209             generate_ambm_impl2(source, cfg, false, true);
00210             source.append("    } \n");
00211           }
00212           else
00213             generate_ambm_impl2(source, cfg, false, true);
00214           source.append("  } else { \n");
00215           if (cfg.b != VIENNACL_AMBM_NONE)
00216           {
00217             source.append("    if (options3 & (1 << 1)) {\n");
00218             generate_ambm_impl2(source, cfg, true, false);
00219             source.append("    } else {\n");
00220             generate_ambm_impl2(source, cfg, true, true);
00221             source.append("    } \n");
00222           }
00223           else
00224             generate_ambm_impl2(source, cfg, true, true);
00225           source.append("  } \n");
00226           source.append("} \n");
00227         }
00228 
00229         template <typename StringType>
00230         void generate_ambm(StringType & source, std::string const & numeric_string, bool is_row_major)
00231         {
00232           ambm_config cfg;
00233           cfg.assign_op = "=";
00234           cfg.with_stride_and_range = true;
00235           cfg.is_row_major = is_row_major;
00236 
00237           // am
00238           cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00239           cfg.b = VIENNACL_AMBM_NONE; cfg.a = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00240 
00241           // ambm
00242           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00243           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00244           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00245           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00246 
00247           // ambm_m
00248           cfg.assign_op = "+=";
00249 
00250           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00251           cfg.a = VIENNACL_AMBM_CPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00252           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_CPU; generate_ambm_impl(source, numeric_string, cfg);
00253           cfg.a = VIENNACL_AMBM_GPU; cfg.b = VIENNACL_AMBM_GPU; generate_ambm_impl(source, numeric_string, cfg);
00254         }
00255 
00256         template <typename StringType>
00257         void generate_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
00258         {
00259           source.append("__kernel void assign_cpu( \n");
00260           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00261           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00262           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00263           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00264           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00265           source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
00266           source.append("{ \n");
00267           if (is_row_major)
00268           {
00269             source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
00270             source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
00271             source.append("  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00272             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00273             source.append("      A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = alpha; \n");
00274           }
00275           else
00276           {
00277             source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
00278             source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
00279             source.append("  for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00280             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00281             source.append("      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
00282           }
00283           source.append("} \n");
00284         }
00285 
00286         template <typename StringType>
00287         void generate_diagonal_assign_cpu(StringType & source, std::string const & numeric_string, bool is_row_major)
00288         {
00289           source.append("__kernel void diagonal_assign_cpu( \n");
00290           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00291           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00292           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00293           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00294           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00295           source.append("  "); source.append(numeric_string); source.append(" alpha) \n");
00296           source.append("{ \n");
00297           source.append("  for (unsigned int idx = get_global_id(0); idx < min(A_size1, A_size2); idx += get_global_size(0))\n");
00298           if (is_row_major)
00299             source.append("    A[(idx * A_inc1 + A_start1) * A_internal_size2 + (idx * A_inc2 + A_start2)] = alpha; \n");
00300           else
00301             source.append("    A[(idx * A_inc1 + A_start1) + (idx * A_inc2 + A_start2) *  A_internal_size1] = alpha; \n");
00302           source.append("} \n");
00303         }
00304 
00305         template <typename StringType>
00306         void generate_element_op(StringType & source, std::string const & numeric_string, bool is_row_major)
00307         {
00308           source.append("__kernel void element_op( \n");
00309           source.append("  __global "); source.append(numeric_string); source.append(" * A, \n");
00310           source.append("  unsigned int A_start1, unsigned int A_start2, \n");
00311           source.append("  unsigned int A_inc1,   unsigned int A_inc2, \n");
00312           source.append("  unsigned int A_size1,  unsigned int A_size2, \n");
00313           source.append("  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");
00314           source.append("  __global "); source.append(numeric_string); source.append(" * B, \n");
00315           source.append("  unsigned int B_start1, unsigned int B_start2, \n");
00316           source.append("  unsigned int B_inc1,   unsigned int B_inc2, \n");
00317           source.append("  unsigned int B_internal_size1,  unsigned int B_internal_size2, \n");
00318           source.append("  __global "); source.append(numeric_string); source.append(" * C, \n");
00319           source.append("  unsigned int C_start1, unsigned int C_start2, \n");
00320           source.append("  unsigned int C_inc1,   unsigned int C_inc2, \n");
00321           source.append("  unsigned int C_internal_size1,  unsigned int C_internal_size2, \n");
00322           source.append("  unsigned int op_type) \n"); //0: product, 1: division, 2: pow
00323           source.append("{ \n");
00324           if (is_row_major)
00325           {
00326             source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0);\n");
00327             source.append("  unsigned int col_gid = get_global_id(0) % get_local_size(0);\n");
00328             source.append("  if (op_type == 2) {");
00329             if (numeric_string == "float" || numeric_string == "double")
00330             {
00331               source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00332               source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00333               source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
00334               source.append("        pow(B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)], \n");
00335               source.append("            C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]); \n");
00336             }
00337             source.append("  } else if (op_type == 1) {");
00338             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00339             source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00340             source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
00341             source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] / \n");
00342             source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
00343             source.append("  } else if (op_type == 0) {");
00344             source.append("    for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0))\n");
00345             source.append("      for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0))\n");
00346             source.append("        A[(row * A_inc1 + A_start1) * A_internal_size2 + (col * A_inc2 + A_start2)] = \n");
00347             source.append("        B[(row * B_inc1 + B_start1) * B_internal_size2 + (col * B_inc2 + B_start2)] * \n");
00348             source.append("        C[(row * C_inc1 + C_start1) * C_internal_size2 + (col * C_inc2 + C_start2)]; \n");
00349             source.append("  }");
00350           }
00351           else
00352           {
00353             source.append("  unsigned int row_gid = get_global_id(0) % get_local_size(0);\n");
00354             source.append("  unsigned int col_gid = get_global_id(0) / get_local_size(0);\n");
00355             source.append("  if (op_type == 2) {");
00356             if (numeric_string == "float" || numeric_string == "double")
00357             {
00358               source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00359               source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00360               source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
00361               source.append("          pow(B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1], \n");
00362               source.append("              C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]); \n");
00363             }
00364             source.append("  } else if (op_type == 1) {");
00365             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00366             source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00367             source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] =  \n");
00368             source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] / \n");
00369             source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
00370             source.append("  } else if (op_type == 0) {");
00371             source.append("    for (unsigned int col = col_gid; col < A_size2; col += get_num_groups(0))\n");
00372             source.append("      for (unsigned int row = row_gid; row < A_size1; row += get_local_size(0))\n");
00373             source.append("        A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) *  A_internal_size1] = \n");
00374             source.append("          B[(row * B_inc1 + B_start1) + (col * B_inc2 + B_start2) *  B_internal_size1] * \n");
00375             source.append("          C[(row * C_inc1 + C_start1) + (col * C_inc2 + C_start2) *  C_internal_size1]; \n");
00376             source.append("  }");
00377           }
00378           source.append("} \n");
00379         }
00380 
00381 
00382         template <typename StringType>
00383         void generate_fft(StringType & source, std::string const & numeric_string, bool is_row_major)
00384         {
00385           // naive fourier transform (quadratic complexity, use for reference only)
00386           source.append("__kernel void fft_direct(__global "); source.append(numeric_string); source.append("2 *input, \n");
00387           source.append("                         __global "); source.append(numeric_string); source.append("2 *output, \n");
00388           source.append("                         unsigned int size, \n");
00389           source.append("                         unsigned int stride, \n");
00390           source.append("                         unsigned int batch_num, \n");
00391           source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
00392           source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
00393           source.append(" \n");
00394           source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
00395           source.append("        for(unsigned int k = get_global_id(0); k < size; k += get_global_size(0)) { \n");
00396           source.append("            "); source.append(numeric_string); source.append("2 f = 0.0f; \n");
00397           source.append(" \n");
00398           source.append("            for(unsigned int n = 0; n < size; n++) { \n");
00399           source.append("                "); source.append(numeric_string); source.append("2 in = ");
00400           if (is_row_major)
00401             source.append("input[batch_id * stride + n]; \n"); //input index here
00402           else
00403             source.append("input[n * stride + batch_id]; \n"); //input index here
00404           source.append(" \n");
00405           source.append("                "); source.append(numeric_string); source.append(" sn, cs; \n");
00406           source.append("                "); source.append(numeric_string); source.append(" arg = sign * 2 * NUM_PI * k / size * n; \n");
00407           source.append("                sn = sincos(arg, &cs); \n");
00408           source.append(" \n");
00409           source.append("                "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
00410           source.append("                f = f + ("); source.append(numeric_string); source.append("2)(in.x * ex.x - in.y * ex.y, in.x * ex.y + in.y * ex.x); \n");
00411           source.append("            } \n");
00412           source.append(" \n");
00413           if (is_row_major)
00414             source.append("            output[batch_id * stride + k] = f; \n"); // output index here
00415           else
00416             source.append("            output[k * stride + batch_id] = f; \n"); // output index here
00417           source.append("        } \n");
00418           source.append("    } \n");
00419           source.append("} \n");
00420 
00421           source.append(" \n"); 
00422 
00423           source.append("__kernel void fft_radix2(__global "); source.append(numeric_string); source.append("2* input, \n");
00424           source.append("                         unsigned int s, \n");
00425           source.append("                         unsigned int bit_size, \n");
00426           source.append("                         unsigned int size, \n");
00427           source.append("                         unsigned int stride, \n");
00428           source.append("                         unsigned int batch_num, \n");
00429           source.append("                         "); source.append(numeric_string); source.append(" sign) { \n");
00430           source.append(" \n");
00431           source.append("    unsigned int ss = 1 << s; \n");
00432           source.append("    unsigned int half_size = size >> 1; \n");
00433           source.append(" \n");
00434           source.append("    "); source.append(numeric_string); source.append(" cs, sn; \n");
00435           source.append("    const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
00436           source.append(" \n");
00437           source.append("    unsigned int glb_id = get_global_id(0); \n");
00438           source.append("    unsigned int glb_sz = get_global_size(0); \n");
00439 
00440           source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
00441           source.append("        for(unsigned int tid = glb_id; tid < half_size; tid += glb_sz) { \n");
00442           source.append("            unsigned int group = (tid & (ss - 1)); \n");
00443           source.append("            unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
00444 
00445           if (is_row_major)
00446           {
00447             source.append("            unsigned int offset = batch_id * stride + pos; \n");
00448             source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
00449             source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss]; \n");//index
00450           }
00451           else
00452           {
00453             source.append("            unsigned int offset = pos * stride + batch_id; \n");
00454             source.append("            "); source.append(numeric_string); source.append("2 in1 = input[offset]; \n"); //index
00455             source.append("            "); source.append(numeric_string); source.append("2 in2 = input[offset + ss * stride]; \n");//index
00456           }
00457 
00458           source.append("            "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
00459 
00460           source.append("            sn = sincos(arg, &cs); \n");
00461 
00462           source.append("            "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
00463 
00464           source.append("            "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
00465 
00466           if (is_row_major)
00467             source.append("            input[offset + ss] = in1 - tmp; \n");//index
00468           else
00469             source.append("            input[offset + ss * stride] = in1 - tmp; \n");//index
00470           source.append("            input[offset] = in1 + tmp; \n");//index
00471           source.append("        } \n");
00472           source.append("    } \n");
00473           source.append("} \n");
00474 
00475           source.append(" \n"); 
00476 
00477           source.append(" unsigned int get_reorder_num(unsigned int v, unsigned int bit_size) { \n");
00478           source.append("     v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
00479           source.append("     v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
00480           source.append("     v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
00481           source.append("     v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
00482           source.append("     v = (v >> 16) | (v << 16); \n");
00483           source.append("  \n");
00484           source.append("     v = v >> (32 - bit_size); \n");
00485           source.append("  \n");
00486           source.append("     return v; \n");
00487           source.append(" } \n");
00488 
00489           source.append(" __kernel void fft_radix2_local(__global "); source.append(numeric_string); source.append("2* input, \n");
00490           source.append("                                 __local "); source.append(numeric_string); source.append("2* lcl_input, \n");
00491           source.append("                                 unsigned int bit_size, \n");
00492           source.append("                                 unsigned int size, \n");
00493           source.append("                                 unsigned int stride, \n");
00494           source.append("                                 unsigned int batch_num, \n");
00495           source.append("                                 "); source.append(numeric_string); source.append(" sign) { \n");
00496 
00497           source.append("     unsigned int grp_id = get_group_id(0); \n");
00498           source.append("     unsigned int grp_num = get_num_groups(0); \n");
00499 
00500           source.append("     unsigned int lcl_sz = get_local_size(0); \n");
00501           source.append("     unsigned int lcl_id = get_local_id(0); \n");
00502           source.append("     const "); source.append(numeric_string); source.append(" NUM_PI = 3.14159265358979323846; \n");
00503 
00504           source.append("     for(unsigned int batch_id = grp_id; batch_id < batch_num; batch_id += grp_num) { \n");
00505                   //unsigned int base_offset = stride * batch_id; \n");
00506                   //copy chunk of global memory to local \n");
00507           source.append("         for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
00508           source.append("             unsigned int v = get_reorder_num(p, bit_size); \n");
00509           if (is_row_major)
00510             source.append("             lcl_input[v] = input[batch_id * stride + p]; \n"); //index
00511           else
00512             source.append("             lcl_input[v] = input[p * stride + batch_id]; \n"); //index
00513           source.append("         } \n");
00514 
00515           source.append("         barrier(CLK_LOCAL_MEM_FENCE); \n");
00516 
00517                   //performs Cooley-Tukey FFT on local array
00518           source.append("         for(unsigned int s = 0; s < bit_size; s++) { \n");
00519           source.append("             unsigned int ss = 1 << s; \n");
00520 
00521           source.append("             "); source.append(numeric_string); source.append(" cs, sn; \n");
00522 
00523           source.append("             for(unsigned int tid = lcl_id; tid < size; tid += lcl_sz) { \n");
00524           source.append("                 unsigned int group = (tid & (ss - 1)); \n");
00525           source.append("                 unsigned int pos = ((tid >> s) << (s + 1)) + group; \n");
00526 
00527           source.append("                 "); source.append(numeric_string); source.append("2 in1 = lcl_input[pos]; \n");
00528           source.append("                 "); source.append(numeric_string); source.append("2 in2 = lcl_input[pos + ss]; \n");
00529 
00530           source.append("                 "); source.append(numeric_string); source.append(" arg = group * sign * NUM_PI / ss; \n");
00531 
00532           source.append("                 sn = sincos(arg, &cs); \n");
00533           source.append("                 "); source.append(numeric_string); source.append("2 ex = ("); source.append(numeric_string); source.append("2)(cs, sn); \n");
00534 
00535           source.append("                 "); source.append(numeric_string); source.append("2 tmp = ("); source.append(numeric_string); source.append("2)(in2.x * ex.x - in2.y * ex.y, in2.x * ex.y + in2.y * ex.x); \n");
00536 
00537           source.append("                 lcl_input[pos + ss] = in1 - tmp; \n");
00538           source.append("                 lcl_input[pos] = in1 + tmp; \n");
00539           source.append("             } \n");
00540 
00541           source.append("             barrier(CLK_LOCAL_MEM_FENCE); \n");
00542           source.append("         } \n");
00543 
00544                   //copy local array back to global memory
00545           source.append("         for(unsigned int p = lcl_id; p < size; p += lcl_sz) { \n");
00546           if (is_row_major)
00547             source.append("             input[batch_id * stride + p] = lcl_input[p]; \n");//index
00548           else
00549             source.append("             input[p * stride + batch_id] = lcl_input[p]; \n");//index
00550           source.append("         } \n");
00551           source.append("     } \n");
00552           source.append(" } \n");
00553 
00554           source.append(" \n"); 
00555 
00556           //
00557           // Performs reordering of input data in bit-reversal order
00558           // Probably it's better to do in host side,
00559           //
00560           source.append("unsigned int get_reorder_num_2(unsigned int v, unsigned int bit_size) { \n");
00561           source.append("    v = ((v >> 1) & 0x55555555) | ((v & 0x55555555) << 1); \n");
00562           source.append("    v = ((v >> 2) & 0x33333333) | ((v & 0x33333333) << 2); \n");
00563           source.append("    v = ((v >> 4) & 0x0F0F0F0F) | ((v & 0x0F0F0F0F) << 4); \n");
00564           source.append("    v = ((v >> 8) & 0x00FF00FF) | ((v & 0x00FF00FF) << 8); \n");
00565           source.append("    v = (v >> 16) | (v << 16); \n");
00566 
00567           source.append("    v = v >> (32 - bit_size); \n");
00568 
00569           source.append("    return v; \n");
00570           source.append("} \n");
00571 
00572           source.append("__kernel void fft_reorder(__global "); source.append(numeric_string); source.append("2* input, \n");
00573           source.append("                          unsigned int bit_size, \n");
00574           source.append("                          unsigned int size, \n");
00575           source.append("                          unsigned int stride, \n");
00576           source.append("                          int batch_num) { \n");
00577 
00578           source.append("    unsigned int glb_id = get_global_id(0); \n");
00579           source.append("    unsigned int glb_sz = get_global_size(0); \n");
00580 
00581           source.append("    for(unsigned int batch_id = 0; batch_id < batch_num; batch_id++) { \n");
00582           source.append("        for(unsigned int i = glb_id; i < size; i += glb_sz) { \n");
00583           source.append("            unsigned int v = get_reorder_num_2(i, bit_size); \n");
00584 
00585           source.append("            if(i < v) {\n");
00586           if (is_row_major)
00587           {
00588             source.append("                "); source.append(numeric_string); source.append("2 tmp = input[batch_id * stride + i]; \n"); // index
00589             source.append("                input[batch_id * stride + i] = input[batch_id * stride + v]; \n"); //index
00590             source.append("                input[batch_id * stride + v] = tmp; \n"); //index
00591           }
00592           else
00593           {
00594             source.append("                "); source.append(numeric_string); source.append("2 tmp = input[i * stride + batch_id]; \n"); // index
00595             source.append("                input[i * stride + batch_id] = input[v * stride + batch_id]; \n"); //index
00596             source.append("                input[v * stride + batch_id] = tmp; \n"); //index
00597           }
00598           source.append("            } \n");
00599           source.append("        } \n");
00600           source.append("    } \n");
00601           source.append("} \n");
00602         }
00603 
/** @brief Appends the OpenCL source of the in-place 'lu_factorize' kernel to 'source'.
  *
  * The generated kernel iterates sequentially over the rows i and, for each row, over the
  * pivots k < i. The scaling of entry (i,k) is done by the work item with global id 0 only;
  * the subsequent update of the remaining entries of row i is distributed over all work items.
  *
  * @param source          String-like object (must provide append()) receiving the kernel text
  * @param numeric_string  OpenCL name of the scalar type, inserted verbatim into the kernel
  * @param is_row_major    Selects row-major (true) or column-major (false) index arithmetic
  */
template <typename StringType>
void generate_lu(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  source.append("__kernel void lu_factorize( \n");
  source.append("          __global "); source.append(numeric_string); source.append(" * matrix, \n");
  source.append("          unsigned int matrix_rows, \n");
  source.append("          unsigned int matrix_cols, \n");
  source.append("          unsigned int matrix_internal_rows, \n");
  source.append("          unsigned int matrix_internal_cols) \n");
  source.append("{ \n");
  source.append("  "); source.append(numeric_string); source.append(" temp; \n");

  if (is_row_major)
  {
    // rowi/rowk cache the linear offsets of rows i and k (row stride is matrix_internal_cols)
    source.append("  unsigned rowi; \n");
    source.append("  unsigned rowk; \n");
    source.append("  for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append("  { \n");
    source.append("    rowi = i * matrix_internal_cols; \n");
    source.append("    for (unsigned int k=0; k<i; ++k) \n");
    source.append("    { \n");
    source.append("      rowk = k * matrix_internal_cols; \n");
    source.append("      if (get_global_id(0) == 0) \n");
    source.append("        matrix[rowi + k] /= matrix[rowk + k]; \n");

    source.append("      barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append("      temp = matrix[rowi + k]; \n");

    //parallel subtraction:
    // j is a column index here, hence it is bounded by the number of columns
    // (same bound as in the column-major branch below).
    source.append("      for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append("        matrix[rowi + j] -= temp * matrix[rowk + j]; \n");
  }
  else
  {
    // column-major: entry (r,c) lives at r + c*matrix_internal_rows
    source.append("      for (unsigned int i=1; i<matrix_rows; ++i) \n");
    source.append("      { \n");
    source.append("        for (unsigned int k=0; k<i; ++k) \n");
    source.append("        { \n");

    source.append("          if (get_global_id(0) == 0) \n");
    source.append("            matrix[i + k*matrix_internal_rows] /= matrix[k + k*matrix_internal_rows]; \n");

    source.append("          barrier(CLK_GLOBAL_MEM_FENCE); \n");
    source.append("          temp = matrix[i + k*matrix_internal_rows]; \n");

    //parallel subtraction:
    source.append("          for (unsigned int j=k+1 + get_global_id(0); j<matrix_cols; j += get_global_size(0)) \n");
    source.append("            matrix[i + j*matrix_internal_rows] -= temp * matrix[k + j*matrix_internal_rows]; \n");
  }

  // Close the k-loop, the i-loop and the kernel body (shared by both layouts).
  // Note: no newline is emitted after these braces.
  source.append("   }");
  source.append("  }");
  source.append("}");
}
00657 
00658 
/** @brief Appends the OpenCL kernel 'scaled_rank1_update_cpu' or 'scaled_rank1_update_gpu' to 'source'.
  *
  * The generated kernel adds tmp * vec2[col] to every entry A(row, col), where tmp is
  * vec1[row] multiplied (or divided, if bit 1 of 'options2' is set) by alpha.
  * alpha is negated first if bit 0 of 'options2' is set.
  *
  * @param source          String-like object (must provide append()) receiving the kernel text
  * @param numeric_string  OpenCL name of the scalar type, inserted verbatim into the kernel
  * @param is_row_major    Selects row-major (true) or column-major (false) addressing of A
  * @param alpha_on_cpu    true: alpha is passed by value ('_cpu' kernel); false: alpha is read from a __global buffer ('_gpu' kernel)
  */
template <typename StringType>
void generate_scaled_rank1_update(StringType & source, std::string const & numeric_string, bool is_row_major, bool alpha_on_cpu)
{
  // kernel name followed by the matrix arguments
  source.append("__kernel void scaled_rank1_update_");
  source.append(alpha_on_cpu ? "cpu" : "gpu");
  source.append("( \n"
                "  __global ");
  source.append(numeric_string);
  source.append(" * A, \n"
                "  unsigned int A_start1, unsigned int A_start2, \n"
                "  unsigned int A_inc1,   unsigned int A_inc2, \n"
                "  unsigned int A_size1,  unsigned int A_size2, \n"
                "  unsigned int A_internal_size1,  unsigned int A_internal_size2, \n");

  // scaling factor: plain value for the cpu variant, pointer into global memory otherwise
  source.append(alpha_on_cpu ? "  " : "  __global const ");
  source.append(numeric_string);
  source.append(alpha_on_cpu ? " val, \n" : " *val, \n");

  // option flags and the two vector arguments
  source.append("  unsigned int options2, \n"
                "  __global const ");
  source.append(numeric_string);
  source.append(" * vec1, \n"
                "  unsigned int start1, \n"
                "  unsigned int inc1, \n"
                "  unsigned int size1, \n"
                "  __global const ");
  source.append(numeric_string);
  source.append(" * vec2, \n"
                "  unsigned int start2, \n"
                "  unsigned int inc2, \n"
                "  unsigned int size2) \n"
                "{ \n"
                "  ");

  // kernel body: fetch alpha, apply the sign flag, then loop over rows and columns
  source.append(numeric_string);
  source.append(alpha_on_cpu ? " alpha = val; \n" : " alpha = val[0]; \n");
  source.append("  if (options2 & (1 << 0)) \n"
                "    alpha = -alpha; \n"
                "  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"
                "  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"
                "  for (unsigned int row = row_gid; row < A_size1; row += get_num_groups(0)) \n"
                "  { \n"
                "    ");
  source.append(numeric_string);
  source.append(" tmp = vec1[row * inc1 + start1];"
                "    tmp = (options2 & (1 << 1)) ? tmp / alpha : tmp * alpha;"
                "    for (unsigned int col = col_gid; col < A_size2; col += get_local_size(0)) \n");

  // only the linear index into A depends on the storage layout
  source.append(is_row_major
                ? "      A[(row * A_inc1 + A_start1) * A_internal_size2 + col * A_inc2 + A_start2] += tmp * vec2[col * inc2 + start2]; \n"
                : "      A[(row * A_inc1 + A_start1) + (col * A_inc2 + A_start2) * A_internal_size1] += tmp * vec2[col * inc2 + start2]; \n");

  source.append("  } \n"
                "} \n");
}
00710 
/** @brief Appends the OpenCL kernel 'trans_vec_mul' to 'source'.
  *
  * The generated kernel writes one entry of 'result' per column of A; each entry is the
  * sum over the rows of A of A(col, row) * v[col] in the kernel's naming, i.e. the
  * matrix is accessed in transposed fashion.
  *
  * Row-major layout: each work item computes complete result entries on its own.
  * Column-major layout: a work group shares one result entry and combines the partial
  * sums through the __local buffer 'work'.
  * NOTE(review): the halving reduction looks like it assumes a power-of-two work group size - confirm against the launch configuration.
  *
  * @param source          String-like object (must provide append()) receiving the kernel text
  * @param numeric_string  OpenCL name of the scalar type, inserted verbatim into the kernel
  * @param is_row_major    Selects row-major (true) or column-major (false) addressing of A
  */
template <typename StringType>
void generate_trans_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  // kernel signature
  source.append("__kernel void trans_vec_mul( \n"
                "          __global const ");
  source.append(numeric_string);
  source.append(" * A, \n"
                "          unsigned int A_row_start, unsigned int A_col_start, \n"
                "          unsigned int A_row_inc, unsigned int A_col_inc, \n"
                "          unsigned int A_row_size, unsigned int A_col_size, \n"
                "          unsigned int A_internal_rows, unsigned int A_internal_cols, \n"
                "          __global const ");
  source.append(numeric_string);
  source.append(" * v, \n"
                "          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n"
                "          __global ");
  source.append(numeric_string);
  source.append(" * result, \n"
                "          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n"
                "          __local ");
  source.append(numeric_string);
  source.append(" * work) \n"
                "{ \n");

  if (is_row_major)
  {
    // one work item per result entry, serial accumulation
    source.append("  for (unsigned int row = get_global_id(0); row < A_col_size; row += get_global_size(0)) \n"
                  "  { \n"
                  "    ");
    source.append(numeric_string);
    source.append(" dot_prod = 0; \n"
                  "    for (unsigned int col = 0; col < A_row_size; ++col) \n"
                  "      dot_prod += A[(row * A_col_inc + A_col_start) + (col * A_row_inc + A_row_start) * A_internal_cols] * v[v_start + v_inc * col]; \n"
                  "    result[row * result_inc + result_start] = dot_prod; \n");
  }
  else
  {
    // one work group per result entry, partial sums reduced in local memory
    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"
                  "  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"
                  "  unsigned int lid = get_local_id(0); \n"
                  "  for (unsigned int row = row_gid; row < A_col_size; row += get_num_groups(0)) \n"
                  "  { \n"
                  "    ");
    source.append(numeric_string);
    source.append(" dot_prod = 0; \n"
                  "    for (unsigned int col = col_gid; col < A_row_size; col+=get_local_size(0)) \n"
                  "      dot_prod += A[(row * A_col_inc + A_col_start) * A_internal_rows + col * A_row_inc + A_row_start] * v[v_start + v_inc * col]; \n"
                  "    work[lid] = dot_prod; \n"
                  "    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n"
                  "      barrier(CLK_LOCAL_MEM_FENCE); \n"
                  "      if(lid < stride) \n"
                  "        work[lid] += work[lid+stride]; \n"
                  "    } \n"
                  "    if(lid == 0) \n"
                  "      result[row * result_inc + result_start] = work[0]; \n");
  }

  // closes the row loop opened in either branch, then the kernel body
  source.append("  } \n"
                "} \n");
}
00760 
/** @brief Appends the OpenCL kernel 'triangular_substitute_inplace' to 'source'.
  *
  * The generated kernel processes the rows of A one after another (A is required to be
  * square) and overwrites v in place. Bits of the kernel argument 'options':
  *   bit 0: unit diagonal (skip the division by the diagonal entry),
  *   bit 1: access A in transposed fashion,
  *   bit 2: lower solve (rows are visited front to back instead of back to front).
  *
  * @param source          String-like object (must provide append()) receiving the kernel text
  * @param numeric_string  OpenCL name of the scalar type, inserted verbatim into the kernel
  * @param is_row_major    Selects row-major (true) or column-major (false) addressing of A
  */
template <typename StringType>
void generate_triangular_substitute_inplace(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  // kernel signature
  source.append("__kernel void triangular_substitute_inplace( \n"
                "          __global ");
  source.append(numeric_string);
  source.append(" * A, \n"
                "          unsigned int A_start1, unsigned int A_start2, \n"
                "          unsigned int A_inc1,   unsigned int A_inc2, \n"
                "          unsigned int A_size1,  unsigned int A_size2, \n"
                "          unsigned int A_internal_size1,  unsigned int A_internal_size2, \n"
                "          __global ");
  source.append(numeric_string);
  source.append(" * v, \n"
                "          unsigned int v_start, \n"
                "          unsigned int v_inc, \n"
                "          unsigned int v_size, \n"
                "          unsigned int options) \n"
                "{ \n"
                "  ");

  // option decoding and the head of the sequential row loop
  source.append(numeric_string);
  source.append(" temp; \n"
                "  unsigned int unit_diagonal_flag  = (options & (1 << 0)); \n"
                "  unsigned int transposed_access_A = (options & (1 << 1)); \n"
                "  unsigned int is_lower_solve      = (options & (1 << 2)); \n"
                "  unsigned int row; \n"
                "  for (unsigned int rows_processed = 0; rows_processed < A_size1; ++rows_processed)  \n"   //Note: A required to be square
                "  { \n"
                "    row = is_lower_solve ? rows_processed : ((A_size1 - rows_processed) - 1); \n"
                "    if (!unit_diagonal_flag) \n"
                "    { \n"
                "      barrier(CLK_GLOBAL_MEM_FENCE); \n"
                "      if (get_global_id(0) == 0) \n");

  // division by the diagonal entry A(row, row); only the linear index is layout dependent
  source.append(is_row_major
                ? "        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) * A_internal_size2 + (row * A_inc2 + A_start2)]; \n"
                : "        v[row * v_inc + v_start] /= A[(row * A_inc1 + A_start1) + (row * A_inc2 + A_start2) * A_internal_size1]; \n");

  // elimination loop over the remaining entries of v, distributed over all work items
  source.append("   } \n"
                "    barrier(CLK_GLOBAL_MEM_FENCE); \n"
                "    temp = v[row * v_inc + v_start]; \n"
                "    for (int elim = (is_lower_solve ? (row + get_global_id(0) + 1) : get_global_id(0)); \n"
                "             elim < (is_lower_solve ? A_size1 : row); \n"
                "             elim += get_global_size(0)) \n");

  if (is_row_major)
    source.append("      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) * A_internal_size2 + (elim * A_inc2 + A_start2)) \n"
                  "                                                                : ((elim * A_inc1 + A_start1) * A_internal_size2 + (row  * A_inc2 + A_start2))]; \n");
  else
    source.append("      v[elim * v_inc + v_start] -= temp * A[transposed_access_A ? ((row  * A_inc1 + A_start1) + (elim * A_inc2 + A_start2) * A_internal_size1) \n"
                  "                                                                : ((elim * A_inc1 + A_start1) + (row  * A_inc2 + A_start2) * A_internal_size1)]; \n");

  // closes the row loop and the kernel body
  source.append("  } \n"
                "} \n");
}
00814 
/** @brief Appends the OpenCL kernel 'vec_mul' to 'source'.
  *
  * The generated kernel writes one entry of 'result' per row of A, namely the sum over
  * all columns of A(row, col) * v[col].
  *
  * Row-major layout: a work group shares one result entry and combines the partial sums
  * through the __local buffer 'work'.
  * Column-major layout: each work item computes complete result entries on its own.
  * NOTE(review): the halving reduction looks like it assumes a power-of-two work group size - confirm against the launch configuration.
  *
  * @param source          String-like object (must provide append()) receiving the kernel text
  * @param numeric_string  OpenCL name of the scalar type, inserted verbatim into the kernel
  * @param is_row_major    Selects row-major (true) or column-major (false) addressing of A
  */
template <typename StringType>
void generate_vec_mul(StringType & source, std::string const & numeric_string, bool is_row_major)
{
  // kernel signature
  source.append("__kernel void vec_mul( \n"
                "          __global const ");
  source.append(numeric_string);
  source.append(" * A, \n"
                "          unsigned int A_row_start, unsigned int A_col_start, \n"
                "          unsigned int A_row_inc, unsigned int A_col_inc, \n"
                "          unsigned int A_row_size, unsigned int A_col_size, \n"
                "          unsigned int A_internal_rows, unsigned int A_internal_cols, \n"
                "          __global const ");
  source.append(numeric_string);
  source.append(" * v, \n"
                "          unsigned int v_start, unsigned int v_inc, unsigned int v_size, \n"
                "          __global ");
  source.append(numeric_string);
  source.append(" * result, \n"
                "          unsigned int result_start, unsigned int result_inc, unsigned int result_size, \n"
                "          __local ");
  source.append(numeric_string);
  source.append(" * work) \n"
                "{ \n");

  if (is_row_major)
  {
    // one work group per result entry, partial sums reduced in local memory
    source.append("  unsigned int row_gid = get_global_id(0) / get_local_size(0); \n"
                  "  unsigned int col_gid = get_global_id(0) % get_local_size(0); \n"
                  "  unsigned int lid = get_local_id(0); \n"
                  "  for (unsigned int row = row_gid; row < A_row_size; row += get_num_groups(0)) \n"
                  "  { \n"
                  "    ");
    source.append(numeric_string);
    source.append(" dot_prod = 0; \n"
                  "    for (unsigned int col = col_gid; col < A_col_size; col+=get_local_size(0)) \n"
                  "      dot_prod += A[(row * A_row_inc + A_row_start) * A_internal_cols + col * A_col_inc + A_col_start] * v[v_start + v_inc * col]; \n"
                  "    work[lid] = dot_prod; \n"
                  "    for(unsigned int stride=get_local_size(0)/2 ; stride>0 ; stride>>=1){ \n"
                  "      barrier(CLK_LOCAL_MEM_FENCE); \n"
                  "      if(lid < stride) \n"
                  "        work[lid] += work[lid+stride]; \n"
                  "    } \n"
                  "    if(lid == 0) \n"
                  "      result[row * result_inc + result_start] = work[0]; \n");
  }
  else
  {
    // one work item per result entry, serial accumulation
    source.append("    for (unsigned int row = get_global_id(0); row < A_row_size; row += get_global_size(0)) \n"
                  "    { \n"
                  "      ");
    source.append(numeric_string);
    source.append(" dot_prod = 0; \n"
                  "      for (unsigned int col = 0; col < A_col_size; ++col) \n"
                  "        dot_prod += A[(row * A_row_inc + A_row_start) + (col * A_col_inc + A_col_start) * A_internal_rows] * v[v_start + v_inc * col]; \n"
                  "      result[row * result_inc + result_start] = dot_prod; \n");
  }

  // closes the row loop opened in either branch, then the kernel body
  source.append("  } \n"
                "} \n");
}
00865 
00866         namespace detail
00867         {
00868           inline std::string type_to_string(viennacl::row_major)    { return "row"; }
00869           inline std::string type_to_string(viennacl::column_major) { return "col"; }
00870         }
00871 
00873 
00874         // main kernel class
00876         template <typename NumericT, typename F>
00877         struct matrix
00878         {
00879           static std::string program_name()
00880           {
00881             return viennacl::ocl::type_to_string<NumericT>::apply() + "_matrix_" + detail::type_to_string(F());
00882           }
00883 
00884           static void init(viennacl::ocl::context & ctx)
00885           {
00886             viennacl::ocl::DOUBLE_PRECISION_CHECKER<NumericT>::apply(ctx);
00887             std::string numeric_string = viennacl::ocl::type_to_string<NumericT>::apply();
00888             bool is_row_major = viennacl::is_row_major<F>::value;
00889 
00890             static std::map<cl_context, bool> init_done;
00891             if (!init_done[ctx.handle().get()])
00892             {
00893               std::string source;
00894               source.reserve(8192);
00895 
00896               viennacl::ocl::append_double_precision_pragma<NumericT>(ctx, source);
00897 
00898               // fully parametrized kernels:
00899               generate_ambm(source, numeric_string, is_row_major);
00900 
00901               // kernels with mostly predetermined skeleton:
00902               generate_assign_cpu(source, numeric_string, is_row_major);
00903               generate_diagonal_assign_cpu(source, numeric_string, is_row_major);
00904               generate_element_op(source, numeric_string, is_row_major);
00905               generate_scaled_rank1_update(source, numeric_string, is_row_major, true);
00906               generate_scaled_rank1_update(source, numeric_string, is_row_major, false);
00907               generate_trans_vec_mul(source, numeric_string, is_row_major);
00908               generate_vec_mul(source, numeric_string, is_row_major);
00909 
00910               if (numeric_string == "float" || numeric_string == "double")
00911               {
00912                 generate_fft(source, numeric_string, is_row_major);
00913                 generate_lu(source, numeric_string, is_row_major);
00914                 generate_triangular_substitute_inplace(source, numeric_string, is_row_major);
00915               }
00916 
00917               std::string prog_name = program_name();
00918               #ifdef VIENNACL_BUILD_INFO
00919               std::cout << "Creating program " << prog_name << std::endl;
00920               #endif
00921               ctx.add_program(source, prog_name);
00922               init_done[ctx.handle().get()] = true;
00923             } //if
00924           } //init
00925         };
00926 
00927       }  // namespace kernels
00928     }  // namespace opencl
00929   }  // namespace linalg
00930 }  // namespace viennacl
00931 #endif
00932