ViennaCL - The Vienna Computing Library  1.5.2
viennacl/generator/vector_reduction.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_GENERATOR_GENERATE_VECTOR_REDUCTION_HPP
00002 #define VIENNACL_GENERATOR_GENERATE_VECTOR_REDUCTION_HPP
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00021 
00027 #include <vector>
00028 
00029 #include "viennacl/scheduler/forwards.h"
00030 
00031 #include "viennacl/generator/mapped_objects.hpp"
00032 #include "viennacl/generator/helpers.hpp"
00033 #include "viennacl/generator/utils.hpp"
00034 
00035 #include "viennacl/generator/profile_base.hpp"
00036 
00037 #include "viennacl/tools/tools.hpp"
00038 
00039 namespace viennacl{
00040 
00041   namespace generator{
00042 
00044     class vector_reduction : public profile_base{
00045 
00046         vcl_size_t lmem_used(vcl_size_t scalartype_size) const {
00047           return m_*(k_+1)*scalartype_size;
00048         }
00049 
00050       public:
00052         vector_reduction(unsigned int vectorization, unsigned int m, unsigned int k, unsigned int num_groups) : profile_base(vectorization, m, k, 1), m_(m), k_(k), num_groups_(num_groups){ }
00053 
00054 
00055         static std::string csv_format() {
00056           return "Vec,M,K,NumGroups";
00057         }
00058 
00059         std::string csv_representation() const{
00060           std::ostringstream oss;
00061           oss << vector_size_
00062                  << "," << m_
00063                  << "," << k_
00064                  << "," << num_groups_;
00065           return oss.str();
00066         }
00067 
00068         unsigned int m() const { return m_; }
00069 
00070         unsigned int k() const { return k_; }
00071 
00072         unsigned int num_groups() const { return num_groups_; }
00073 
00074         void configure_range_enqueue_arguments(vcl_size_t kernel_id, statements_type  const & statements, viennacl::ocl::kernel & kernel, unsigned int & n_arg)  const{
00075 
00076           configure_local_sizes(kernel, kernel_id);
00077           kernel.global_work_size(0,m_*num_groups_);
00078           kernel.global_work_size(1,k_);
00079 
00080 
00081           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00082             scheduler::statement::container_type exprs = it->first.array();
00083             for(scheduler::statement::container_type::iterator iit = exprs.begin() ; iit != exprs.end() ; ++iit){
00084               if(iit->op.type==scheduler::OPERATION_BINARY_MAT_VEC_PROD_TYPE){
00085                 scheduler::statement_node const * current_node = &(*iit);
00086                 //The LHS of the prod is a matrix
00087                 if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
00088                 {
00089                   kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
00090                   kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
00091                   return;
00092                 }
00093                 else{
00094                   //The LHS of the prod is a matrix expression
00095                   current_node = &exprs[current_node->lhs.node_index];
00096                   if(current_node->lhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
00097                   {
00098                     kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
00099                     kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
00100                     return;
00101                   }
00102                   else if(current_node->rhs.type_family==scheduler::MATRIX_TYPE_FAMILY)
00103                   {
00104                     kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size1_fun())));
00105                     kernel.arg(n_arg++, cl_uint(utils::call_on_matrix(current_node->lhs, utils::internal_size2_fun())));
00106                     return;
00107                   }
00108                   else{
00109                     assert(false && bool("unexpected expression tree"));
00110                   }
00111                 }
00112                 return;
00113               }
00114             }
00115           }
00116         }
00117 
00118         void kernel_arguments(statements_type  const & /*statements*/, std::string & arguments_string) const{
00119           arguments_string += detail::generate_value_kernel_argument("unsigned int", "M");
00120           arguments_string += detail::generate_value_kernel_argument("unsigned int", "N");
00121         }
00122 
00123       private:
00124         void core(vcl_size_t /*kernel_id*/, utils::kernel_generation_stream& stream, statements_type const & statements, std::vector<detail::mapping_type> const & mapping) const {
00125 
00126           std::vector<detail::mapped_vector_reduction*> exprs;
00127           for(std::vector<detail::mapping_type>::const_iterator it = mapping.begin() ; it != mapping.end() ; ++it){
00128             for(detail::mapping_type::const_iterator iit = it->begin() ; iit != it->end() ; ++iit){
00129               if(detail::mapped_vector_reduction * p = dynamic_cast<detail::mapped_vector_reduction*>(iit->second.get()))
00130                 exprs.push_back(p);
00131               if(detail::mapped_matrix * p = dynamic_cast<detail::mapped_matrix*>(iit->second.get()))
00132                 p->bind_sizes("M","N");
00133             }
00134           }
00135 
00136           vcl_size_t lsize1 = m_;
00137           vcl_size_t lsize2 = k_+1;
00138           std::string scalartype = "float";
00139           bool is_lhs_transposed = false;
00140           if(exprs.front()->root_node().lhs.type_family==scheduler::COMPOSITE_OPERATION_FAMILY)
00141             if(exprs.front()->statement().array()[exprs.front()->root_node().lhs.node_index].op.type==scheduler::OPERATION_UNARY_TRANS_TYPE)
00142               is_lhs_transposed = true;
00143 
00144           std::string size1 = "M", size2 = "N";
00145           if(is_lhs_transposed)
00146             std::swap(size1, size2);
00147 
00148           for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
00149             stream << "__local " <<  (*it)->scalartype() << " buf" << std::distance(exprs.begin(), it) << '[' << lsize1*lsize2 << "];" << std::endl;
00150           }
00151 
00152           stream << "unsigned int lid0 = get_local_id(0);" << std::endl;
00153           stream << "unsigned int lid1 = get_local_id(1);" << std::endl;
00154 
00155 
00156           stream << "for(unsigned int r = get_global_id(0) ; r < " << size1 << " ; r += get_global_size(0)){" << std::endl;
00157           stream.inc_tab();
00158 
00159           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k)
00160             stream << scalartype << " sum" << k << " = 0;" << std::endl;
00161 
00162           stream << "for(unsigned int c = get_local_id(1) ; c < " << size2 << " ; c += get_local_size(1)){" << std::endl;
00163           stream.inc_tab();
00164 
00165           std::set<std::string>  fetched;
00166 
00167           for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
00168             viennacl::scheduler::statement const & statement = (*it)->statement();
00169             viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
00170             if(is_lhs_transposed)
00171               detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("c", "r"),vector_size_,stream,(*it)->mapping());
00172             else
00173               detail::fetch_all_lhs(fetched,statement,root_node, std::make_pair("r", "c"),vector_size_,stream,(*it)->mapping());
00174 
00175             detail::fetch_all_rhs(fetched,statement,root_node, std::make_pair("c", "0"),vector_size_,stream,(*it)->mapping());
00176           }
00177 
00178 
00179           //Update sums;
00180           for(std::vector<detail::mapped_vector_reduction*>::iterator it = exprs.begin() ; it != exprs.end() ; ++it){
00181             viennacl::scheduler::statement const & statement = (*it)->statement();
00182             viennacl::scheduler::statement_node const & root_node = (*it)->root_node();
00183             std::string str;
00184             detail::generate_all_lhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
00185             str += "*";
00186             detail::generate_all_rhs(statement,root_node,std::make_pair("i","0"),-1,str,(*it)->mapping());
00187             stream << " sum" << std::distance(exprs.begin(),it) << " += "  << str << ";" << std::endl;
00188           }
00189 
00190 
00191           stream.dec_tab();
00192           stream << "}" << std::endl;
00193 
00194           for(vcl_size_t k = 0 ; k < exprs.size() ; ++k){
00195             stream << "buf" << k << "[lid0*" << lsize2 << "+ lid1] = sum" << k << ";" << std::endl;
00196           }
00197 
00198           for(unsigned int stride = k_/2 ; stride>1 ; stride /=2){
00199             stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
00200             stream <<  "if(lid1 < " << stride << ")" ;
00201             stream << "{" << std::endl;
00202             stream.inc_tab();
00203 
00204             for(vcl_size_t i = 0 ; i < exprs.size() ; ++i)
00205               stream << "buf" << i << "[lid0*" << lsize2 << "+ lid1] += buf" << i << "[lid0*" << lsize2 << "+ lid1 + " << stride << "];" << std::endl;
00206 
00207             stream.dec_tab();
00208             stream << "}" << std::endl;
00209           }
00210 
00211 
00212           stream << "barrier(CLK_LOCAL_MEM_FENCE); " << std::endl;
00213           stream <<  "if(lid1 == 0)" ;
00214           stream << "{" << std::endl;
00215           stream.inc_tab();
00216           for(vcl_size_t i = 0 ; i < exprs.size() ; ++i){
00217             stream << "buf" << i << "[lid0*" << lsize2 << "] += buf" << i << "[lid0*" << lsize2 << "+ 1];" << std::endl;
00218             exprs[i]->access_name("buf"+utils::to_string(i)+"[lid0*"+utils::to_string(lsize2)+"]");
00219           }
00220           vcl_size_t i = 0;
00221           for(statements_type::const_iterator it = statements.begin() ; it != statements.end() ; ++it){
00222             std::string str;
00223             detail::traverse(it->first, it->second, detail::expression_generation_traversal(std::make_pair("r","0"), -1, str, mapping[i++]), false);
00224             stream << str << ";" << std::endl;
00225           }
00226           stream.dec_tab();
00227           stream << "}" << std::endl;
00228 
00229 
00230           stream.dec_tab();
00231           stream << "}" << std::endl;
00232 
00233         }
00234 
00235       private:
00236         unsigned int m_;
00237         unsigned int k_;
00238         unsigned int num_groups_;
00239     };
00240   }
00241 }
00242 
00243 #endif