ViennaCL - The Vienna Computing Library  1.6.2
Free open-source GPU-accelerated linear algebra and solver library.
 All Classes Namespaces Files Functions Variables Typedefs Enumerations Enumerator Friends Macros Pages
dense_blas.cpp
Go to the documentation of this file.
1 /* =========================================================================
2  Copyright (c) 2010-2014, Institute for Microelectronics,
3  Institute for Analysis and Scientific Computing,
4  TU Wien.
5  Portions of this software are copyright by UChicago Argonne, LLC.
6 
7  -----------------
8  ViennaCL - The Vienna Computing Library
9  -----------------
10 
11  Project Head: Karl Rupp rupp@iue.tuwien.ac.at
12 
13  (A list of authors and contributors can be found in the PDF manual)
14 
15  License: MIT (X11), see file LICENSE in the base directory
16 ============================================================================= */
17 
#include "benchmark-utils.hpp"

#include "viennacl/scalar.hpp"
#include "viennacl/matrix.hpp"
#include "viennacl/vector.hpp"

#include "viennacl/linalg/inner_prod.hpp"
#include "viennacl/linalg/prod.hpp"
#include "viennacl/linalg/lu.hpp"

#include <cstddef>
#include <iomanip>
#include <iostream>
#include <string>
#include <vector>
#include <stdlib.h>
31 
32 template<class T, class F>
34 {
35  std::vector<T> cM(M.internal_size());
36  for (std::size_t i = 0; i < M.size1(); ++i)
37  for (std::size_t j = 0; j < M.size2(); ++j)
38  cM[F::mem_index(i, j, M.internal_size1(), M.internal_size2())] = T(rand())/T(RAND_MAX);
39  viennacl::fast_copy(&cM[0],&cM[0] + cM.size(),M);
40 }
41 
42 template<class T>
44 {
45  std::vector<T> cx(x.internal_size());
46  for (std::size_t i = 0; i < cx.size(); ++i)
47  cx[i] = T(rand())/T(RAND_MAX);
48  viennacl::fast_copy(&cx[0], &cx[0] + cx.size(), x.begin());
49 }
50 
51 template<class T>
52 void bench(size_t BLAS1_N, size_t BLAS2_M, size_t BLAS2_N, size_t BLAS3_M, size_t BLAS3_N, size_t BLAS3_K, std::string const & prefix)
53 {
57  using viennacl::trans;
58 
59  Timer timer;
60  double time_previous, time_spent;
61  size_t Nruns;
62  double time_per_benchmark = 1;
63 
64 #define BENCHMARK_OP(OPERATION, NAME, PERF, INDEX) \
65  OPERATION; \
66  viennacl::backend::finish();\
67  timer.start(); \
68  Nruns = 0; \
69  time_spent = 0; \
70  while (time_spent < time_per_benchmark) \
71  { \
72  time_previous = timer.get(); \
73  OPERATION; \
74  viennacl::backend::finish(); \
75  time_spent += timer.get() - time_previous; \
76  Nruns+=1; \
77  } \
78  time_spent/=(double)Nruns; \
79  std::cout << prefix << NAME " : " << PERF << " " INDEX << std::endl; \
80 
81  //BLAS1
82  {
84  T alpha = (T)2.4;
85  viennacl::vector<T> x(BLAS1_N);
86  viennacl::vector<T> y(BLAS1_N);
87  viennacl::vector<T> z(BLAS1_N);
88 
89  init_random(x);
90  init_random(y);
91  init_random(z);
92 
93  BENCHMARK_OP(x = y, "COPY", std::setprecision(3) << double(2*BLAS1_N*sizeof(T))/time_spent * 1e-9, "GB/s")
94  BENCHMARK_OP(x = y + alpha*x, "AXPY", std::setprecision(3) << double(3*BLAS1_N*sizeof(T))/time_spent * 1e-9, "GB/s")
95  BENCHMARK_OP(s = inner_prod(x, y), "DOT", std::setprecision(3) << double(2*BLAS1_N*sizeof(T))/time_spent * 1e-9, "GB/s")
96  }
97 
98 
99  //BLAS2
100  {
101  viennacl::matrix<T,viennacl::column_major> A(BLAS2_M, BLAS2_N);
102  viennacl::vector<T> x(BLAS2_N);
103  viennacl::vector<T> y(BLAS2_M);
104  init_random(A);
105  init_random(x);
106  init_random(y);
107 
108  BENCHMARK_OP(y = prod(A, x), "GEMV-N", std::setprecision(3) << double((BLAS3_M + BLAS3_N + BLAS3_M*BLAS3_N)*sizeof(T))/time_spent * 1e-9, "GB/s")
109  BENCHMARK_OP(x = prod(trans(A), y), "GEMV-T", std::setprecision(3) << double((BLAS3_M + BLAS3_N + BLAS3_M*BLAS3_N)*sizeof(T))/time_spent * 1e-9, "GB/s")
110  }
111 
112  //BLAS3
113  {
114  viennacl::matrix<T,viennacl::column_major> C(BLAS3_M, BLAS3_N);
115  viennacl::matrix<T,viennacl::column_major> A(BLAS3_M, BLAS3_K);
116  viennacl::matrix<T,viennacl::column_major> B(BLAS3_K, BLAS3_N);
119  init_random(A);
120  init_random(B);
121 
122  BENCHMARK_OP(C = prod(A, B), "GEMM-NN", double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");
123  BENCHMARK_OP(C = prod(A, trans(BT)), "GEMM-NT", double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");
124  BENCHMARK_OP(C = prod(trans(AT), B), "GEMM-TN", double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");
125  BENCHMARK_OP(C = prod(trans(AT), trans(BT)), "GEMM-TT", double(2*BLAS3_M*BLAS3_N*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");
126  //BENCHMARK_OP(lu_factorize(A), "LU-FACTORIZE", double(2*BLAS3_M*BLAS3_K*BLAS3_K)/time_spent*1e-9, "GFLOPs/s");
127  }
128 
129 
130 }
131 
132 int main()
133 {
134 #ifdef VIENNACL_WITH_OPENCL
135  std::cout << std::endl;
136  std::cout << "----------------------------------------------" << std::endl;
137  std::cout << " Device Info" << std::endl;
138  std::cout << "----------------------------------------------" << std::endl;
139  std::cout << std::endl;
140  std::cout << viennacl::ocl::current_device().info() << std::endl;
141  std::cout << std::endl;
142 #endif
143 
144  std::size_t BLAS1_N = 10000000;
145 
146  std::size_t BLAS2_M = 3840;
147  std::size_t BLAS2_N = 3840;
148 
149  std::size_t BLAS3_M = 1976;
150  std::size_t BLAS3_N = 1976;
151  std::size_t BLAS3_K = 1976;
152 
153  std::cout << "Benchmark : BLAS" << std::endl;
154  std::cout << "----------------" << std::endl;
155  bench<float>(BLAS1_N, BLAS2_M, BLAS2_N, BLAS3_M, BLAS3_N, BLAS3_K, "s");
156  std::cout << "----" << std::endl;
157 #ifdef VIENNACL_WITH_OPENCL
159 #endif
160  bench<double>(BLAS1_N, BLAS2_M, BLAS2_N, BLAS3_M, BLAS3_N, BLAS3_K, "d");
161 }
void init_random(viennacl::matrix< T, F > &M)
Definition: dense_blas.cpp:33
This class represents a single scalar value on the GPU and behaves mostly like a built-in scalar type...
Definition: forwards.h:226
viennacl::enable_if< viennacl::is_any_sparse_matrix< M1 >::value, matrix_expression< const M1, const M1, op_trans > >::type trans(const M1 &mat)
Returns an expression template class representing a transposed matrix.
void bench(size_t BLAS1_N, size_t BLAS2_M, size_t BLAS2_N, size_t BLAS3_M, size_t BLAS3_N, size_t BLAS3_K, std::string const &prefix)
Definition: dense_blas.cpp:52
size_type internal_size() const
Returns the total amount of allocated memory in multiples of sizeof(NumericT)
Definition: matrix_def.hpp:233
void trans(matrix_expression< const matrix_base< NumericT, SizeT, DistanceT >, const matrix_base< NumericT, SizeT, DistanceT >, op_trans > const &proxy, matrix_base< NumericT > &temp_trans)
Generic interface for matrix-vector and matrix-matrix products. See viennacl/linalg/vector_operations...
Implementation of the dense matrix class.
A dense matrix class.
Definition: forwards.h:374
viennacl::enable_if< viennacl::is_stl< typename viennacl::traits::tag_of< VectorT1 >::type >::value, typename VectorT1::value_type >::type inner_prod(VectorT1 const &v1, VectorT2 const &v2)
Definition: inner_prod.hpp:89
viennacl::ocl::device const & current_device()
Convenience function for returning the active device in the current context.
Definition: backend.hpp:351
Generic interface for the computation of inner products. See viennacl/linalg/vector_operations.hpp for implementations.
std::string info(vcl_size_t indent=0, char indent_char= ' ') const
Returns an info string with a few properties of the device. Use full_info() to get all details...
Definition: device.hpp:995
#define BENCHMARK_OP(OPERATION, NAME, PERF, INDEX)
VectorT prod(std::vector< std::vector< T, A1 >, A2 > const &matrix, VectorT const &vector)
Definition: prod.hpp:91
iterator begin()
Returns an iterator pointing to the beginning of the vector (STL like)
bool double_support() const
ViennaCL convenience function: Returns true if the device supports double precision.
Definition: device.hpp:956
size_type size2() const
Returns the number of columns.
Definition: matrix_def.hpp:217
void prod(const MatrixT1 &A, bool transposed_A, const MatrixT2 &B, bool transposed_B, MatrixT3 &C, ScalarT alpha, ScalarT beta)
Implementations of LU factorization for row-major and column-major dense matrices.
size_type size1() const
Returns the number of rows.
Definition: matrix_def.hpp:215
Proxy classes for vectors.
Proxy classes for matrices.
The vector type with operator-overloads and proxy classes is defined here. Linear algebra operations ...
size_type internal_size2() const
Returns the internal number of columns. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:231
int main()
Definition: dense_blas.cpp:132
size_type internal_size1() const
Returns the internal number of rows. Usually required for launching OpenCL kernels only...
Definition: matrix_def.hpp:229
size_type internal_size() const
Returns the internal length of the vector, which is given by size() plus the extra memory due to padding.
Definition: vector_def.hpp:120
void lu_factorize(matrix< NumericT, viennacl::row_major > &A)
LU factorization of a row-major dense matrix.
Definition: lu.hpp:42
void fast_copy(const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_begin, const const_vector_iterator< SCALARTYPE, ALIGNMENT > &gpu_end, CPU_ITERATOR cpu_begin)