ViennaCL - The Vienna Computing Library
1.5.2
#ifndef VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_
#define VIENNACL_LINALG_HOST_BASED_SSE_BLAS_HPP_

/* =========================================================================
   Copyright (c) 2010-2014, Institute for Microelectronics,
                            Institute for Analysis and Scientific Computing,
                            TU Wien.
   Portions of this software are copyright by UChicago Argonne, LLC.

                            -----------------
                  ViennaCL - The Vienna Computing Library
                            -----------------

   Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at

   (A list of authors and contributors can be found in the PDF manual)

   License:         MIT (X11), see file LICENSE in the base directory
============================================================================= */

//complex BLAS functions are included but unused in this version of ViennaCL
#if defined VIENNACL_WITH_COMPLEX
#include <complex>
#endif

//defining VIENNACL_WITH_SSE3 adds a slight optimization for complex multiplication using SSE3
#if defined VIENNACL_WITH_SSE3
#include <pmmintrin.h>
#elif defined VIENNACL_WITH_SSE2
#include <emmintrin.h>
#endif

#include <cstddef>
#include <cmath>

namespace viennacl
{
namespace linalg
{
namespace host_based
{

//saxpy, daxpy, caxpy, zaxpy
template <class T> inline void _axpy(const T*, T*, vcl_size_t, T);

//sdot, ddot, cdotu, zdotu
template <class T> inline T _dot (vcl_size_t, const T*, const T*);

//sdot, ddot, cdotc, zdotc
template <class T> inline T _dotc(vcl_size_t, const T*, const T*);

//sswap, dswap, cswap, zswap
template <class T> inline void _swap(vcl_size_t, T*, T*);

//scopy, dcopy, ccopy, zcopy
template <class T> inline void _copy(vcl_size_t, T*, T*);

//snrm2, dnrm2, Euclidean norm of complex vectors
template <class T> inline T _nrm2(const T*, vcl_size_t);

namespace detail
{
  //identity for real types; specialized to std::conj for complex types below
  template <class T> inline T conjIfComplex(T x) { return x; }
}

template <class T>
inline void _axpy(const T* x, T* y, vcl_size_t n, T a)
{
  for (vcl_size_t i = 0; i < n; i++)
    y[i] += a * x[i];
}

template <class T>
inline T _dot(vcl_size_t n, const T* x, const T* y)
{
  T sum(0);
  for (vcl_size_t i = 0; i < n; i++)
    sum += x[i] * y[i];
  return sum;
}

template <class T>
inline T _dotc(vcl_size_t n, const T* x, const T* y)
{
  T sum(0);
  for (vcl_size_t i = 0; i < n; i++)
    sum += detail::conjIfComplex(x[i]) * y[i];
  return sum;
}

template <class T>
inline void _swap(vcl_size_t n, T* sx, T* sy)
{
  T t;
  for (vcl_size_t i = 0; i < n; i++)
  {
    t = sx[i];
    sx[i] = sy[i];
    sy[i] = t;
  }
}

//note: copies cy into cx (the first argument is the destination)
template <class T>
inline void _copy(vcl_size_t n, T* cx, T* cy)
{
  for (vcl_size_t i = 0; i < n; i++)
    cx[i] = cy[i];
}
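/* Illustrative usage sketch (not part of ViennaCL itself): the kernels above
   follow the classic BLAS level-1 calling conventions, e.g.

     #include <vector>
     std::vector<double> x(100, 1.0), y(100, 2.0);
     // y <- 3.0*x + y
     viennacl::linalg::host_based::_axpy(&x[0], &y[0], x.size(), 3.0);
     // unconjugated dot product of x and y
     double d = viennacl::linalg::host_based::_dot(x.size(), &x[0], &y[0]);
*/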
template <class T>
inline T _nrm2(const T* x, vcl_size_t n)
{
  //based on http://www.netlib.org/blas/snrm2.f, but works with std::complex

  if (n < 1)
    return T(0);
  if (n == 1)
    return std::abs(x[0]);
  T scale(0);
  T scaledSquareSum(1);
  for (vcl_size_t i = 0; i < n; i++)
  {
    if (x[i] != T(0))
    {
      T absXi = std::abs(x[i]);
      if (std::abs(x[i]) > std::abs(scale))
      {
        //rescale the running sum so that the new largest element becomes the scale
        T temp = scale / absXi;
        scaledSquareSum = T(1) + scaledSquareSum * temp * temp;
        scale = absXi;
      }
      else
      {
        T temp = absXi / scale;
        scaledSquareSum += temp * temp;
      }
    }
  }
  return scale * std::sqrt(scaledSquareSum);
}

#if defined VIENNACL_WITH_COMPLEX

namespace detail
{
  template <> inline std::complex<double> conjIfComplex(std::complex<double> x) { return conj(x); }
  template <> inline std::complex<float > conjIfComplex(std::complex<float > x) { return conj(x); }
}

template <>
inline std::complex<double> _nrm2(const std::complex<double>* x, vcl_size_t n)
{
  //based on http://www.netlib.org/blas/snrm2.f

  if (n < 1)
    return std::complex<double>(0);
  if (n == 1)
    return std::abs(x[0]);
  double scale = 0.0;
  double scaledSquareSum = 1.0;
  for (vcl_size_t i = 0; i < n; i++)
  {
    if (x[i].real() != 0.0)
    {
      double absXi = std::abs(x[i].real());
      if (absXi > scale)
      {
        double temp = scale / absXi;
        scaledSquareSum = 1.0 + scaledSquareSum * temp * temp;
        scale = absXi;
      }
      else
      {
        double temp = absXi / scale;
        scaledSquareSum += temp * temp;
      }
    }
    if (x[i].imag() != 0.0)
    {
      double absXi = std::abs(x[i].imag());
      if (absXi > scale)
      {
        double temp = scale / absXi;
        scaledSquareSum = 1.0 + scaledSquareSum * temp * temp;
        scale = absXi;
      }
      else
      {
        double temp = absXi / scale;
        scaledSquareSum += temp * temp;
      }
    }
  }
  return std::complex<double>(scale * std::sqrt(scaledSquareSum));
}

template <>
inline std::complex<float> _nrm2(const std::complex<float>* x, vcl_size_t n)
{
  //based on http://www.netlib.org/blas/snrm2.f

  if (n < 1)
    return std::complex<float>(0);
  if (n == 1)
    return std::abs(x[0]);
  float scale = 0.0f;
  float scaledSquareSum = 1.0f;
  for (vcl_size_t i = 0; i < n; i++)
  {
    if (x[i].real() != 0.0f)
    {
      float absXi = std::abs(x[i].real());
      if (absXi > scale)
      {
        float temp = scale / absXi;
        scaledSquareSum = 1.0f + scaledSquareSum * temp * temp;
        scale = absXi;
      }
      else
      {
        float temp = absXi / scale;
        scaledSquareSum += temp * temp;
      }
    }
    if (x[i].imag() != 0.0f)
    {
      float absXi = std::abs(x[i].imag());
      if (absXi > scale)
      {
        float temp = scale / absXi;
        scaledSquareSum = 1.0f + scaledSquareSum * temp * temp;
        scale = absXi;
      }
      else
      {
        float temp = absXi / scale;
        scaledSquareSum += temp * temp;
      }
    }
  }
  return std::complex<float>(scale * std::sqrt(scaledSquareSum));
}

#endif //defined VIENNACL_WITH_COMPLEX
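/* Why _nrm2 uses the rescaled accumulation above: a naive sum of squares
   overflows for values near the top of the floating-point range even when the
   norm itself is representable. Illustrative check (hypothetical test code,
   not part of the library):

     double big[2] = {1e300, 1e300};
     // naive: 1e300*1e300 = 1e600 overflows to inf
     // scaled: scale = 1e300, scaledSquareSum = 2,
     //         so scale*sqrt(scaledSquareSum) = sqrt(2)*1e300
     double nrm = viennacl::linalg::host_based::_nrm2(big, 2);  // ~1.414e300
*/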
#if defined VIENNACL_WITH_SSE2

//saxpy
template <>
inline void _axpy<float>(const float* x, float* y, vcl_size_t n, float a)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(float) != 0)
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  else
  {
    //process unaligned section of arrays
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return;
      y[0] += a * x[0];
      x++;
      y++;
      n--;
    }

    __m128 sum0;
    __m128 sum1;
    __m128 reg0, reg1, reg2, reg3;
    __m128 areg = _mm_set1_ps(a);
    __m128 prod;

    //add floats 8 at a time
    while (n >= 8)
    {
      //read floats into XMM registers (8 from each array)
      reg0 = _mm_load_ps(x + 0);
      reg1 = _mm_load_ps(x + 4);
      reg2 = _mm_load_ps(y + 0);
      reg3 = _mm_load_ps(y + 4);

      //multiply by a and add to y
      prod = _mm_mul_ps(reg0, areg);
      sum0 = _mm_add_ps(prod, reg2);
      prod = _mm_mul_ps(reg1, areg);
      sum1 = _mm_add_ps(prod, reg3);

      //put the sums back into y
      _mm_store_ps(y + 0, sum0);
      _mm_store_ps(y + 4, sum1);

      x += 8;
      y += 8;
      n -= 8;
    }

    //add beyond the last multiple of 8
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  }
}

//daxpy
template <>
inline void _axpy<double>(const double* x, double* y, vcl_size_t n, double a)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(double) != 0)
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  else
  {
    //process unaligned section of arrays
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return;
      y[0] += a * x[0];
      x++;
      y++;
      n--;
    }

    __m128d sum0;
    __m128d sum1;
    __m128d reg0, reg1, reg2, reg3;
    __m128d areg = _mm_set1_pd(a);
    __m128d prod;

    //add doubles 4 at a time
    while (n >= 4)
    {
      //read doubles into XMM registers (4 from each array)
      reg0 = _mm_load_pd(x + 0);
      reg1 = _mm_load_pd(x + 2);
      reg2 = _mm_load_pd(y + 0);
      reg3 = _mm_load_pd(y + 2);

      //multiply by a and add to y
      prod = _mm_mul_pd(reg0, areg);
      sum0 = _mm_add_pd(prod, reg2);
      prod = _mm_mul_pd(reg1, areg);
      sum1 = _mm_add_pd(prod, reg3);

      //put the sums back into y
      _mm_store_pd(y + 0, sum0);
      _mm_store_pd(y + 2, sum1);

      x += 4;
      y += 4;
      n -= 4;
    }

    //add beyond the last multiple of 4
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  }
}
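/* Pattern shared by all SSE kernels in this file, sketched for reference
   (illustration only): the aligned loads/stores (_mm_load_ps, _mm_store_ps,
   and the _pd variants) are valid only on 16-byte boundaries, so each kernel
     1. falls back to plain scalar code when the vector is short (n < 16) or
        the two pointers can never be 16-byte aligned at the same time
        (different addresses modulo 16),
     2. peels leading elements until x is 16-byte aligned,
     3. runs the vectorized main loop, and
     4. finishes the remainder (less than one unrolled step) in scalar code.

     bool sse_usable = ((vcl_size_t)x) % 16 == ((vcl_size_t)y) % 16  //same offset mod 16
                       && ((vcl_size_t)x) % sizeof(float) == 0;      //element-aligned
*/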
//sdot
template <>
inline float _dot<float>(vcl_size_t n, const float* x, const float* y)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(float) != 0)
  {
    float sum = 0;
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];
    return sum;
  }
  else
  {
    //process unaligned section of array
    float sum = 0;
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return sum;
      sum += x[0] * y[0];
      y++;
      x++;
      n--;
    }

    __m128 sumReg = _mm_setzero_ps();
    __m128 reg0, reg1, reg2, reg3;

    //add floats 8 at a time
    while (n >= 8)
    {
      //read floats into XMM registers (8 from each array)
      reg0 = _mm_load_ps(x + 0);
      reg1 = _mm_load_ps(x + 4);
      reg2 = _mm_load_ps(y + 0);
      reg3 = _mm_load_ps(y + 4);

      //multiply floats together
      reg0 = _mm_mul_ps(reg0, reg2);
      reg1 = _mm_mul_ps(reg1, reg3);

      //add to sums
      sumReg = _mm_add_ps(sumReg, reg0);
      sumReg = _mm_add_ps(sumReg, reg1);

      x += 8;
      y += 8;
      n -= 8;
    }

    //add beyond where the inner loop stopped
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];

    //move the sums from the xmm registers to aligned memory on the stack
    float sums[8];
    float* pSums = (float*)((((vcl_size_t)sums) & (~15)) + 16);
    _mm_store_ps(pSums, sumReg);

    return sum + pSums[0] + pSums[1] + pSums[2] + pSums[3];
  }
}

//ddot
template <>
inline double _dot(vcl_size_t n, const double* x, const double* y)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(double) != 0)
  {
    double sum = 0;
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];
    return sum;
  }
  else
  {
    //process unaligned section of array
    double sum = 0;
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return sum;
      sum += x[0] * y[0];
      y++;
      x++;
      n--;
    }

    __m128d sum0 = _mm_setzero_pd();
    __m128d sum1 = _mm_setzero_pd();
    __m128d reg0, reg1, reg2, reg3;

    //add doubles 4 at a time
    while (n >= 4)
    {
      //read doubles into XMM registers (4 from each array)
      reg0 = _mm_load_pd(x + 0);
      reg1 = _mm_load_pd(x + 2);
      reg2 = _mm_load_pd(y + 0);
      reg3 = _mm_load_pd(y + 2);

      //multiply doubles together
      reg0 = _mm_mul_pd(reg0, reg2);
      reg1 = _mm_mul_pd(reg1, reg3);

      //add to sums
      sum0 = _mm_add_pd(sum0, reg0);
      sum1 = _mm_add_pd(sum1, reg1);

      x += 4;
      y += 4;
      n -= 4;
    }

    //add beyond where the inner loop stopped
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];

    //move the sums from the xmm registers to aligned memory on the stack
    double sums[4];
    double* pSums = (double*)((((vcl_size_t)sums) & (~15)) + 16);
    sum0 = _mm_add_pd(sum0, sum1);
    _mm_store_pd(pSums, sum0);

    return sum + pSums[0] + pSums[1];
  }
}

//conjugated dot products are the same as non-conjugated dot products for real numbers
template <> inline float  _dotc<float >(vcl_size_t n, const float*  x, const float*  y) { return _dot(n, x, y); }
template <> inline double _dotc<double>(vcl_size_t n, const double* x, const double* y) { return _dot(n, x, y); }
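/* The "aligned memory on the stack" trick used above, explained (illustration
   only): _mm_store_ps/_mm_store_pd require a 16-byte-aligned destination, and
   a plain local array carries no such guarantee pre-C++11. Rounding the
   array's address down to a multiple of 16 and adding 16 yields an address
   that is 16-byte aligned and still inside the array, provided the array is
   16 bytes larger than the data being stored:

     float sums[8];                                       // 32 bytes, 16 used
     vcl_size_t addr = (vcl_size_t)sums;
     float* p = (float*)((addr & ~(vcl_size_t)15) + 16);  // aligned, in range

   With C++11 one could instead write `alignas(16) float sums[4];`.
*/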
#if defined VIENNACL_WITH_COMPLEX

//caxpy
template <>
inline void _axpy<std::complex<float> >(const std::complex<float>* x, std::complex<float>* y, vcl_size_t n, std::complex<float> a)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(std::complex<float>) != 0)
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  else
  {
    //process unaligned section of arrays
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return;
      y[0] += a * x[0];
      x++;
      y++;
      n--;
    }

    __m128 reg0, reg1, reg2, reg3, reg4;
    __m128 areg0 = _mm_set_ps(a.imag(), a.real(), a.imag(), a.real());
    __m128 areg1 = _mm_set_ps(a.real(), a.imag(), a.real(), a.imag());
#ifndef VIENNACL_WITH_SSE3
    __m128 nreg = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f);
#endif

    //add complex floats 4 at a time
    while (n >= 4)
    {
      //read floats into XMM registers (8 from each array)
      reg0 = _mm_load_ps((float*)(x + 0));
      reg1 = _mm_load_ps((float*)(x + 2));
      reg2 = _mm_load_ps((float*)(y + 0));
      reg3 = _mm_load_ps((float*)(y + 2));

      //do complex multiplication and addition
#ifndef VIENNACL_WITH_SSE3
      reg4 = _mm_shuffle_ps(reg0, reg0, 0xA0);
      reg0 = _mm_shuffle_ps(reg0, reg0, 0xF5);
      reg4 = _mm_mul_ps(reg4, areg0);
      reg0 = _mm_mul_ps(reg0, areg1);
      reg0 = _mm_mul_ps(reg0, nreg);
      reg0 = _mm_add_ps(reg4, reg0);
      reg0 = _mm_add_ps(reg0, reg2);
      reg4 = _mm_shuffle_ps(reg1, reg1, 0xA0);
      reg1 = _mm_shuffle_ps(reg1, reg1, 0xF5);
      reg4 = _mm_mul_ps(reg4, areg0);
      reg1 = _mm_mul_ps(reg1, areg1);
      reg1 = _mm_mul_ps(reg1, nreg);
      reg1 = _mm_add_ps(reg4, reg1);
      reg1 = _mm_add_ps(reg1, reg3);
#else
      reg4 = _mm_moveldup_ps(reg0);
      reg0 = _mm_movehdup_ps(reg0);
      reg4 = _mm_mul_ps(reg4, areg0);
      reg0 = _mm_mul_ps(reg0, areg1);
      reg0 = _mm_addsub_ps(reg4, reg0);
      reg0 = _mm_add_ps(reg0, reg2);
      reg4 = _mm_moveldup_ps(reg1);
      reg1 = _mm_movehdup_ps(reg1);
      reg4 = _mm_mul_ps(reg4, areg0);
      reg1 = _mm_mul_ps(reg1, areg1);
      reg1 = _mm_addsub_ps(reg4, reg1);
      reg1 = _mm_add_ps(reg1, reg3);
#endif
      //put results into y
      _mm_store_ps((float*)(y + 0), reg0);
      _mm_store_ps((float*)(y + 2), reg1);

      x += 4;
      y += 4;
      n -= 4;
    }

    //add beyond the last multiple of 4
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  }
}

//zaxpy
template <>
inline void _axpy<std::complex<double> >(const std::complex<double>* x, std::complex<double>* y, vcl_size_t n, std::complex<double> a)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 || ((vcl_size_t)y) % 16)
    for (vcl_size_t i = 0; i < n; i++)
      y[i] += a * x[i];
  else
  {
    __m128d reg0, reg1, reg2, reg3, reg4;
    __m128d areg0 = _mm_set_pd(a.imag(), a.real());
    __m128d areg1 = _mm_set_pd(a.real(), a.imag());
#ifndef VIENNACL_WITH_SSE3
    __m128d nreg = _mm_set_pd(1.0, -1.0);
#endif

    //add complex doubles 2 at a time
    while (n >= 2)
    {
      //read doubles into XMM registers (4 from each array)
      reg0 = _mm_load_pd((double*)(x + 0));
      reg1 = _mm_load_pd((double*)(x + 1));
      reg2 = _mm_load_pd((double*)(y + 0));
      reg3 = _mm_load_pd((double*)(y + 1));

      //do complex multiplication and addition
#ifndef VIENNACL_WITH_SSE3
      reg4 = _mm_shuffle_pd(reg0, reg0, 0x0);
      reg0 = _mm_shuffle_pd(reg0, reg0, 0x3);
      reg4 = _mm_mul_pd(reg4, areg0);
      reg0 = _mm_mul_pd(reg0, areg1);
      reg0 = _mm_mul_pd(reg0, nreg);
      reg0 = _mm_add_pd(reg4, reg0);
      reg0 = _mm_add_pd(reg0, reg2);
      reg4 = _mm_shuffle_pd(reg1, reg1, 0x0);
      reg1 = _mm_shuffle_pd(reg1, reg1, 0x3);
      reg4 = _mm_mul_pd(reg4, areg0);
      reg1 = _mm_mul_pd(reg1, areg1);
      reg1 = _mm_mul_pd(reg1, nreg);
      reg1 = _mm_add_pd(reg4, reg1);
      reg1 = _mm_add_pd(reg1, reg3);
#else
      reg4 = _mm_shuffle_pd(reg0, reg0, 0x0);
      reg0 = _mm_shuffle_pd(reg0, reg0, 0x3);
      reg4 = _mm_mul_pd(reg4, areg0);
      reg0 = _mm_mul_pd(reg0, areg1);
      reg0 = _mm_addsub_pd(reg4, reg0);
      reg0 = _mm_add_pd(reg0, reg2);
      reg4 = _mm_shuffle_pd(reg1, reg1, 0x0);
      reg1 = _mm_shuffle_pd(reg1, reg1, 0x3);
      reg4 = _mm_mul_pd(reg4, areg0);
      reg1 = _mm_mul_pd(reg1, areg1);
      reg1 = _mm_addsub_pd(reg4, reg1);
      reg1 = _mm_add_pd(reg1, reg3);
#endif
      //put results into y
      _mm_store_pd((double*)(y + 0), reg0);
      _mm_store_pd((double*)(y + 1), reg1);

      x += 2;
      y += 2;
      n -= 2;
    }

    //add beyond the last multiple of 2
    if (n)
      y[0] += a * x[0];
  }
}
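/* The complex multiply in the kernels above, sketched in scalar form
   (illustration only). For a = ar + i*ai and x = xr + i*xi,

     a*x = (ar*xr - ai*xi) + i*(ai*xr + ar*xi)

   The SSE code computes this lane-wise: duplicate xr across lanes
   (_mm_moveldup_ps or a shuffle) and multiply by (ar, ai), duplicate xi
   (_mm_movehdup_ps or a shuffle) and multiply by (ai, ar), then combine with
   _mm_addsub_ps (SSE3) or with a multiply by (-1, 1, -1, 1) followed by an
   add (plain SSE2):

     float re = ar*xr - ai*xi;   // even lane: subtract
     float im = ai*xr + ar*xi;   // odd lane:  add
*/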
//cdotu
template <>
inline std::complex<float> _dot<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(std::complex<float>) != 0)
  {
    std::complex<float> sum(0);
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];
    return sum;
  }
  else
  {
    //process unaligned section of arrays
    std::complex<float> sum(0);
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return sum;
      sum += x[0] * y[0];
      y++;
      x++;
      n--;
    }

    __m128 sumReg = _mm_setzero_ps();
    __m128 reg0, reg1, reg2, reg3, reg4;
#ifndef VIENNACL_WITH_SSE3
    __m128 nreg = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f);
#endif

    //add complex floats 4 at a time
    while (n >= 4)
    {
      //read floats into XMM registers (8 from each array)
      reg0 = _mm_load_ps((float*)(x + 0));
      reg1 = _mm_load_ps((float*)(x + 2));
      reg2 = _mm_load_ps((float*)(y + 0));
      reg3 = _mm_load_ps((float*)(y + 2));

      //multiply complex floats together
#ifndef VIENNACL_WITH_SSE3
      reg4 = _mm_shuffle_ps(reg2, reg2, 0xA0);
      reg2 = _mm_shuffle_ps(reg2, reg2, 0xF5);
      reg4 = _mm_mul_ps(reg4, reg0);
      reg2 = _mm_mul_ps(reg2, reg0);
      reg2 = _mm_shuffle_ps(reg2, reg2, 0xB1);
      reg2 = _mm_mul_ps(reg2, nreg);
      reg0 = _mm_add_ps(reg4, reg2);
      reg4 = _mm_shuffle_ps(reg3, reg3, 0xA0);
      reg3 = _mm_shuffle_ps(reg3, reg3, 0xF5);
      reg4 = _mm_mul_ps(reg4, reg1);
      reg3 = _mm_mul_ps(reg3, reg1);
      reg3 = _mm_shuffle_ps(reg3, reg3, 0xB1);
      reg3 = _mm_mul_ps(reg3, nreg);
      reg1 = _mm_add_ps(reg4, reg3);
#else
      reg4 = _mm_moveldup_ps(reg2);
      reg2 = _mm_movehdup_ps(reg2);
      reg4 = _mm_mul_ps(reg4, reg0);
      reg2 = _mm_mul_ps(reg2, reg0);
      reg2 = _mm_shuffle_ps(reg2, reg2, 0xB1);
      reg0 = _mm_addsub_ps(reg4, reg2);
      reg4 = _mm_moveldup_ps(reg3);
      reg3 = _mm_movehdup_ps(reg3);
      reg4 = _mm_mul_ps(reg4, reg1);
      reg3 = _mm_mul_ps(reg3, reg1);
      reg3 = _mm_shuffle_ps(reg3, reg3, 0xB1);
      reg1 = _mm_addsub_ps(reg4, reg3);
#endif

      //add to sum
      sumReg = _mm_add_ps(sumReg, reg0);
      sumReg = _mm_add_ps(sumReg, reg1);

      x += 4;
      y += 4;
      n -= 4;
    }

    //add beyond where the inner loop stopped
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];

    //move the sums from the xmm registers to aligned memory on the stack
    std::complex<float> sums[4];
    std::complex<float>* pSums = (std::complex<float>*)((((vcl_size_t)sums) & (~15)) + 16);
    pSums[0] = std::complex<float>(0);
    pSums[1] = std::complex<float>(0);
    _mm_store_ps((float*)pSums, sumReg);

    return sum + pSums[0] + pSums[1];
  }
}
//zdotu
template <>
inline std::complex<double> _dot<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 || ((vcl_size_t)y) % 16)
  {
    std::complex<double> sum(0);
    for (vcl_size_t i = 0; i < n; i++)
      sum += x[i] * y[i];
    return sum;
  }
  else
  {
    __m128d sumReg = _mm_setzero_pd();
    __m128d reg0, reg1, reg2, reg3, reg4;
#ifndef VIENNACL_WITH_SSE3
    __m128d nreg = _mm_set_pd(1.0, -1.0);
#endif

    //add complex doubles 2 at a time
    while (n >= 2)
    {
      //read doubles into XMM registers (4 from each array)
      reg0 = _mm_load_pd((double*)(x + 0));
      reg1 = _mm_load_pd((double*)(x + 1));
      reg2 = _mm_load_pd((double*)(y + 0));
      reg3 = _mm_load_pd((double*)(y + 1));

      //multiply complex doubles together
#ifndef VIENNACL_WITH_SSE3
      reg4 = _mm_shuffle_pd(reg2, reg2, 0x0);
      reg2 = _mm_shuffle_pd(reg2, reg2, 0x3);
      reg4 = _mm_mul_pd(reg4, reg0);
      reg2 = _mm_mul_pd(reg2, reg0);
      reg2 = _mm_shuffle_pd(reg2, reg2, 0x1);
      reg2 = _mm_mul_pd(reg2, nreg);
      reg0 = _mm_add_pd(reg4, reg2);
      reg4 = _mm_shuffle_pd(reg3, reg3, 0x0);
      reg3 = _mm_shuffle_pd(reg3, reg3, 0x3);
      reg4 = _mm_mul_pd(reg4, reg1);
      reg3 = _mm_mul_pd(reg3, reg1);
      reg3 = _mm_shuffle_pd(reg3, reg3, 0x1);
      reg3 = _mm_mul_pd(reg3, nreg);
      reg1 = _mm_add_pd(reg4, reg3);
#else
      reg4 = _mm_shuffle_pd(reg2, reg2, 0x0);
      reg2 = _mm_shuffle_pd(reg2, reg2, 0x3);
      reg4 = _mm_mul_pd(reg4, reg0);
      reg2 = _mm_mul_pd(reg2, reg0);
      reg2 = _mm_shuffle_pd(reg2, reg2, 0x1);
      reg0 = _mm_addsub_pd(reg4, reg2);
      reg4 = _mm_shuffle_pd(reg3, reg3, 0x0);
      reg3 = _mm_shuffle_pd(reg3, reg3, 0x3);
      reg4 = _mm_mul_pd(reg4, reg1);
      reg3 = _mm_mul_pd(reg3, reg1);
      reg3 = _mm_shuffle_pd(reg3, reg3, 0x1);
      reg1 = _mm_addsub_pd(reg4, reg3);
#endif

      //add to sum
      sumReg = _mm_add_pd(sumReg, reg0);
      sumReg = _mm_add_pd(sumReg, reg1);

      x += 2;
      y += 2;
      n -= 2;
    }

    //add beyond where the inner loop stopped
    std::complex<double> sum(0);
    if (n)
      sum = x[0] * y[0];

    //move the sum from the xmm register to aligned memory on the stack
    std::complex<double> sums[2];
    std::complex<double>* pSums = (std::complex<double>*)((((vcl_size_t)sums) & (~15)) + 16);
    pSums[0] = std::complex<double>(0);
    _mm_store_pd((double*)pSums, sumReg);

    return sum + pSums[0];
  }
}
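/* Reminder on the two dot-product flavours (illustration only): _dot is the
   unconjugated product, sum of x[i]*y[i] (BLAS cdotu/zdotu), while _dotc
   below conjugates the first argument, sum of conj(x[i])*y[i] (BLAS
   cdotc/zdotc). For example, with x[0] = y[0] = i:

     _dot (1, x, y)   // i*i       = -1
     _dotc(1, x, y)   // conj(i)*i =  1
*/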
//cdotc
template <>
inline std::complex<float> _dotc<std::complex<float> >(vcl_size_t n, const std::complex<float>* x, const std::complex<float>* y)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 != ((vcl_size_t)y) % 16 || ((vcl_size_t)x) % sizeof(std::complex<float>) != 0)
  {
    std::complex<float> sum(0);
    for (vcl_size_t i = 0; i < n; i++)
      sum += conj(x[i]) * y[i];
    return sum;
  }
  else
  {
    //process unaligned section of arrays
    std::complex<float> sum(0);
    while (((vcl_size_t)x) % 16)
    {
      if (n <= 0)
        return sum;
      sum += conj(x[0]) * y[0];
      y++;
      x++;
      n--;
    }

    __m128 sumReg = _mm_setzero_ps();
    __m128 reg0, reg1, reg2, reg3, reg4;
#ifndef VIENNACL_WITH_SSE3
    __m128 nreg = _mm_set_ps(1.0f, -1.0f, 1.0f, -1.0f);
#endif

    //add complex floats 4 at a time
    while (n >= 4)
    {
      //read floats into XMM registers (8 from each array)
      reg0 = _mm_load_ps((float*)(x + 0));
      reg1 = _mm_load_ps((float*)(x + 2));
      reg2 = _mm_load_ps((float*)(y + 0));
      reg3 = _mm_load_ps((float*)(y + 2));

      //multiply complex floats together (conjugating x)
#ifndef VIENNACL_WITH_SSE3
      reg4 = _mm_shuffle_ps(reg2, reg2, 0xA0);
      reg2 = _mm_shuffle_ps(reg2, reg2, 0xF5);
      reg4 = _mm_mul_ps(reg4, reg0);
      reg2 = _mm_mul_ps(reg2, reg0);
      reg4 = _mm_shuffle_ps(reg4, reg4, 0xB1);
      reg4 = _mm_mul_ps(reg4, nreg);
      reg0 = _mm_add_ps(reg4, reg2);
      reg4 = _mm_shuffle_ps(reg3, reg3, 0xA0);
      reg3 = _mm_shuffle_ps(reg3, reg3, 0xF5);
      reg4 = _mm_mul_ps(reg4, reg1);
      reg3 = _mm_mul_ps(reg3, reg1);
      reg4 = _mm_shuffle_ps(reg4, reg4, 0xB1);
      reg4 = _mm_mul_ps(reg4, nreg);
      reg1 = _mm_add_ps(reg4, reg3);
#else
      reg4 = _mm_moveldup_ps(reg2);
      reg2 = _mm_movehdup_ps(reg2);
      reg4 = _mm_mul_ps(reg4, reg0);
      reg2 = _mm_mul_ps(reg2, reg0);
      reg4 = _mm_shuffle_ps(reg4, reg4, 0xB1);
      reg0 = _mm_addsub_ps(reg2, reg4);
      reg4 = _mm_moveldup_ps(reg3);
      reg3 = _mm_movehdup_ps(reg3);
      reg4 = _mm_mul_ps(reg4, reg1);
      reg3 = _mm_mul_ps(reg3, reg1);
      reg4 = _mm_shuffle_ps(reg4, reg4, 0xB1);
      reg1 = _mm_addsub_ps(reg3, reg4);
#endif

      //add to sum
      sumReg = _mm_add_ps(sumReg, reg0);
      sumReg = _mm_add_ps(sumReg, reg1);

      x += 4;
      y += 4;
      n -= 4;
    }

    //add beyond where the inner loop stopped
    for (vcl_size_t i = 0; i < n; i++)
      sum += conj(x[i]) * y[i];

    //move the sums from the xmm registers to aligned memory on the stack
    std::complex<float> sums[4];
    std::complex<float>* pSums = (std::complex<float>*)((((vcl_size_t)sums) & (~15)) + 16);
    sumReg = _mm_shuffle_ps(sumReg, sumReg, 0xB1); //swap real and imag
    _mm_store_ps((float*)pSums, sumReg);

    return sum + pSums[0] + pSums[1];
  }
}

//zdotc
template <>
inline std::complex<double> _dotc<std::complex<double> >(vcl_size_t n, const std::complex<double>* x, const std::complex<double>* y)
{
  //if the array is short or if either array is unaligned, perform the non-SSE code
  if (n < 16 || ((vcl_size_t)x) % 16 || ((vcl_size_t)y) % 16)
  {
    std::complex<double> sum(0);
    for (vcl_size_t i = 0; i < n; i++)
      sum += conj(x[i]) * y[i];
    return sum;
  }
  else
  {
    __m128d sumReg = _mm_setzero_pd();
    __m128d reg0, reg1, reg2, reg3, reg4;
#ifndef VIENNACL_WITH_SSE3
    __m128d nreg = _mm_set_pd(1.0, -1.0);
#endif

    //add complex doubles 2 at a time
    while (n >= 2)
    {
      //read doubles into XMM registers (4 from each array)
      reg0 = _mm_load_pd((double*)(x + 0));
      reg1 = _mm_load_pd((double*)(x + 1));
      reg2 = _mm_load_pd((double*)(y + 0));
      reg3 = _mm_load_pd((double*)(y + 1));

      //multiply complex doubles together (conjugating x)
#ifndef VIENNACL_WITH_SSE3
      reg4 = _mm_shuffle_pd(reg2, reg2, 0x0);
      reg2 = _mm_shuffle_pd(reg2, reg2, 0x3);
      reg4 = _mm_mul_pd(reg4, reg0);
      reg2 = _mm_mul_pd(reg2, reg0);
      reg4 = _mm_shuffle_pd(reg4, reg4, 0x1);
      reg4 = _mm_mul_pd(reg4, nreg);
      reg0 = _mm_add_pd(reg4, reg2);
      reg4 = _mm_shuffle_pd(reg3, reg3, 0x0);
      reg3 = _mm_shuffle_pd(reg3, reg3, 0x3);
      reg4 = _mm_mul_pd(reg4, reg1);
      reg3 = _mm_mul_pd(reg3, reg1);
      reg4 = _mm_shuffle_pd(reg4, reg4, 0x1);
      reg4 = _mm_mul_pd(reg4, nreg);
      reg1 = _mm_add_pd(reg4, reg3);
#else
      reg4 = _mm_shuffle_pd(reg2, reg2, 0x0);
      reg2 = _mm_shuffle_pd(reg2, reg2, 0x3);
      reg4 = _mm_mul_pd(reg4, reg0);
      reg2 = _mm_mul_pd(reg2, reg0);
      reg4 = _mm_shuffle_pd(reg4, reg4, 0x1);
      reg0 = _mm_addsub_pd(reg2, reg4);
      reg4 = _mm_shuffle_pd(reg3, reg3, 0x0);
      reg3 = _mm_shuffle_pd(reg3, reg3, 0x3);
      reg4 = _mm_mul_pd(reg4, reg1);
      reg3 = _mm_mul_pd(reg3, reg1);
      reg4 = _mm_shuffle_pd(reg4, reg4, 0x1);
      reg1 = _mm_addsub_pd(reg3, reg4);
#endif

      //add to sum
      sumReg = _mm_add_pd(sumReg, reg0);
      sumReg = _mm_add_pd(sumReg, reg1);

      x += 2;
      y += 2;
      n -= 2;
    }

    //add beyond where the inner loop stopped
    std::complex<double> sum(0);
    if (n)
      sum = conj(x[0]) * y[0];

    //move the sum from the xmm register to aligned memory on the stack
    std::complex<double> sums[2];
    std::complex<double>* pSums = (std::complex<double>*)((((vcl_size_t)sums) & (~15)) + 16);
    sumReg = _mm_shuffle_pd(sumReg, sumReg, 0x1); //swap real and imag
    _mm_store_pd((double*)pSums, sumReg);

    return sum + pSums[0];
  }
}
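/* Note on the final shuffle in the conjugated kernels above (illustration
   only): with conj(x)*y the addsub/negate step leaves the real part in the
   odd lane and the imaginary part in the even lane, i.e. the opposite of the
   std::complex memory layout (real component first). The single shuffle of
   sumReg before the store swaps the lanes back, which is cheaper than
   swapping inside the inner loop on every iteration.
*/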
#endif //defined VIENNACL_WITH_COMPLEX

#endif //defined VIENNACL_WITH_SSE2

} //namespace host_based
} //namespace linalg
} //namespace viennacl

#endif