ViennaCL - The Vienna Computing Library  1.5.2
viennacl/linalg/qr-method.hpp
Go to the documentation of this file.
00001 #ifndef VIENNACL_LINALG_QR_METHOD_HPP_
00002 #define VIENNACL_LINALG_QR_METHOD_HPP_
00003 
00004 /* =========================================================================
00005    Copyright (c) 2010-2014, Institute for Microelectronics,
00006                             Institute for Analysis and Scientific Computing,
00007                             TU Wien.
00008    Portions of this software are copyright by UChicago Argonne, LLC.
00009 
00010                             -----------------
00011                   ViennaCL - The Vienna Computing Library
00012                             -----------------
00013 
00014    Project Head:    Karl Rupp                   rupp@iue.tuwien.ac.at
00015 
00016    (A list of authors and contributors can be found in the PDF manual)
00017 
00018    License:         MIT (X11), see file LICENSE in the base directory
00019 ============================================================================= */
00020 
00021 #include "viennacl/vector.hpp"
00022 #include "viennacl/matrix.hpp"
00023 
00024 #include "viennacl/linalg/qr-method-common.hpp"
00025 #include "viennacl/linalg/prod.hpp"
00026 
00027 #include <boost/numeric/ublas/vector.hpp>
00028 #include <boost/numeric/ublas/matrix.hpp>
00029 
00034 namespace viennacl
00035 {
00036   namespace linalg
00037   {
00038     namespace detail
00039     {
00040         template<typename MatrixType, typename VectorType>
00041         void givens_next(MatrixType& matrix,
00042                         VectorType& tmp1,
00043                         VectorType& tmp2,
00044                         int l,
00045                         int m
00046                       )
00047         {
00048           viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(matrix).context());
00049 
00050           typedef typename MatrixType::value_type                                   ScalarType;
00051           typedef typename viennacl::result_of::cpu_value_type<ScalarType>::type    CPU_ScalarType;
00052 
00053           viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<CPU_ScalarType>::program_name(), SVD_GIVENS_NEXT_KERNEL);
00054 
00055           kernel.global_work_size(0, viennacl::tools::align_to_multiple<cl_uint>(cl_uint(viennacl::traits::size1(matrix)), 256));
00056           kernel.local_work_size(0, 256);
00057 
00058           viennacl::ocl::enqueue(kernel(
00059                                         matrix,
00060                                         tmp1,
00061                                         tmp2,
00062                                         static_cast<cl_uint>(matrix.size1()),
00063                                         static_cast<cl_uint>(matrix.internal_size2()),
00064                                         static_cast<cl_uint>(l),
00065                                         static_cast<cl_uint>(m - 1)
00066                                 ));
00067         }
00068 
00069 
00070         // Symmetric tridiagonal QL algorithm.
00071         // This is derived from the Algol procedures tql2, by Bowdler, Martin, Reinsch, and Wilkinson,
00072         // Handbook for Auto. Comp., Vol.ii-Linear Algebra, and the corresponding Fortran subroutine in EISPACK.
00073         template <typename SCALARTYPE, unsigned int ALIGNMENT>
00074         void tql2(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> & Q,
00075                   boost::numeric::ublas::vector<SCALARTYPE> & d,
00076                   boost::numeric::ublas::vector<SCALARTYPE> & e)
00077         {
00078             int n = static_cast<int>(Q.size1());
00079 
00080             boost::numeric::ublas::vector<SCALARTYPE> cs(n), ss(n);
00081             viennacl::vector<SCALARTYPE> tmp1(n), tmp2(n);
00082 
00083             for (int i = 1; i < n; i++)
00084                 e(i - 1) = e(i);
00085 
00086             e(n - 1) = 0;
00087 
00088             SCALARTYPE f = 0;
00089             SCALARTYPE tst1 = 0;
00090             SCALARTYPE eps = 2 * static_cast<SCALARTYPE>(EPS);
00091 
00092             for (int l = 0; l < n; l++)
00093             {
00094                 // Find small subdiagonal element.
00095                 tst1 = std::max<SCALARTYPE>(tst1, std::fabs(d(l)) + std::fabs(e(l)));
00096                 int m = l;
00097                 while (m < n)
00098                 {
00099                     if (std::fabs(e(m)) <= eps * tst1)
00100                         break;
00101                     m++;
00102                 }
00103 
00104                 // If m == l, d(l) is an eigenvalue, otherwise, iterate.
00105                 if (m > l)
00106                 {
00107                     int iter = 0;
00108                     do
00109                     {
00110                         iter = iter + 1;  // (Could check iteration count here.)
00111 
00112                         // Compute implicit shift
00113                         SCALARTYPE g = d(l);
00114                         SCALARTYPE p = (d(l + 1) - g) / (2 * e(l));
00115                         SCALARTYPE r = pythag<SCALARTYPE>(p, 1);
00116                         if (p < 0)
00117                         {
00118                             r = -r;
00119                         }
00120 
00121                         d(l) = e(l) / (p + r);
00122                         d(l + 1) = e(l) * (p + r);
00123                         SCALARTYPE dl1 = d(l + 1);
00124                         SCALARTYPE h = g - d(l);
00125                         for (int i = l + 2; i < n; i++)
00126                         {
00127                             d(i) -= h;
00128                         }
00129 
00130                         f = f + h;
00131 
00132                         // Implicit QL transformation.
00133                         p = d(m);
00134                         SCALARTYPE c = 1;
00135                         SCALARTYPE c2 = c;
00136                         SCALARTYPE c3 = c;
00137                         SCALARTYPE el1 = e(l + 1);
00138                         SCALARTYPE s = 0;
00139                         SCALARTYPE s2 = 0;
00140                         for (int i = m - 1; i >= l; i--)
00141                         {
00142                             c3 = c2;
00143                             c2 = c;
00144                             s2 = s;
00145                             g = c * e(i);
00146                             h = c * p;
00147                             r = pythag(p, e(i));
00148                             e(i + 1) = s * r;
00149                             s = e(i) / r;
00150                             c = p / r;
00151                             p = c * d(i) - s * g;
00152                             d(i + 1) = h + s * (c * g + s * d(i));
00153 
00154                             cs[i] = c;
00155                             ss[i] = s;
00156                         }
00157 
00158                         p = -s * s2 * c3 * el1 * e(l) / dl1;
00159                         e(l) = s * p;
00160                         d(l) = c * p;
00161 
00162                         {
00163                             viennacl::copy(cs, tmp1);
00164                             viennacl::copy(ss, tmp2);
00165 
00166                             givens_next(Q, tmp1, tmp2, l, m);
00167                         }
00168 
00169                         // Check for convergence.
00170                     }
00171                     while (std::fabs(e(l)) > eps * tst1);
00172                 }
00173                 d(l) = d(l) + f;
00174                 e(l) = 0;
00175             }
00176         }
00177 
00178         template <typename SCALARTYPE, typename MatrixT>
00179         void final_iter_update_gpu(MatrixT& A,
00180                                 int n,
00181                                 int last_n,
00182                                 SCALARTYPE q,
00183                                 SCALARTYPE p
00184                                 )
00185         {
00186             viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00187 
00188             viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_FINAL_ITER_UPDATE_KERNEL);
00189 
00190             viennacl::ocl::enqueue(kernel(
00191                                           A,
00192                                           static_cast<cl_uint>(A.internal_size1()),
00193                                           static_cast<cl_uint>(n),
00194                                           static_cast<cl_uint>(last_n),
00195                                           q,
00196                                           p
00197                                   ));
00198         }
00199 
00200         template <typename SCALARTYPE, typename MatrixT>
00201         void update_float_QR_column_gpu(MatrixT& A,
00202                                 const std::vector<SCALARTYPE>& buf,
00203                                 viennacl::vector<SCALARTYPE>& buf_vcl,
00204                                 int m,
00205                                 int n,
00206                                 int last_n,
00207                                 bool //is_triangular
00208                                 )
00209         {
00210             viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00211 
00212             viennacl::fast_copy(buf, buf_vcl);
00213 
00214             viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_UPDATE_QR_COLUMN_KERNEL);
00215 
00216             viennacl::ocl::enqueue(kernel(
00217                                           A,
00218                                           static_cast<cl_uint>(A.internal_size1()),
00219                                           buf_vcl,
00220                                           static_cast<cl_uint>(m),
00221                                           static_cast<cl_uint>(n),
00222                                           static_cast<cl_uint>(last_n)
00223                                   ));
00224         }
00225 
/**
 * @brief Combines columns n-1 and n of A for the rows 0 .. last_n-1, in place.
 *
 * For every affected row, with z = A(i, n-1) and v = A(i, n) read before any write:
 *   A(i, n-1) = q * z + p * v
 *   A(i, n)   = q * v - p * z
 *
 * @param A       Matrix-like object providing A(i, j) element access
 * @param n       Index of the right one of the two columns (the other one is n - 1)
 * @param last_n  Number of leading rows to update
 * @param q       Coefficient applied to the column's own value
 * @param p       Coefficient applied to the other column's value
 */
template <typename SCALARTYPE, typename MatrixT>
void final_iter_update(MatrixT& A,
                       int n,
                       int last_n,
                       SCALARTYPE q,
                       SCALARTYPE p)
{
    for (int row = 0; row < last_n; ++row)
    {
        // Read both old values first; each new value depends on both of them.
        const SCALARTYPE right_old = A(row, n);
        const SCALARTYPE left_old  = A(row, n - 1);

        A(row, n - 1) = q * left_old + p * right_old;
        A(row, n)     = q * right_old - p * left_old;
    }
}
00242 
/**
 * @brief Applies the column update encoded in `buf` to the first `last_i` rows of A, in place.
 *
 * For every row the columns start_k .. n are swept from left to right, where
 * start_k = m, or max(i + 1, m) if `is_triangular` is set. Step k uses the five
 * coefficients buf[5k] .. buf[5k+4] and mixes the entries in columns k, k+1 and
 * (unless k == n-1) k+2. The entries are carried in local variables between
 * steps, so each one is read from the row once and written once.
 * Rows with start_k >= n are left unchanged.
 *
 * @param A              Matrix-like object whose row(i) returns a pointer to the i-th row
 * @param buf            Coefficients, five consecutive values per column index k
 * @param m              First column of the sweep
 * @param n              Last column touched by the sweep
 * @param last_i         Number of leading rows to process
 * @param is_triangular  If true, row i starts no earlier than column i + 1
 */
template <typename SCALARTYPE, typename MatrixT>
void update_float_QR_column(MatrixT& A,
                            const std::vector<SCALARTYPE>& buf,
                            int m,
                            int n,
                            int last_i,
                            bool is_triangular)
{
    for (int row = 0; row < last_i; ++row)
    {
        int first_col = m;
        if (is_triangular)
            first_col = std::max(row + 1, m);

        SCALARTYPE* const entries = A.row(row);
        const bool has_work = (first_col < n);

        // Sliding window over the (partially updated) entries in columns k, k+1, k+2.
        SCALARTYPE cur   = entries[first_col];
        SCALARTYPE next  = 0;
        SCALARTYPE after = 0;

        if (has_work)
            next = entries[first_col + 1];

        for (int col = first_col; col < n; ++col)
        {
            const int base = 5 * col;

            SCALARTYPE proj = buf[base] * cur + buf[base + 1] * next;

            // Every step except the last one also involves column col + 2.
            if (col != n - 1)
            {
                after = entries[col + 2];
                proj = proj + buf[base + 2] * after;
                after = after - proj * buf[base + 4];
            }

            entries[col] = cur - proj;

            // Advance the window by one column.
            cur  = next - proj * buf[base + 3];
            next = after;
        }

        // The value carried out of the last step belongs to column n.
        if (has_work)
            entries[n] = cur;
    }
}
00289 
00291         template <typename SCALARTYPE>
00292         class FastMatrix
00293         {
00294         public:
00295             FastMatrix()
00296             {
00297                 size_ = 0;
00298             }
00299 
00300             FastMatrix(vcl_size_t sz, vcl_size_t internal_size) : size_(sz), internal_size_(internal_size)
00301             {
00302                 data.resize(internal_size * internal_size);
00303             }
00304 
00305             SCALARTYPE& operator()(int i, int j)
00306             {
00307                 return data[i * internal_size_ + j];
00308             }
00309 
00310             SCALARTYPE* row(int i)
00311             {
00312                 return &data[i * internal_size_];
00313             }
00314 
00315             SCALARTYPE* begin()
00316             {
00317                 return &data[0];
00318             }
00319 
00320             SCALARTYPE* end()
00321             {
00322                 return &data[0] + data.size();
00323             }
00324 
00325             std::vector<SCALARTYPE> data;
00326         private:
00327             vcl_size_t size_;
00328             vcl_size_t internal_size_;
00329         };
00330 
00331         // Nonsymmetric reduction from Hessenberg to real Schur form.
00332         // This is derived from the Algol procedure hqr2, by Martin and Wilkinson, Handbook for Auto. Comp.,
00333         // Vol.ii-Linear Algebra, and the corresponding  Fortran subroutine in EISPACK.
00334         template <typename SCALARTYPE, unsigned int ALIGNMENT>
00335         void hqr2(viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& vcl_H,
00336                     viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& V,
00337                     boost::numeric::ublas::vector<SCALARTYPE>& d,
00338                     boost::numeric::ublas::vector<SCALARTYPE>& e)
00339         {
00340             transpose(V);
00341 
00342             int nn = static_cast<int>(vcl_H.size1());
00343 
00344             FastMatrix<SCALARTYPE> H(nn, vcl_H.internal_size2());//, V(nn);
00345 
00346             std::vector<SCALARTYPE> buf(5 * nn);
00347             viennacl::vector<SCALARTYPE> buf_vcl(5 * nn);
00348 
00349             viennacl::fast_copy(vcl_H, H.begin());
00350 
00351 
00352             int n = nn - 1;
00353 
00354             SCALARTYPE eps = 2 * static_cast<SCALARTYPE>(EPS);
00355             SCALARTYPE exshift = 0;
00356             SCALARTYPE p = 0;
00357             SCALARTYPE q = 0;
00358             SCALARTYPE r = 0;
00359             SCALARTYPE s = 0;
00360             SCALARTYPE z = 0;
00361             SCALARTYPE t;
00362             SCALARTYPE w;
00363             SCALARTYPE x;
00364             SCALARTYPE y;
00365 
00366             SCALARTYPE out1, out2;
00367 
00368             // compute matrix norm
00369             SCALARTYPE norm = 0;
00370             for (int i = 0; i < nn; i++)
00371             {
00372                 for (int j = std::max(i - 1, 0); j < nn; j++)
00373                     norm = norm + std::fabs(H(i, j));
00374             }
00375 
00376             // Outer loop over eigenvalue index
00377             int iter = 0;
00378             while (n >= 0)
00379             {
00380                 // Look for single small sub-diagonal element
00381                 int l = n;
00382                 while (l > 0)
00383                 {
00384                     s = std::fabs(H(l - 1, l - 1)) + std::fabs(H(l, l));
00385                     if (s == 0) s = norm;
00386                     if (std::fabs(H(l, l - 1)) < eps * s)
00387                         break;
00388 
00389                     l--;
00390                 }
00391 
00392                 // Check for convergence
00393                 if (l == n)
00394                 {
00395                     // One root found
00396                     H(n, n) = H(n, n) + exshift;
00397                     d(n) = H(n, n);
00398                     e(n) = 0;
00399                     n--;
00400                     iter = 0;
00401                 }
00402                 else if (l == n - 1)
00403                 {
00404                     // Two roots found
00405                     w = H(n, n - 1) * H(n - 1, n);
00406                     p = (H(n - 1, n - 1) - H(n, n)) / 2;
00407                     q = p * p + w;
00408                     z = static_cast<SCALARTYPE>(std::sqrt(std::fabs(q)));
00409                     H(n, n) = H(n, n) + exshift;
00410                     H(n - 1, n - 1) = H(n - 1, n - 1) + exshift;
00411                     x = H(n, n);
00412 
00413                     if (q >= 0)
00414                     {
00415                         // Real pair
00416                         z = (p >= 0) ? (p + z) : (p - z);
00417                         d(n - 1) = x + z;
00418                         d(n) = d(n - 1);
00419                         if (z != 0)
00420                             d(n) = x - w / z;
00421                         e(n - 1) = 0;
00422                         e(n) = 0;
00423                         x = H(n, n - 1);
00424                         s = std::fabs(x) + std::fabs(z);
00425                         p = x / s;
00426                         q = z / s;
00427                         r = static_cast<SCALARTYPE>(std::sqrt(p * p + q * q));
00428                         p = p / r;
00429                         q = q / r;
00430 
00431                         // Row modification
00432                         for (int j = n - 1; j < nn; j++)
00433                         {
00434                             SCALARTYPE h_nj = H(n, j);
00435                             z = H(n - 1, j);
00436                             H(n - 1, j) = q * z + p * h_nj;
00437                             H(n, j) = q * h_nj - p * z;
00438                         }
00439 
00440                         final_iter_update(H, n, n + 1, q, p);
00441                         final_iter_update_gpu(V, n, nn, q, p);
00442                     }
00443                     else
00444                     {
00445                         // Complex pair
00446                         d(n - 1) = x + p;
00447                         d(n) = x + p;
00448                         e(n - 1) = z;
00449                         e(n) = -z;
00450                     }
00451 
00452                     n = n - 2;
00453                     iter = 0;
00454                 }
00455                 else
00456                 {
00457                     // No convergence yet
00458 
00459                     // Form shift
00460                     x = H(n, n);
00461                     y = 0;
00462                     w = 0;
00463                     if (l < n)
00464                     {
00465                         y = H(n - 1, n - 1);
00466                         w = H(n, n - 1) * H(n - 1, n);
00467                     }
00468 
00469                     // Wilkinson's original ad hoc shift
00470                     if (iter == 10)
00471                     {
00472                         exshift += x;
00473                         for (int i = 0; i <= n; i++)
00474                             H(i, i) -= x;
00475 
00476                         s = std::fabs(H(n, n - 1)) + std::fabs(H(n - 1, n - 2));
00477                         x = y = SCALARTYPE(0.75) * s;
00478                         w = SCALARTYPE(-0.4375) * s * s;
00479                     }
00480 
00481                     // MATLAB's new ad hoc shift
00482                     if (iter == 30)
00483                     {
00484                         s = (y - x) / 2;
00485                         s = s * s + w;
00486                         if (s > 0)
00487                         {
00488                             s = static_cast<SCALARTYPE>(std::sqrt(s));
00489                             if (y < x) s = -s;
00490                             s = x - w / ((y - x) / 2 + s);
00491                             for (int i = 0; i <= n; i++)
00492                                 H(i, i) -= s;
00493                             exshift += s;
00494                             x = y = w = SCALARTYPE(0.964);
00495                         }
00496                     }
00497 
00498                     iter = iter + 1;
00499 
00500                     // Look for two consecutive small sub-diagonal elements
00501                     int m = n - 2;
00502                     while (m >= l)
00503                     {
00504                         SCALARTYPE h_m1_m1 = H(m + 1, m + 1);
00505                         z = H(m, m);
00506                         r = x - z;
00507                         s = y - z;
00508                         p = (r * s - w) / H(m + 1, m) + H(m, m + 1);
00509                         q = h_m1_m1 - z - r - s;
00510                         r = H(m + 2, m + 1);
00511                         s = std::fabs(p) + std::fabs(q) + std::fabs(r);
00512                         p = p / s;
00513                         q = q / s;
00514                         r = r / s;
00515                         if (m == l)
00516                             break;
00517                         if (std::fabs(H(m, m - 1)) * (std::fabs(q) + std::fabs(r)) < eps * (std::fabs(p) * (std::fabs(H(m - 1, m - 1)) + std::fabs(z) + std::fabs(h_m1_m1))))
00518                             break;
00519                         m--;
00520                     }
00521 
00522                     for (int i = m + 2; i <= n; i++)
00523                     {
00524                         H(i, i - 2) = 0;
00525                         if (i > m + 2)
00526                             H(i, i - 3) = 0;
00527                     }
00528 
00529                     // float QR step involving rows l:n and columns m:n
00530                     for (int k = m; k < n; k++)
00531                     {
00532                         bool notlast = (k != n - 1);
00533                         if (k != m)
00534                         {
00535                             p = H(k, k - 1);
00536                             q = H(k + 1, k - 1);
00537                             r = (notlast ? H(k + 2, k - 1) : 0);
00538                             x = std::fabs(p) + std::fabs(q) + std::fabs(r);
00539                             if (x != 0)
00540                             {
00541                                 p = p / x;
00542                                 q = q / x;
00543                                 r = r / x;
00544                             }
00545                         }
00546 
00547                         if (x == 0) break;
00548 
00549                         s = static_cast<SCALARTYPE>(std::sqrt(p * p + q * q + r * r));
00550                         if (p < 0) s = -s;
00551 
00552                         if (s != 0)
00553                         {
00554                             if (k != m)
00555                                 H(k, k - 1) = -s * x;
00556                             else
00557                                 if (l != m)
00558                                     H(k, k - 1) = -H(k, k - 1);
00559 
00560                             p = p + s;
00561                             y = q / s;
00562                             z = r / s;
00563                             x = p / s;
00564                             q = q / p;
00565                             r = r / p;
00566 
00567                             buf[5 * k] = x;
00568                             buf[5 * k + 1] = y;
00569                             buf[5 * k + 2] = z;
00570                             buf[5 * k + 3] = q;
00571                             buf[5 * k + 4] = r;
00572 
00573 
00574                             SCALARTYPE* a_row_k = H.row(k);
00575                             SCALARTYPE* a_row_k_1 = H.row(k + 1);
00576                             SCALARTYPE* a_row_k_2 = H.row(k + 2);
00577                             // Row modification
00578                             for (int j = k; j < nn; j++)
00579                             {
00580                                 SCALARTYPE h_kj = a_row_k[j];
00581                                 SCALARTYPE h_k1_j = a_row_k_1[j];
00582 
00583                                 p = h_kj + q * h_k1_j;
00584                                 if (notlast)
00585                                 {
00586                                     SCALARTYPE h_k2_j = a_row_k_2[j];
00587                                     p = p + r * h_k2_j;
00588                                     a_row_k_2[j] = h_k2_j - p * z;
00589                                 }
00590 
00591                                 a_row_k[j] = h_kj - p * x;
00592                                 a_row_k_1[j] = h_k1_j - p * y;
00593                             }
00594 
00595                             //H(k + 1, nn - 1) = h_kj;
00596 
00597 
00598                             // Column modification
00599                             for (int i = k; i < std::min(nn, k + 4); i++)
00600                             {
00601                                 p = x * H(i, k) + y * H(i, k + 1);
00602                                 if (notlast)
00603                                 {
00604                                     p = p + z * H(i, k + 2);
00605                                     H(i, k + 2) = H(i, k + 2) - p * r;
00606                                 }
00607 
00608                                 H(i, k) = H(i, k) - p;
00609                                 H(i, k + 1) = H(i, k + 1) - p * q;
00610                             }
00611                         }
00612                         else
00613                         {
00614                             buf[5 * k] = 0;
00615                             buf[5 * k + 1] = 0;
00616                             buf[5 * k + 2] = 0;
00617                             buf[5 * k + 3] = 0;
00618                             buf[5 * k + 4] = 0;
00619                         }
00620                     }
00621 
00622                     // Timer timer;
00623                     // timer.start();
00624 
00625                     update_float_QR_column(H, buf, m, n, n, true);
00626                     update_float_QR_column_gpu(V, buf, buf_vcl, m, n, nn, false);
00627 
00628                     // std::cout << timer.get() << "\n";
00629                 }
00630             }
00631 
00632             // Backsubstitute to find vectors of upper triangular form
00633             if (norm == 0)
00634             {
00635                 return;
00636             }
00637 
00638             for (n = nn - 1; n >= 0; n--)
00639             {
00640                 p = d(n);
00641                 q = e(n);
00642 
00643                 // Real vector
00644                 if (q == 0)
00645                 {
00646                     int l = n;
00647                     H(n, n) = 1;
00648                     for (int i = n - 1; i >= 0; i--)
00649                     {
00650                         w = H(i, i) - p;
00651                         r = 0;
00652                         for (int j = l; j <= n; j++)
00653                             r = r + H(i, j) * H(j, n);
00654 
00655                         if (e(i) < 0)
00656                         {
00657                             z = w;
00658                             s = r;
00659                         }
00660                         else
00661                         {
00662                             l = i;
00663                             if (e(i) == 0)
00664                             {
00665                                 H(i, n) = (w != 0) ? (-r / w) : (-r / (eps * norm));
00666                             }
00667                             else
00668                             {
00669                                 // Solve real equations
00670                                 x = H(i, i + 1);
00671                                 y = H(i + 1, i);
00672                                 q = (d(i) - p) * (d(i) - p) + e(i) * e(i);
00673                                 t = (x * s - z * r) / q;
00674                                 H(i, n) = t;
00675                                 H(i + 1, n) = (std::fabs(x) > std::fabs(z)) ? ((-r - w * t) / x) : ((-s - y * t) / z);
00676                             }
00677 
00678                             // Overflow control
00679                             t = std::fabs(H(i, n));
00680                             if ((eps * t) * t > 1)
00681                                 for (int j = i; j <= n; j++)
00682                                     H(j, n) /= t;
00683                         }
00684                     }
00685                 }
00686                 else if (q < 0)
00687                 {
00688                     // Complex vector
00689                     int l = n - 1;
00690 
00691                     // Last vector component imaginary so matrix is triangular
00692                     if (std::fabs(H(n, n - 1)) > std::fabs(H(n - 1, n)))
00693                     {
00694                         H(n - 1, n - 1) = q / H(n, n - 1);
00695                         H(n - 1, n) = -(H(n, n) - p) / H(n, n - 1);
00696                     }
00697                     else
00698                     {
00699                         cdiv<SCALARTYPE>(0, -H(n - 1, n), H(n - 1, n - 1) - p, q, out1, out2);
00700 
00701                         H(n - 1, n - 1) = out1;
00702                         H(n - 1, n) = out2;
00703                     }
00704 
00705                     H(n, n - 1) = 0;
00706                     H(n, n) = 1;
00707                     for (int i = n - 2; i >= 0; i--)
00708                     {
00709                         SCALARTYPE ra, sa, vr, vi;
00710                         ra = 0;
00711                         sa = 0;
00712                         for (int j = l; j <= n; j++)
00713                         {
00714                             SCALARTYPE h_ij = H(i, j);
00715                             ra = ra + h_ij * H(j, n - 1);
00716                             sa = sa + h_ij * H(j, n);
00717                         }
00718 
00719                         w = H(i, i) - p;
00720 
00721                         if (e(i) < 0)
00722                         {
00723                             z = w;
00724                             r = ra;
00725                             s = sa;
00726                         }
00727                         else
00728                         {
00729                             l = i;
00730                             if (e(i) == 0)
00731                             {
00732                                 cdiv<SCALARTYPE>(-ra, -sa, w, q, out1, out2);
00733                                 H(i, n - 1) = out1;
00734                                 H(i, n) = out2;
00735                             }
00736                             else
00737                             {
00738                                 // Solve complex equations
00739                                 x = H(i, i + 1);
00740                                 y = H(i + 1, i);
00741                                 vr = (d(i) - p) * (d(i) - p) + e(i) * e(i) - q * q;
00742                                 vi = (d(i) - p) * 2 * q;
00743                                 if ( (vr == 0) && (vi == 0) )
00744                                     vr = eps * norm * (std::fabs(w) + std::fabs(q) + std::fabs(x) + std::fabs(y) + std::fabs(z));
00745 
00746                                 cdiv<SCALARTYPE>(x * r - z * ra + q * sa, x * s - z * sa - q * ra, vr, vi, out1, out2);
00747 
00748                                 H(i, n - 1) = out1;
00749                                 H(i, n) = out2;
00750 
00751 
00752                                 if (std::fabs(x) > (std::fabs(z) + std::fabs(q)))
00753                                 {
00754                                     H(i + 1, n - 1) = (-ra - w * H(i, n - 1) + q * H(i, n)) / x;
00755                                     H(i + 1, n) = (-sa - w * H(i, n) - q * H(i, n - 1)) / x;
00756                                 }
00757                                 else
00758                                 {
00759                                     cdiv<SCALARTYPE>(-r - y * H(i, n - 1), -s - y * H(i, n), z, q, out1, out2);
00760 
00761                                     H(i + 1, n - 1) = out1;
00762                                     H(i + 1, n) = out2;
00763                                 }
00764                             }
00765 
00766                             // Overflow control
00767                             t = std::max(std::fabs(H(i, n - 1)), std::fabs(H(i, n)));
00768                             if ((eps * t) * t > 1)
00769                             {
00770                                 for (int j = i; j <= n; j++)
00771                                 {
00772                                     H(j, n - 1) /= t;
00773                                     H(j, n) /= t;
00774                                 }
00775                             }
00776                         }
00777                     }
00778                 }
00779             }
00780 
00781             viennacl::fast_copy(H.begin(), H.end(),  vcl_H);
00782             // viennacl::fast_copy(V.begin(), V.end(),  vcl_V);
00783 
00784             viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT> tmp = V;
00785 
00786             V = viennacl::linalg::prod(trans(tmp), vcl_H);
00787         }
00788 
00789         template <typename SCALARTYPE, unsigned int ALIGNMENT>
00790         bool householder_twoside(
00791                             viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& A,
00792                             viennacl::matrix<SCALARTYPE, row_major, ALIGNMENT>& Q,
00793                             viennacl::vector<SCALARTYPE, ALIGNMENT>& D,
00794                             vcl_size_t start)
00795         {
00796             viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00797 
00798             if(start + 2 >= A.size1())
00799                 return false;
00800 
00801             prepare_householder_vector(A, D, A.size1(), start + 1, start, start + 1, true);
00802 
00803             {
00804                 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_LEFT_KERNEL);
00805 
00806                 viennacl::ocl::enqueue(kernel(
00807                                               A,
00808                                               D,
00809                                               static_cast<cl_uint>(start + 1),
00810                                               static_cast<cl_uint>(start),
00811                                               static_cast<cl_uint>(A.size1()),
00812                                               static_cast<cl_uint>(A.size2()),
00813                                               static_cast<cl_uint>(A.internal_size2()),
00814                                               viennacl::ocl::local_mem(static_cast<cl_uint>(128 * 4))
00815                                       ));
00816             }
00817 
00818             {
00819                 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_A_RIGHT_KERNEL);
00820 
00821                 viennacl::ocl::enqueue(kernel(
00822                                               A,
00823                                               D,
00824                                               static_cast<cl_uint>(0),
00825                                               static_cast<cl_uint>(0),
00826                                               static_cast<cl_uint>(A.size1()),
00827                                               static_cast<cl_uint>(A.size2()),
00828                                               static_cast<cl_uint>(A.internal_size2()),
00829                                               viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
00830                                       ));
00831             }
00832 
00833             {
00834                 viennacl::ocl::kernel& kernel = ctx.get_kernel(viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::program_name(), SVD_HOUSEHOLDER_UPDATE_QL_KERNEL);
00835 
00836                 viennacl::ocl::enqueue(kernel(
00837                                                 Q,
00838                                                 D,
00839                                                 static_cast<cl_uint>(A.size1()),
00840                                                 static_cast<cl_uint>(A.size2()),
00841                                                 static_cast<cl_uint>(Q.internal_size2()),
00842                                                 viennacl::ocl::local_mem(static_cast<cl_uint>(128 * sizeof(SCALARTYPE)))
00843                                             ));
00844             }
00845 
00846             return true;
00847         }
00848 
00849         template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
00850         void tridiagonal_reduction(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A,
00851                                     viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q)
00852         {
00853             vcl_size_t sz = A.size1();
00854 
00855             viennacl::vector<SCALARTYPE> hh_vector(sz);
00856 
00857             for(vcl_size_t i = 0; i < sz; i++)
00858             {
00859                 householder_twoside(A, Q, hh_vector, i);
00860             }
00861 
00862         }
00863 
00864         template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
00865         void qr_method(viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & A,
00866                        viennacl::matrix<SCALARTYPE, F, ALIGNMENT> & Q,
00867                        boost::numeric::ublas::vector<SCALARTYPE> & D,
00868                        boost::numeric::ublas::vector<SCALARTYPE> & E,
00869                        bool is_symmetric = true)
00870         {
00871             viennacl::ocl::context & ctx = const_cast<viennacl::ocl::context &>(viennacl::traits::opencl_handle(A).context());
00872 
00873             assert(A.size1() == A.size2() && bool("Input matrix must be square for QR method!"));
00874 
00875             D.resize(A.size1());
00876             E.resize(A.size1());
00877 
00878             viennacl::linalg::opencl::kernels::svd<SCALARTYPE>::init(ctx);
00879 
00880             Q = viennacl::identity_matrix<SCALARTYPE>(Q.size1(), ctx);
00881 
00882             // reduce to tridiagonal form
00883             detail::tridiagonal_reduction(A, Q);
00884 
00885             // pack diagonal and super-diagonal
00886             // ublas::vector<SCALARTYPE> D(A.size1()), E(A.size1());
00887 
00888             bidiag_pack(A, D, E);
00889 
00890             // find eigenvalues
00891             if(is_symmetric)
00892             {
00893 
00894                 detail::tql2(Q, D, E);
00895                 transpose(Q);
00896             }
00897             else
00898             {
00899                 detail::hqr2(A, Q, D, E);
00900             }
00901 
00902             // std::cout << A << "\n";
00903 
00904             boost::numeric::ublas::matrix<float> eigen_values(A.size1(), A.size1());
00905             eigen_values.clear();
00906 
00907             for (vcl_size_t i = 0; i < A.size1(); i++)
00908             {
00909                 if(std::fabs(E(i)) < EPS)
00910                 {
00911                     eigen_values(i, i) = D(i);
00912                 }
00913                 else
00914                 {
00915                     eigen_values(i, i) = D(i);
00916                     eigen_values(i, i + 1) = E(i);
00917                     eigen_values(i + 1, i) = -E(i);
00918                     eigen_values(i + 1, i + 1) = D(i);
00919                     i++;
00920                 }
00921             }
00922 
00923             copy(eigen_values, A);
00924         }
00925     }
00926 
00927 
00928     template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
00929     void qr_method_nsm(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A,
00930                        viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q,
00931                        boost::numeric::ublas::vector<SCALARTYPE>& D,
00932                        boost::numeric::ublas::vector<SCALARTYPE>& E
00933                       )
00934     {
00935         detail::qr_method(A, Q, D, E, false);
00936     }
00937 
00938     template <typename SCALARTYPE, typename F, unsigned int ALIGNMENT>
00939     void qr_method_sym(viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& A,
00940                        viennacl::matrix<SCALARTYPE, F, ALIGNMENT>& Q,
00941                        boost::numeric::ublas::vector<SCALARTYPE>& D
00942                       )
00943     {
00944         boost::numeric::ublas::vector<SCALARTYPE> E(A.size1());
00945 
00946         detail::qr_method(A, Q, D, E, true);
00947     }
00948 
00949   }
00950 }
00951 
00952 #endif