Actual source code: fnexp.c
slepc-3.13.1 2020-04-12
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-2020, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: Exponential function exp(x)
12: */
14: #include <slepc/private/fnimpl.h> /*I "slepcfn.h" I*/
15: #include <slepcblaslapack.h>
17: PetscErrorCode FNEvaluateFunction_Exp(FN fn,PetscScalar x,PetscScalar *y)
18: {
20: *y = PetscExpScalar(x);
21: return(0);
22: }
24: PetscErrorCode FNEvaluateDerivative_Exp(FN fn,PetscScalar x,PetscScalar *y)
25: {
27: *y = PetscExpScalar(x);
28: return(0);
29: }
31: #define MAX_PADE 6
32: #define SWAP(a,b,t) {t=a;a=b;b=t;}
34: PetscErrorCode FNEvaluateFunctionMat_Exp_Pade(FN fn,Mat A,Mat B)
35: {
37: PetscBLASInt n,ld,ld2,*ipiv,info,inc=1;
38: PetscInt m,j,k,sexp;
39: PetscBool odd;
40: const PetscInt p=MAX_PADE;
41: PetscReal c[MAX_PADE+1],s,*rwork;
42: PetscScalar scale,mone=-1.0,one=1.0,two=2.0,zero=0.0;
43: PetscScalar *Aa,*Ba,*As,*A2,*Q,*P,*W,*aux;
46: MatDenseGetArray(A,&Aa);
47: MatDenseGetArray(B,&Ba);
48: MatGetSize(A,&m,NULL);
49: PetscBLASIntCast(m,&n);
50: ld = n;
51: ld2 = ld*ld;
52: P = Ba;
53: PetscMalloc6(m*m,&Q,m*m,&W,m*m,&As,m*m,&A2,ld,&rwork,ld,&ipiv);
54: PetscArraycpy(As,Aa,ld2);
56: /* Pade' coefficients */
57: c[0] = 1.0;
58: for (k=1;k<=p;k++) c[k] = c[k-1]*(p+1-k)/(k*(2*p+1-k));
60: /* Scaling */
61: s = LAPACKlange_("I",&n,&n,As,&ld,rwork);
62: PetscLogFlops(1.0*n*n);
63: if (s>0.5) {
64: sexp = PetscMax(0,(int)(PetscLogReal(s)/PetscLogReal(2.0))+2);
65: scale = PetscPowRealInt(2.0,-sexp);
66: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&scale,As,&inc));
67: PetscLogFlops(1.0*n*n);
68: } else sexp = 0;
70: /* Horner evaluation */
71: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,As,&ld,As,&ld,&zero,A2,&ld));
72: PetscLogFlops(2.0*n*n*n);
73: PetscArrayzero(Q,ld2);
74: PetscArrayzero(P,ld2);
75: for (j=0;j<n;j++) {
76: Q[j+j*ld] = c[p];
77: P[j+j*ld] = c[p-1];
78: }
80: odd = PETSC_TRUE;
81: for (k=p-1;k>0;k--) {
82: if (odd) {
83: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,A2,&ld,&zero,W,&ld));
84: SWAP(Q,W,aux);
85: for (j=0;j<n;j++) Q[j+j*ld] += c[k-1];
86: odd = PETSC_FALSE;
87: } else {
88: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,A2,&ld,&zero,W,&ld));
89: SWAP(P,W,aux);
90: for (j=0;j<n;j++) P[j+j*ld] += c[k-1];
91: odd = PETSC_TRUE;
92: }
93: PetscLogFlops(2.0*n*n*n);
94: }
95: /*if (odd) {
96: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,Q,&ld,As,&ld,&zero,W,&ld));
97: SWAP(Q,W,aux);
98: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
99: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
100: SlepcCheckLapackInfo("gesv",info);
101: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
102: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
103: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&mone,P,&inc));
104: } else {*/
105: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,As,&ld,&zero,W,&ld));
106: SWAP(P,W,aux);
107: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&ld2,&mone,P,&inc,Q,&inc));
108: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n,&n,Q,&ld,ipiv,P,&ld,&info));
109: SlepcCheckLapackInfo("gesv",info);
110: PetscStackCallBLAS("BLASscal",BLASscal_(&ld2,&two,P,&inc));
111: for (j=0;j<n;j++) P[j+j*ld] += 1.0;
112: /*}*/
113: PetscLogFlops(2.0*n*n*n+2.0*n*n*n/3.0+4.0*n*n);
115: for (k=1;k<=sexp;k++) {
116: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&one,P,&ld,P,&ld,&zero,W,&ld));
117: PetscArraycpy(P,W,ld2);
118: }
119: if (P!=Ba) { PetscArraycpy(Ba,P,ld2); }
120: PetscLogFlops(2.0*n*n*n*sexp);
122: PetscFree6(Q,W,As,A2,rwork,ipiv);
123: MatDenseRestoreArray(A,&Aa);
124: MatDenseRestoreArray(B,&Ba);
125: return(0);
126: }
128: /*
129: * Set scaling factor (s) and Pade degree (k,m)
130: */
131: static PetscErrorCode sexpm_params(PetscReal nrm,PetscInt *s,PetscInt *k,PetscInt *m)
132: {
134: if (nrm>1) {
135: if (nrm<200) {*s = 4; *k = 5; *m = *k-1;}
136: else if (nrm<1e4) {*s = 4; *k = 4; *m = *k+1;}
137: else if (nrm<1e6) {*s = 4; *k = 3; *m = *k+1;}
138: else if (nrm<1e9) {*s = 3; *k = 3; *m = *k+1;}
139: else if (nrm<1e11) {*s = 2; *k = 3; *m = *k+1;}
140: else if (nrm<1e12) {*s = 2; *k = 2; *m = *k+1;}
141: else if (nrm<1e14) {*s = 2; *k = 1; *m = *k+1;}
142: else {*s = 1; *k = 1; *m = *k+1;}
143: } else { /* nrm<1 */
144: if (nrm>0.5) {*s = 4; *k = 4; *m = *k-1;}
145: else if (nrm>0.3) {*s = 3; *k = 4; *m = *k-1;}
146: else if (nrm>0.15) {*s = 2; *k = 4; *m = *k-1;}
147: else if (nrm>0.07) {*s = 1; *k = 4; *m = *k-1;}
148: else if (nrm>0.01) {*s = 0; *k = 4; *m = *k-1;}
149: else if (nrm>3e-4) {*s = 0; *k = 3; *m = *k-1;}
150: else if (nrm>1e-5) {*s = 0; *k = 3; *m = 0;}
151: else if (nrm>1e-8) {*s = 0; *k = 2; *m = 0;}
152: else {*s = 0; *k = 1; *m = 0;}
153: }
154: return(0);
155: }
157: #if defined(PETSC_HAVE_COMPLEX)
158: /*
159: * Partial fraction form coefficients.
160: * If query, the function returns the size necessary to store the coefficients.
161: */
162: static PetscErrorCode getcoeffs(PetscInt k,PetscInt m,PetscComplex *r,PetscComplex *q,PetscComplex *remain,PetscBool query)
163: {
164: PetscInt i;
165: const PetscComplex /* m == k+1 */
166: p1r4[5] = {-1.582680186458572e+01 - 2.412564578224361e+01*PETSC_i,
167: -1.582680186458572e+01 + 2.412564578224361e+01*PETSC_i,
168: 1.499984465975511e+02 + 6.804227952202417e+01*PETSC_i,
169: 1.499984465975511e+02 - 6.804227952202417e+01*PETSC_i,
170: -2.733432894659307e+02 },
171: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
172: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
173: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
174: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i,
175: 6.286704751729261e+00 },
176: p1r3[4] = {-1.130153999597152e+01 + 1.247167585025031e+01*PETSC_i,
177: -1.130153999597152e+01 - 1.247167585025031e+01*PETSC_i,
178: 1.330153999597152e+01 - 6.007173273704750e+01*PETSC_i,
179: 1.330153999597152e+01 + 6.007173273704750e+01*PETSC_i},
180: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
181: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
182: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
183: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
184: p1r2[3] = { 7.648749087422928e+00 + 4.171640244747463e+00*PETSC_i,
185: 7.648749087422928e+00 - 4.171640244747463e+00*PETSC_i,
186: -1.829749817484586e+01 },
187: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
188: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
189: 3.637834252744491e+00 },
190: p1r1[2] = { 1.000000000000000e+00 - 3.535533905932738e+00*PETSC_i,
191: 1.000000000000000e+00 + 3.535533905932738e+00*PETSC_i},
192: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
193: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
194: const PetscComplex /* m == k-1 */
195: m1r5[4] = {-1.423367961376821e+02 - 1.385465094833037e+01*PETSC_i,
196: -1.423367961376821e+02 + 1.385465094833037e+01*PETSC_i,
197: 2.647367961376822e+02 - 4.814394493714596e+02*PETSC_i,
198: 2.647367961376822e+02 + 4.814394493714596e+02*PETSC_i},
199: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
200: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
201: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
202: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
203: m1r4[3] = { 2.484269593165883e+01 + 7.460342395992306e+01*PETSC_i,
204: 2.484269593165883e+01 - 7.460342395992306e+01*PETSC_i,
205: -1.734353918633177e+02 },
206: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
207: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
208: 5.648485971016893e+00 },
209: m1r3[2] = { 2.533333333333333e+01 - 2.733333333333333e+01*PETSC_i,
210: 2.533333333333333e+01 + 2.733333333333333e+01*PETSC_i},
211: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
212: 4.000000000000000e+00 - 2.000000000000000e+00*PETSC_i};
213: const PetscScalar /* m == k-1 */
214: m1remain5[2] = { 2.000000000000000e-01, 9.800000000000000e+00},
215: m1remain4[2] = {-2.500000000000000e-01, -7.750000000000000e+00},
216: m1remain3[2] = { 3.333333333333333e-01, 5.666666666666667e+00},
217: m1remain2[2] = {-0.5, -3.5},
218: remain3[4] = {1.0/6.0, 1.0/2.0, 1, 1},
219: remain2[3] = {1.0/2.0, 1, 1};
222: if (query) { /* query about buffer's size */
223: if (m==k+1) {
224: *remain = 0;
225: *r = *q = k+1;
226: return(0); /* quick return */
227: }
228: if (m==k-1) {
229: *remain = 2;
230: if (k==5) *r = *q = 4;
231: else if (k==4) *r = *q = 3;
232: else if (k==3) *r = *q = 2;
233: else if (k==2) *r = *q = 1;
234: }
235: if (m==0) {
236: *r = *q = 0;
237: *remain = k+1;
238: }
239: } else {
240: if (m==k+1) {
241: if (k==4) {
242: for (i=0;i<5;i++) { r[i] = p1r4[i]; q[i] = p1q4[i]; }
243: } else if (k==3) {
244: for (i=0;i<4;i++) { r[i] = p1r3[i]; q[i] = p1q3[i]; }
245: } else if (k==2) {
246: for (i=0;i<3;i++) { r[i] = p1r2[i]; q[i] = p1q2[i]; }
247: } else if (k==1) {
248: for (i=0;i<2;i++) { r[i] = p1r1[i]; q[i] = p1q1[i]; }
249: }
250: return(0); /* quick return */
251: }
252: if (m==k-1) {
253: if (k==5) {
254: for (i=0;i<4;i++) { r[i] = m1r5[i]; q[i] = m1q5[i]; }
255: for (i=0;i<2;i++) remain[i] = m1remain5[i];
256: } else if (k==4) {
257: for (i=0;i<3;i++) { r[i] = m1r4[i]; q[i] = m1q4[i]; }
258: for (i=0;i<2;i++) remain[i] = m1remain4[i];
259: } else if (k==3) {
260: for (i=0;i<2;i++) { r[i] = m1r3[i]; q[i] = m1q3[i]; remain[i] = m1remain3[i]; }
261: } else if (k==2) {
262: r[0] = -13.5; q[0] = 3;
263: for (i=0;i<2;i++) remain[i] = m1remain2[i];
264: }
265: }
266: if (m==0) {
267: r = q = 0;
268: if (k==3) {
269: for (i=0;i<4;i++) remain[i] = remain3[i];
270: } else if (k==2) {
271: for (i=0;i<3;i++) remain[i] = remain2[i];
272: }
273: }
274: }
275: return(0);
276: }
278: /*
279: * Product form coefficients.
280: * If query, the function returns the size necessary to store the coefficients.
281: */
282: static PetscErrorCode getcoeffsproduct(PetscInt k,PetscInt m,PetscComplex *p,PetscComplex *q,PetscComplex *mult,PetscBool query)
283: {
284: PetscInt i;
285: const PetscComplex /* m == k+1 */
286: p1p4[4] = {-5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
287: -5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
288: -6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
289: -6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
290: p1q4[5] = { 3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
291: 3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
292: 6.286704751729261e+00 ,
293: 5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
294: 5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
295: p1p3[3] = {-4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
296: -4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
297: -5.648485971016893e+00 },
298: p1q3[4] = { 3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
299: 3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
300: 4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
301: 4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
302: p1p2[2] = {-4.00000000000000e+00 + 2.000000000000000e+00*PETSC_i,
303: -4.00000000000000e+00 - 2.000000000000000e+00*PETSC_i},
304: p1q2[3] = { 2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
305: 2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
306: 3.637834252744491e+00 },
307: p1q1[2] = { 2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
308: 2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
309: const PetscComplex /* m == k-1 */
310: m1p5[5] = {-3.655694325463550e+00 + 6.543736899360086e+00*PETSC_i,
311: -3.655694325463550e+00 - 6.543736899360086e+00*PETSC_i,
312: -6.286704751729261e+00 ,
313: -5.700953298671832e+00 + 3.210265600308496e+00*PETSC_i,
314: -5.700953298671832e+00 - 3.210265600308496e+00*PETSC_i},
315: m1q5[4] = { 5.203941240131764e+00 + 5.805856841805367e+00*PETSC_i,
316: 5.203941240131764e+00 - 5.805856841805367e+00*PETSC_i,
317: 6.796058759868242e+00 + 1.886649260140217e+00*PETSC_i,
318: 6.796058759868242e+00 - 1.886649260140217e+00*PETSC_i},
319: m1p4[4] = {-3.212806896871536e+00 + 4.773087433276636e+00*PETSC_i,
320: -3.212806896871536e+00 - 4.773087433276636e+00*PETSC_i,
321: -4.787193103128464e+00 + 1.567476416895212e+00*PETSC_i,
322: -4.787193103128464e+00 - 1.567476416895212e+00*PETSC_i},
323: m1q4[3] = { 4.675757014491557e+00 + 3.913489560603711e+00*PETSC_i,
324: 4.675757014491557e+00 - 3.913489560603711e+00*PETSC_i,
325: 5.648485971016893e+00 },
326: m1p3[3] = {-2.681082873627756e+00 + 3.050430199247411e+00*PETSC_i,
327: -2.681082873627756e+00 - 3.050430199247411e+00*PETSC_i,
328: -3.637834252744491e+00 },
329: m1q3[2] = { 4.000000000000000e+00 + 2.000000000000000e+00*PETSC_i,
330: 4.000000000000000e+00 - 2.000000000000001e+00*PETSC_i},
331: m1p2[2] = {-2.000000000000000e+00 + 1.414213562373095e+00*PETSC_i,
332: -2.000000000000000e+00 - 1.414213562373095e+00*PETSC_i};
335: if (query) {
336: if (m == k+1) {
337: *mult = 1;
338: *p = k;
339: *q = k+1;
340: return(0);
341: }
342: if (m==k-1) {
343: *mult = 1;
344: *p = k;
345: *q = k-1;
346: }
347: } else {
348: if (m == k+1) {
349: *mult = PetscPowInt(-1,m);
350: *mult *= m;
351: if (k==4) {
352: for (i=0;i<4;i++) { p[i] = p1p4[i]; q[i] = p1q4[i]; }
353: q[4] = p1q4[4];
354: } else if (k==3) {
355: for (i=0;i<3;i++) { p[i] = p1p3[i]; q[i] = p1q3[i]; }
356: q[3] = p1q3[3];
357: } else if (k==2) {
358: for (i=0;i<2;i++) { p[i] = p1p2[i]; q[i] = p1q2[i]; }
359: q[2] = p1q2[2];
360: } else if (k==1) {
361: p[0] = -3;
362: for (i=0;i<2;i++) q[i] = p1q1[i];
363: }
364: return(0);
365: }
366: if (m==k-1) {
367: *mult = PetscPowInt(-1,m);
368: *mult /= k;
369: if (k==5) {
370: for (i=0;i<4;i++) { p[i] = m1p5[i]; q[i] = m1q5[i]; }
371: p[4] = m1p5[4];
372: } else if (k==4) {
373: for (i=0;i<3;i++) { p[i] = m1p4[i]; q[i] = m1q4[i]; }
374: p[3] = m1p4[3];
375: } else if (k==3) {
376: for (i=0;i<2;i++) { p[i] = m1p3[i]; q[i] = m1q3[i]; }
377: p[2] = m1p3[2];
378: } else if (k==2) {
379: for (i=0;i<2;i++) p[i] = m1p2[i];
380: q[0] = 3;
381: }
382: }
383: }
384: return(0);
385: }
386: #endif /* PETSC_HAVE_COMPLEX */
388: #if defined(PETSC_USE_COMPLEX)
389: static PetscErrorCode getisreal(PetscInt n,PetscComplex *a,PetscBool *result)
390: {
391: PetscInt i;
394: *result=PETSC_TRUE;
395: for (i=0;i<n&&*result;i++) {
396: if (PetscImaginaryPartComplex(a[i])) *result=PETSC_FALSE;
397: }
398: return(0);
399: }
400: #endif
402: /*
403: * Matrix exponential implementation based on algorithm and matlab code by Stefan Guettel
404: * and Yuji Nakatsukasa
405: *
406: * Stefan Guettel and Yuji Nakatsukasa, "Scaled and Squared Subdiagonal Pade
407: * Approximation for the Matrix Exponential",
408: * SIAM J. Matrix Anal. Appl. 37(1):145-170, 2016.
409: * https://doi.org/10.1137/15M1027553
410: */
411: PetscErrorCode FNEvaluateFunctionMat_Exp_GuettelNakatsukasa(FN fn,Mat A,Mat B)
412: {
413: #if !defined(PETSC_HAVE_COMPLEX)
415: SETERRQ(PETSC_COMM_SELF,1,"This function requires C99 or C++ complex support");
416: #else
417: PetscInt i,j,n_,s,k,m,mod;
418: PetscBLASInt n,n2,irsize,rsizediv2,ipsize,iremainsize,info,*piv,minlen,lwork,one=1;
419: PetscReal nrm,shift;
420: #if defined(PETSC_USE_COMPLEX) || defined(PETSC_HAVE_ESSL)
421: PetscReal *rwork=NULL;
422: #endif
423: PetscComplex *As,*RR,*RR2,*expmA,*expmA2,*Maux,*Maux2,rsize,*r,psize,*p,remainsize,*remainterm,*rootp,*rootq,mult=0.0,scale,cone=1.0,czero=0.0,*aux;
424: PetscScalar *Aa,*Ba,*Ba2,*sMaux,*wr,*wi,expshift,sone=1.0,szero=0.0,*saux;
426: PetscBool isreal;
427: #if defined(PETSC_HAVE_ESSL)
428: PetscScalar sdummy,*wri;
429: PetscBLASInt idummy,io=0;
430: #else
431: PetscBLASInt query=-1;
432: PetscScalar work1,*work;
433: #endif
436: MatGetSize(A,&n_,NULL);
437: PetscBLASIntCast(n_,&n);
438: MatDenseGetArray(A,&Aa);
439: MatDenseGetArray(B,&Ba);
440: Ba2 = Ba;
441: PetscBLASIntCast(n*n,&n2);
443: PetscMalloc2(n2,&sMaux,n2,&Maux);
444: Maux2 = Maux;
445: PetscMalloc2(n,&wr,n,&wi);
446: PetscArraycpy(sMaux,Aa,n2);
447: /* estimate rightmost eigenvalue and shift A with it */
448: #if !defined(PETSC_HAVE_ESSL)
449: #if !defined(PETSC_USE_COMPLEX)
450: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,&work1,&query,&info));
451: SlepcCheckLapackInfo("geev",info);
452: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
453: PetscMalloc1(lwork,&work);
454: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,sMaux,&n,wr,wi,NULL,&n,NULL,&n,work,&lwork,&info));
455: PetscFree(work);
456: #else
457: PetscArraycpy(Maux,Aa,n2);
458: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,&work1,&query,rwork,&info));
459: SlepcCheckLapackInfo("geev",info);
460: PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
461: PetscMalloc2(2*n,&rwork,lwork,&work);
462: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_("N","N",&n,Maux,&n,wr,NULL,&n,NULL,&n,work,&lwork,rwork,&info));
463: PetscFree2(rwork,work);
464: #endif
465: SlepcCheckLapackInfo("geev",info);
466: #else /* defined(PETSC_HAVE_ESSL) */
467: PetscBLASIntCast(4*n,&lwork);
468: PetscMalloc2(lwork,&rwork,2*n,&wri);
469: #if !defined(PETSC_USE_COMPLEX)
470: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,sMaux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
471: for (i=0;i<n;i++) {
472: wr[i] = wri[2*i];
473: wi[i] = wri[2*i+1];
474: }
475: #else
476: PetscStackCallBLAS("LAPACKgeev",LAPACKgeev_(&io,Maux,&n,wri,&sdummy,&idummy,&idummy,&n,rwork,&lwork));
477: for (i=0;i<n;i++) wr[i] = wri[i];
478: #endif
479: PetscFree2(rwork,wri);
480: #endif
481: PetscLogFlops(25.0*n*n*n+(n*n*n)/3.0+1.0*n*n*n);
483: shift = PetscRealPart(wr[0]);
484: for (i=1;i<n;i++) {
485: if (PetscRealPart(wr[i]) > shift) shift = PetscRealPart(wr[i]);
486: }
487: PetscFree2(wr,wi);
488: /* shift so that largest real part is (about) 0 */
489: PetscArraycpy(sMaux,Aa,n2);
490: for (i=0;i<n;i++) {
491: sMaux[i+i*n] -= shift;
492: }
493: PetscLogFlops(1.0*n);
494: #if defined(PETSC_USE_COMPLEX)
495: PetscArraycpy(Maux,Aa,n2);
496: for (i=0;i<n;i++) {
497: Maux[i+i*n] -= shift;
498: }
499: PetscLogFlops(1.0*n);
500: #endif
502: /* estimate norm(A) and select the scaling factor */
503: nrm = LAPACKlange_("O",&n,&n,sMaux,&n,NULL);
504: PetscLogFlops(1.0*n*n);
505: sexpm_params(nrm,&s,&k,&m);
506: if (s==0 && k==1 && m==0) { /* exp(A) = I+A to eps! */
507: expshift = PetscExpReal(shift);
508: for (i=0;i<n;i++) sMaux[i+i*n] += 1.0;
509: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,sMaux,&one));
510: PetscLogFlops(1.0*(n+n2));
511: PetscArraycpy(Ba,sMaux,n2);
512: PetscFree2(sMaux,Maux);
513: MatDenseRestoreArray(A,&Aa);
514: MatDenseRestoreArray(B,&Ba);
515: return(0); /* quick return */
516: }
518: PetscMalloc4(n2,&expmA,n2,&As,n2,&RR,n,&piv);
519: expmA2 = expmA; RR2 = RR;
520: /* scale matrix */
521: #if !defined(PETSC_USE_COMPLEX)
522: for (i=0;i<n2;i++) {
523: As[i] = sMaux[i];
524: }
525: #else
526: PetscArraycpy(As,sMaux,n2);
527: #endif
528: scale = 1.0/PetscPowRealInt(2.0,s);
529: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&scale,As,&one));
530: SlepcLogFlopsComplex(1.0*n2);
532: /* evaluate Pade approximant (partial fraction or product form) */
533: if (fn->method==3 || !m) { /* partial fraction */
534: getcoeffs(k,m,&rsize,&psize,&remainsize,PETSC_TRUE);
535: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
536: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
537: PetscBLASIntCast((PetscInt)PetscRealPartComplex(remainsize),&iremainsize);
538: PetscMalloc3(irsize,&r,ipsize,&p,iremainsize,&remainterm);
539: getcoeffs(k,m,r,p,remainterm,PETSC_FALSE);
541: PetscArrayzero(expmA,n2);
542: #if !defined(PETSC_USE_COMPLEX)
543: isreal = PETSC_TRUE;
544: #else
545: getisreal(n2,Maux,&isreal);
546: #endif
547: if (isreal) {
548: rsizediv2 = irsize/2;
549: for (i=0;i<rsizediv2;i++) { /* use partial fraction to get R(As) */
550: PetscArraycpy(Maux,As,n2);
551: PetscArrayzero(RR,n2);
552: for (j=0;j<n;j++) {
553: Maux[j+j*n] -= p[2*i];
554: RR[j+j*n] = r[2*i];
555: }
556: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
557: SlepcCheckLapackInfo("gesv",info);
558: for (j=0;j<n2;j++) {
559: expmA[j] += RR[j] + PetscConj(RR[j]);
560: }
561: /* loop(n) + gesv + loop(n2) */
562: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+2.0*n2);
563: }
565: mod = ipsize % 2;
566: if (mod) {
567: PetscArraycpy(Maux,As,n2);
568: PetscArrayzero(RR,n2);
569: for (j=0;j<n;j++) {
570: Maux[j+j*n] -= p[ipsize-1];
571: RR[j+j*n] = r[irsize-1];
572: }
573: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
574: SlepcCheckLapackInfo("gesv",info);
575: for (j=0;j<n2;j++) {
576: expmA[j] += RR[j];
577: }
578: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
579: }
580: } else { /* complex */
581: for (i=0;i<irsize;i++) { /* use partial fraction to get R(As) */
582: PetscArraycpy(Maux,As,n2);
583: PetscArrayzero(RR,n2);
584: for (j=0;j<n;j++) {
585: Maux[j+j*n] -= p[i];
586: RR[j+j*n] = r[i];
587: }
588: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,Maux,&n,piv,RR,&n,&info));
589: SlepcCheckLapackInfo("gesv",info);
590: for (j=0;j<n2;j++) {
591: expmA[j] += RR[j];
592: }
593: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n)+1.0*n2);
594: }
595: }
596: for (i=0;i<iremainsize;i++) {
597: if (!i) {
598: PetscArrayzero(RR,n2);
599: for (j=0;j<n;j++) {
600: RR[j+j*n] = remainterm[iremainsize-1];
601: }
602: } else {
603: PetscArraycpy(RR,As,n2);
604: for (j=1;j<i;j++) {
605: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,RR,&n,&czero,Maux,&n));
606: SWAP(RR,Maux,aux);
607: SlepcLogFlopsComplex(2.0*n*n*n);
608: }
609: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&remainterm[iremainsize-1-i],RR,&one));
610: SlepcLogFlopsComplex(1.0*n2);
611: }
612: for (j=0;j<n2;j++) {
613: expmA[j] += RR[j];
614: }
615: SlepcLogFlopsComplex(1.0*n2);
616: }
617: PetscFree3(r,p,remainterm);
618: } else { /* product form, default */
619: getcoeffsproduct(k,m,&rsize,&psize,&mult,PETSC_TRUE);
620: PetscBLASIntCast((PetscInt)PetscRealPartComplex(rsize),&irsize);
621: PetscBLASIntCast((PetscInt)PetscRealPartComplex(psize),&ipsize);
622: PetscMalloc2(irsize,&rootp,ipsize,&rootq);
623: getcoeffsproduct(k,m,rootp,rootq,&mult,PETSC_FALSE);
625: PetscArrayzero(expmA,n2);
626: for (i=0;i<n;i++) { /* initialize */
627: expmA[i+i*n] = 1.0;
628: }
629: minlen = PetscMin(irsize,ipsize);
630: for (i=0;i<minlen;i++) {
631: PetscArraycpy(RR,As,n2);
632: for (j=0;j<n;j++) {
633: RR[j+j*n] -= rootp[i];
634: }
635: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
636: SWAP(expmA,Maux,aux);
637: PetscArraycpy(RR,As,n2);
638: for (j=0;j<n;j++) {
639: RR[j+j*n] -= rootq[i];
640: }
641: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
642: SlepcCheckLapackInfo("gesv",info);
643: /* loop(n) + gemm + loop(n) + gesv */
644: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n)+1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
645: }
646: /* extra numerator */
647: for (i=minlen;i<irsize;i++) {
648: PetscArraycpy(RR,As,n2);
649: for (j=0;j<n;j++) {
650: RR[j+j*n] -= rootp[i];
651: }
652: PetscStackCallBLAS("BLASCOMPLEXgemm",BLASCOMPLEXgemm_("N","N",&n,&n,&n,&cone,RR,&n,expmA,&n,&czero,Maux,&n));
653: SWAP(expmA,Maux,aux);
654: SlepcLogFlopsComplex(1.0*n+2.0*n*n*n);
655: }
656: /* extra denominator */
657: for (i=minlen;i<ipsize;i++) {
658: PetscArraycpy(RR,As,n2);
659: for (j=0;j<n;j++) RR[j+j*n] -= rootq[i];
660: PetscStackCallBLAS("LAPACKCOMPLEXgesv",LAPACKCOMPLEXgesv_(&n,&n,RR,&n,piv,expmA,&n,&info));
661: SlepcCheckLapackInfo("gesv",info);
662: SlepcLogFlopsComplex(1.0*n+(2.0*n*n*n/3.0+2.0*n*n*n));
663: }
664: PetscStackCallBLAS("BLASCOMPLEXscal",BLASCOMPLEXscal_(&n2,&mult,expmA,&one));
665: SlepcLogFlopsComplex(1.0*n2);
666: PetscFree2(rootp,rootq);
667: }
669: #if !defined(PETSC_USE_COMPLEX)
670: for (i=0;i<n2;i++) {
671: Ba2[i] = PetscRealPartComplex(expmA[i]);
672: }
673: #else
674: PetscArraycpy(Ba2,expmA,n2);
675: #endif
677: /* perform repeated squaring */
678: for (i=0;i<s;i++) { /* final squaring */
679: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,Ba2,&n,Ba2,&n,&szero,sMaux,&n));
680: SWAP(Ba2,sMaux,saux);
681: PetscLogFlops(2.0*n*n*n);
682: }
683: if (Ba2!=Ba) {
684: PetscArraycpy(Ba,Ba2,n2);
685: sMaux = Ba2;
686: }
687: expshift = PetscExpReal(shift);
688: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&expshift,Ba,&one));
689: PetscLogFlops(1.0*n2);
691: /* restore pointers */
692: Maux = Maux2; expmA = expmA2; RR = RR2;
693: PetscFree2(sMaux,Maux);
694: PetscFree4(expmA,As,RR,piv);
695: MatDenseRestoreArray(A,&Aa);
696: MatDenseRestoreArray(B,&Ba);
697: return(0);
698: #endif
699: }
701: #define SMALLN 100
703: /*
704: * Function needed to compute optimal parameters (required workspace is 3*n*n)
705: */
706: static PetscInt ell(PetscBLASInt n,PetscScalar *A,PetscReal coeff,PetscInt m,PetscScalar *work,PetscRandom rand)
707: {
708: PetscScalar *Ascaled=work;
709: PetscReal nrm,alpha,beta,rwork[1];
710: PetscInt t;
711: PetscBLASInt i,j;
715: beta = PetscPowReal(coeff,1.0/(2*m+1));
716: for (i=0;i<n;i++)
717: for (j=0;j<n;j++)
718: Ascaled[i+j*n] = beta*PetscAbsScalar(A[i+j*n]);
719: nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
720: PetscLogFlops(2.0*n*n);
721: SlepcNormAm(n,Ascaled,2*m+1,work+n*n,rand,&alpha);
722: alpha /= nrm;
723: t = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(2.0*alpha/PETSC_MACHINE_EPSILON)/PetscLogReal(2.0)/(2*m)),0);
724: PetscFunctionReturn(t);
725: }
727: /*
728: * Compute scaling parameter (s) and order of Pade approximant (m) (required workspace is 4*n*n)
729: */
730: static PetscErrorCode expm_params(PetscInt n,PetscScalar **Apowers,PetscInt *s,PetscInt *m,PetscScalar *work)
731: {
732: PetscErrorCode ierr;
733: PetscScalar sfactor,sone=1.0,szero=0.0,*A=Apowers[0],*Ascaled;
734: PetscReal d4,d6,d8,d10,eta1,eta3,eta4,eta5,rwork[1];
735: PetscBLASInt n_,n2,one=1;
736: PetscRandom rand;
737: const PetscReal coeff[5] = { 9.92063492063492e-06, 9.94131285136576e-11, /* backward error function */
738: 2.22819456055356e-16, 1.69079293431187e-22, 8.82996160201868e-36 };
739: const PetscReal theta[5] = { 1.495585217958292e-002, /* m = 3 */
740: 2.539398330063230e-001, /* m = 5 */
741: 9.504178996162932e-001, /* m = 7 */
742: 2.097847961257068e+000, /* m = 9 */
743: 5.371920351148152e+000 }; /* m = 13 */
746: *s = 0;
747: *m = 13;
748: PetscBLASIntCast(n,&n_);
749: PetscRandomCreate(PETSC_COMM_SELF,&rand);
750: d4 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[2],&n_,rwork),1.0/4.0);
751: if (d4==0.0) { /* safeguard for the case A = 0 */
752: *m = 3;
753: goto done;
754: }
755: d6 = PetscPowReal(LAPACKlange_("O",&n_,&n_,Apowers[3],&n_,rwork),1.0/6.0);
756: PetscLogFlops(2.0*n*n);
757: eta1 = PetscMax(d4,d6);
758: if (eta1<=theta[0] && !ell(n_,A,coeff[0],3,work,rand)) {
759: *m = 3;
760: goto done;
761: }
762: if (eta1<=theta[1] && !ell(n_,A,coeff[1],5,work,rand)) {
763: *m = 5;
764: goto done;
765: }
766: if (n<SMALLN) {
767: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[2],&n_,&szero,work,&n_));
768: d8 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/8.0);
769: PetscLogFlops(2.0*n*n*n+1.0*n*n);
770: } else {
771: SlepcNormAm(n_,Apowers[2],2,work,rand,&d8);
772: d8 = PetscPowReal(d8,1.0/8.0);
773: }
774: eta3 = PetscMax(d6,d8);
775: if (eta3<=theta[2] && !ell(n_,A,coeff[2],7,work,rand)) {
776: *m = 7;
777: goto done;
778: }
779: if (eta3<=theta[3] && !ell(n_,A,coeff[3],9,work,rand)) {
780: *m = 9;
781: goto done;
782: }
783: if (n<SMALLN) {
784: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[2],&n_,Apowers[3],&n_,&szero,work,&n_));
785: d10 = PetscPowReal(LAPACKlange_("O",&n_,&n_,work,&n_,rwork),1.0/10.0);
786: PetscLogFlops(2.0*n*n*n+1.0*n*n);
787: } else {
788: SlepcNormAm(n_,Apowers[1],5,work,rand,&d10);
789: d10 = PetscPowReal(d10,1.0/10.0);
790: }
791: eta4 = PetscMax(d8,d10);
792: eta5 = PetscMin(eta3,eta4);
793: *s = PetscMax((PetscInt)PetscCeilReal(PetscLogReal(eta5/theta[4])/PetscLogReal(2.0)),0);
794: if (*s) {
795: Ascaled = work+3*n*n;
796: n2 = n_*n_;
797: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,A,&one,Ascaled,&one));
798: sfactor = PetscPowRealInt(2.0,-(*s));
799: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&sfactor,Ascaled,&one));
800: PetscLogFlops(1.0*n*n);
801: } else Ascaled = A;
802: *s += ell(n_,Ascaled,coeff[4],13,work,rand);
803: done:
804: PetscRandomDestroy(&rand);
805: return(0);
806: }
808: /*
809: * Matrix exponential implementation based on algorithm and matlab code by N. Higham and co-authors
810: *
811: * N. J. Higham, "The scaling and squaring method for the matrix exponential
812: * revisited", SIAM J. Matrix Anal. Appl. 26(4):1179-1193, 2005.
813: */
814: PetscErrorCode FNEvaluateFunctionMat_Exp_Higham(FN fn,Mat A,Mat B)
815: {
816: PetscErrorCode ierr;
817: PetscBLASInt n_,n2,*ipiv,info,one=1;
818: PetscInt n,m,j,s;
819: PetscScalar scale,smone=-1.0,sone=1.0,stwo=2.0,szero=0.0;
820: PetscScalar *Aa,*Ba,*Apowers[5],*Q,*P,*W,*work,*aux;
821: const PetscScalar *c;
822: const PetscScalar c3[4] = { 120, 60, 12, 1 };
823: const PetscScalar c5[6] = { 30240, 15120, 3360, 420, 30, 1 };
824: const PetscScalar c7[8] = { 17297280, 8648640, 1995840, 277200, 25200, 1512, 56, 1 };
825: const PetscScalar c9[10] = { 17643225600, 8821612800, 2075673600, 302702400, 30270240,
826: 2162160, 110880, 3960, 90, 1 };
827: const PetscScalar c13[14] = { 64764752532480000, 32382376266240000, 7771770303897600,
828: 1187353796428800, 129060195264000, 10559470521600,
829: 670442572800, 33522128640, 1323241920,
830: 40840800, 960960, 16380, 182, 1 };
833: MatDenseGetArray(A,&Aa);
834: MatDenseGetArray(B,&Ba);
835: MatGetSize(A,&n,NULL);
836: PetscBLASIntCast(n,&n_);
837: n2 = n_*n_;
838: PetscMalloc2(8*n*n,&work,n,&ipiv);
840: /* Matrix powers */
841: Apowers[0] = work; /* Apowers[0] = A */
842: Apowers[1] = Apowers[0] + n*n; /* Apowers[1] = A^2 */
843: Apowers[2] = Apowers[1] + n*n; /* Apowers[2] = A^4 */
844: Apowers[3] = Apowers[2] + n*n; /* Apowers[3] = A^6 */
845: Apowers[4] = Apowers[3] + n*n; /* Apowers[4] = A^8 */
847: PetscArraycpy(Apowers[0],Aa,n2);
848: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,Apowers[0],&n_,&szero,Apowers[1],&n_));
849: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[1],&n_,&szero,Apowers[2],&n_));
850: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[2],&n_,&szero,Apowers[3],&n_));
851: PetscLogFlops(6.0*n*n*n);
853: /* Compute scaling parameter and order of Pade approximant */
854: expm_params(n,Apowers,&s,&m,Apowers[4]);
856: if (s) { /* rescale */
857: for (j=0;j<4;j++) {
858: scale = PetscPowRealInt(2.0,-PetscMax(2*j,1)*s);
859: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&scale,Apowers[j],&one));
860: }
861: PetscLogFlops(4.0*n*n);
862: }
864: /* Evaluate the Pade approximant */
865: switch (m) {
866: case 3: c = c3; break;
867: case 5: c = c5; break;
868: case 7: c = c7; break;
869: case 9: c = c9; break;
870: case 13: c = c13; break;
871: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
872: }
873: P = Ba;
874: Q = Apowers[4] + n*n;
875: W = Q + n*n;
876: switch (m) {
877: case 3:
878: case 5:
879: case 7:
880: case 9:
881: if (m==9) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[1],&n_,Apowers[3],&n_,&szero,Apowers[4],&n_));
882: PetscArrayzero(P,n2);
883: PetscArrayzero(Q,n2);
884: for (j=0;j<n;j++) {
885: P[j+j*n] = c[1];
886: Q[j+j*n] = c[0];
887: }
888: for (j=m;j>=3;j-=2) {
889: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j],Apowers[(j+1)/2-1],&one,P,&one));
890: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[j-1],Apowers[(j+1)/2-1],&one,Q,&one));
891: PetscLogFlops(4.0*n*n);
892: }
893: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,P,&n_,&szero,W,&n_));
894: PetscLogFlops(2.0*n*n*n);
895: SWAP(P,W,aux);
896: break;
897: case 13:
898: /* P = A*(Apowers[3]*(c[13]*Apowers[3] + c[11]*Apowers[2] + c[9]*Apowers[1])
899: + c[7]*Apowers[3] + c[5]*Apowers[2] + c[3]*Apowers[1] + c[1]*I) */
900: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,P,&one));
901: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[13],P,&one));
902: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[11],Apowers[2],&one,P,&one));
903: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[9],Apowers[1],&one,P,&one));
904: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,P,&n_,&szero,W,&n_));
905: PetscLogFlops(5.0*n*n+2.0*n*n*n);
906: PetscArrayzero(P,n2);
907: for (j=0;j<n;j++) P[j+j*n] = c[1];
908: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[7],Apowers[3],&one,P,&one));
909: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[5],Apowers[2],&one,P,&one));
910: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[3],Apowers[1],&one,P,&one));
911: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,P,&one,W,&one));
912: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[0],&n_,W,&n_,&szero,P,&n_));
913: PetscLogFlops(7.0*n*n+2.0*n*n*n);
914: /* Q = Apowers[3]*(c[12]*Apowers[3] + c[10]*Apowers[2] + c[8]*Apowers[1])
915: + c[6]*Apowers[3] + c[4]*Apowers[2] + c[2]*Apowers[1] + c[0]*I */
916: PetscStackCallBLAS("BLAScopy",BLAScopy_(&n2,Apowers[3],&one,Q,&one));
917: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&c[12],Q,&one));
918: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[10],Apowers[2],&one,Q,&one));
919: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[8],Apowers[1],&one,Q,&one));
920: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,Apowers[3],&n_,Q,&n_,&szero,W,&n_));
921: PetscLogFlops(5.0*n*n+2.0*n*n*n);
922: PetscArrayzero(Q,n2);
923: for (j=0;j<n;j++) Q[j+j*n] = c[0];
924: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[6],Apowers[3],&one,Q,&one));
925: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[4],Apowers[2],&one,Q,&one));
926: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&c[2],Apowers[1],&one,Q,&one));
927: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&sone,W,&one,Q,&one));
928: PetscLogFlops(7.0*n*n);
929: break;
930: default: SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONG,"Wrong value of m %d",m);
931: }
932: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&n2,&smone,P,&one,Q,&one));
933: PetscStackCallBLAS("LAPACKgesv",LAPACKgesv_(&n_,&n_,Q,&n_,ipiv,P,&n_,&info));
934: SlepcCheckLapackInfo("gesv",info);
935: PetscStackCallBLAS("BLASscal",BLASscal_(&n2,&stwo,P,&one));
936: for (j=0;j<n;j++) P[j+j*n] += 1.0;
937: PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n);
939: /* Squaring */
940: for (j=1;j<=s;j++) {
941: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n_,&n_,&n_,&sone,P,&n_,P,&n_,&szero,W,&n_));
942: SWAP(P,W,aux);
943: }
944: if (P!=Ba) { PetscArraycpy(Ba,P,n2); }
945: PetscLogFlops(2.0*n*n*n*s);
947: PetscFree2(work,ipiv);
948: MatDenseRestoreArray(A,&Aa);
949: MatDenseRestoreArray(B,&Ba);
950: return(0);
951: }
953: PetscErrorCode FNView_Exp(FN fn,PetscViewer viewer)
954: {
956: PetscBool isascii;
957: char str[50];
958: const char *methodname[] = {
959: "scaling & squaring, [m/m] Pade approximant (Higham)",
960: "scaling & squaring, [6/6] Pade approximant",
961: "scaling & squaring, subdiagonal Pade approximant (product form)",
962: "scaling & squaring, subdiagonal Pade approximant (partial fraction)"
963: };
964: const int nmeth=sizeof(methodname)/sizeof(methodname[0]);
967: PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
968: if (isascii) {
969: if (fn->beta==(PetscScalar)1.0) {
970: if (fn->alpha==(PetscScalar)1.0) {
971: PetscViewerASCIIPrintf(viewer," Exponential: exp(x)\n");
972: } else {
973: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
974: PetscViewerASCIIPrintf(viewer," Exponential: exp(%s*x)\n",str);
975: }
976: } else {
977: SlepcSNPrintfScalar(str,50,fn->beta,PETSC_TRUE);
978: if (fn->alpha==(PetscScalar)1.0) {
979: PetscViewerASCIIPrintf(viewer," Exponential: %s*exp(x)\n",str);
980: } else {
981: PetscViewerASCIIPrintf(viewer," Exponential: %s",str);
982: PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
983: SlepcSNPrintfScalar(str,50,fn->alpha,PETSC_TRUE);
984: PetscViewerASCIIPrintf(viewer,"*exp(%s*x)\n",str);
985: PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
986: }
987: }
988: if (fn->method<nmeth) {
989: PetscViewerASCIIPrintf(viewer," computing matrix functions with: %s\n",methodname[fn->method]);
990: }
991: }
992: return(0);
993: }
995: SLEPC_EXTERN PetscErrorCode FNCreate_Exp(FN fn)
996: {
998: fn->ops->evaluatefunction = FNEvaluateFunction_Exp;
999: fn->ops->evaluatederivative = FNEvaluateDerivative_Exp;
1000: fn->ops->evaluatefunctionmat[0] = FNEvaluateFunctionMat_Exp_Higham;
1001: fn->ops->evaluatefunctionmat[1] = FNEvaluateFunctionMat_Exp_Pade;
1002: fn->ops->evaluatefunctionmat[2] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* product form */
1003: fn->ops->evaluatefunctionmat[3] = FNEvaluateFunctionMat_Exp_GuettelNakatsukasa; /* partial fraction */
1004: fn->ops->view = FNView_Exp;
1005: return(0);
1006: }