Actual source code: fnutil.c
slepc-3.17.2 2022-08-09
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Utility subroutines common to several impls
*/

#include <slepc/private/fnimpl.h>
#include <slepcblaslapack.h>

/*
   Compute the square root of an upper quasi-triangular matrix T,
   using Higham's algorithm (LAA 88, 1987). T is overwritten with sqrtm(T).
*/
static PetscErrorCode SlepcMatDenseSqrt(PetscBLASInt n,PetscScalar *T,PetscBLASInt ld)
{
  PetscScalar  one=1.0,mone=-1.0;
  PetscReal    scal;
  PetscBLASInt i,j,si,sj,r,ione=1,info;
#if !defined(PETSC_USE_COMPLEX)
  PetscReal    alpha,theta,mu,mu2;
#endif

  PetscFunctionBegin;
  for (j=0;j<n;j++) {
#if defined(PETSC_USE_COMPLEX)
    sj = 1;
    T[j+j*ld] = PetscSqrtScalar(T[j+j*ld]);
#else
    sj = (j==n-1 || T[j+1+j*ld] == 0.0)? 1: 2;
    if (sj==1) {
      T[j+j*ld] = PetscSqrtReal(T[j+j*ld]);
    } else {
      /* square root of 2x2 block */
      theta = (T[j+j*ld]+T[j+1+(j+1)*ld])/2.0;
      mu    = (T[j+j*ld]-T[j+1+(j+1)*ld])/2.0;
      mu2   = -mu*mu-T[j+1+j*ld]*T[j+(j+1)*ld];
      mu    = PetscSqrtReal(mu2);
      if (theta>0.0) alpha = PetscSqrtReal((theta+PetscSqrtReal(theta*theta+mu2))/2.0);
      else alpha = mu/PetscSqrtReal(2.0*(-theta+PetscSqrtReal(theta*theta+mu2)));
      T[j+j*ld]       /= 2.0*alpha;
      T[j+1+(j+1)*ld] /= 2.0*alpha;
      T[j+(j+1)*ld]   /= 2.0*alpha;
      T[j+1+j*ld]     /= 2.0*alpha;
      T[j+j*ld]       += alpha-theta/(2.0*alpha);
      T[j+1+(j+1)*ld] += alpha-theta/(2.0*alpha);
    }
#endif
    for (i=j-1;i>=0;i--) {
#if defined(PETSC_USE_COMPLEX)
      si = 1;
#else
      si = (i==0 || T[i+(i-1)*ld] == 0.0)? 1: 2;
      if (si==2) i--;
#endif
      /* solve Sylvester equation of order si x sj */
      r = j-i-si;
      if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&si,&sj,&r,&mone,T+i+(i+si)*ld,&ld,T+i+si+j*ld,&ld,&one,T+i+j*ld,&ld));
      PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,&si,&sj,T+i+i*ld,&ld,T+j+j*ld,&ld,T+i+j*ld,&ld,&scal,&info));
      SlepcCheckLapackInfo("trsyl",info);
    }
    if (sj==2) j++;
  }
  PetscFunctionReturn(0);
}
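
For reference (not part of the source), the 2x2 branch above evaluates Higham's closed form: a diagonal block with complex conjugate eigenvalues $\theta\pm i\mu$ has principal square root

$$\sqrt{T_{jj}} \;=\; \alpha I + \frac{1}{2\alpha}\,(T_{jj}-\theta I),\qquad \alpha = \sqrt{\frac{\theta+\sqrt{\theta^2+\mu^2}}{2}},$$

where $\alpha$ is the real part of $\sqrt{\theta+i\mu}$; the else branch is an algebraically equivalent form of $\alpha$ that avoids cancellation when $\theta\le 0$. The identity $(T_{jj}-\theta I)^2=-\mu^2 I$ (Cayley-Hamilton) confirms that the result squares back to $T_{jj}$.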

#define BLOCKSIZE 64

/*
   Schur method for the square root of an upper quasi-triangular matrix T.
   T is overwritten with sqrtm(T).
   If firstonly then only the first column of T will contain relevant values.
*/
PetscErrorCode FNSqrtmSchur(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool firstonly)
{
  PetscBLASInt i,j,k,r,ione=1,sdim,lwork,*s,*p,info,bs=BLOCKSIZE;
  PetscScalar  *wr,*W,*Q,*work,one=1.0,zero=0.0,mone=-1.0;
  PetscInt     m,nblk;
  PetscReal    scal;
#if defined(PETSC_USE_COMPLEX)
  PetscReal    *rwork;
#else
  PetscReal    *wi;
#endif

  PetscFunctionBegin;
  m     = n;
  nblk  = (m+bs-1)/bs;
  lwork = 5*n;
  k     = firstonly? 1: n;

  /* compute Schur decomposition A*Q = Q*T */
#if !defined(PETSC_USE_COMPLEX)
  PetscMalloc7(m,&wr,m,&wi,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
  PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,wi,Q,&ld,work,&lwork,NULL,&info));
#else
  PetscMalloc7(m,&wr,m,&rwork,m*k,&W,m*m,&Q,lwork,&work,nblk,&s,nblk,&p);
  PetscStackCallBLAS("LAPACKgees",LAPACKgees_("V","N",NULL,&n,T,&ld,&sdim,wr,Q,&ld,work,&lwork,rwork,NULL,&info));
#endif
  SlepcCheckLapackInfo("gees",info);

  /* determine block sizes and positions, to avoid cutting 2x2 blocks */
  j = 0;
  p[j] = 0;
  do {
    s[j] = PetscMin(bs,n-p[j]);
#if !defined(PETSC_USE_COMPLEX)
    if (p[j]+s[j]!=n && T[p[j]+s[j]+(p[j]+s[j]-1)*ld]!=0.0) s[j]++;
#endif
    if (p[j]+s[j]==n) break;
    j++;
    p[j] = p[j-1]+s[j-1];
  } while (1);
  nblk = j+1;

  for (j=0;j<nblk;j++) {
    /* evaluate f(T_jj) */
    SlepcMatDenseSqrt(s[j],T+p[j]+p[j]*ld,ld);
    for (i=j-1;i>=0;i--) {
      /* solve Sylvester equation for block (i,j) */
      r = p[j]-p[i]-s[i];
      if (r) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",s+i,s+j,&r,&mone,T+p[i]+(p[i]+s[i])*ld,&ld,T+p[i]+s[i]+p[j]*ld,&ld,&one,T+p[i]+p[j]*ld,&ld));
      PetscStackCallBLAS("LAPACKtrsyl",LAPACKtrsyl_("N","N",&ione,s+i,s+j,T+p[i]+p[i]*ld,&ld,T+p[j]+p[j]*ld,&ld,T+p[i]+p[j]*ld,&ld,&scal,&info));
      SlepcCheckLapackInfo("trsyl",info);
    }
  }

  /* backtransform B = Q*T*Q' */
  PetscStackCallBLAS("BLASgemm",BLASgemm_("N","C",&n,&k,&n,&one,T,&ld,Q,&ld,&zero,W,&ld));
  PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&k,&n,&one,Q,&ld,W,&ld,&zero,T,&ld));

  /* flop count: Schur decomposition, triangular square root, and backtransform */
  PetscLogFlops(25.0*n*n*n+n*n*n/3.0+4.0*n*n*k);

#if !defined(PETSC_USE_COMPLEX)
  PetscFree7(wr,wi,W,Q,work,s,p);
#else
  PetscFree7(wr,rwork,W,Q,work,s,p);
#endif
  PetscFunctionReturn(0);
}
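
As a usage illustration, the following standalone driver is a minimal sketch, not part of the library. It assumes a SLEPc build is available, that FNSqrtmSchur is visible through the private header included below, and that passing fn=NULL is acceptable (the routine above does not dereference it). It computes the square root of a small symmetric positive definite matrix and checks the residual with plain loops.

/* sqrtschur_ex.c - illustrative driver; error checking omitted for brevity */
#include <slepc/private/fnimpl.h>

int main(int argc,char **argv)
{
  const PetscBLASInt n=4,ld=4;
  PetscScalar        A[16],X[16],r;
  PetscReal          res=0.0;
  PetscInt           i,j,l;

  SlepcInitialize(&argc,&argv,NULL,NULL);
  for (j=0;j<n;j++)
    for (i=0;i<n;i++) A[i+j*ld] = (i==j)? 4.0: 1.0;  /* SPD: 3*I plus all-ones */
  for (i=0;i<n*n;i++) X[i] = A[i];
  FNSqrtmSchur(NULL,n,X,ld,PETSC_FALSE);             /* X <- sqrtm(A) */
  for (j=0;j<n;j++)                                  /* res = ||X*X-A||_F */
    for (i=0;i<n;i++) {
      r = -A[i+j*ld];
      for (l=0;l<n;l++) r += X[i+l*ld]*X[l+j*ld];
      res += PetscAbsScalar(r)*PetscAbsScalar(r);
    }
  PetscPrintf(PETSC_COMM_SELF,"||X*X-A||_F = %g\n",(double)PetscSqrtReal(res));
  SlepcFinalize();
  return 0;
}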

#define DBMAXIT 25

/*
   Computes the principal square root of the matrix T using the product form
   of the Denman-Beavers iteration.
   T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
*/
PetscErrorCode FNSqrtmDenmanBeavers(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar        *Told,*M=NULL,*invM,*work,work1,prod,alpha;
  PetscScalar        szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sp25=0.25;
  PetscReal          tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT,rwork[1];
  PetscBLASInt       N,i,it,*piv=NULL,info,query=-1,lwork;
  const PetscBLASInt one=1;
  PetscBool          converged=PETSC_FALSE,scale=PETSC_FALSE;
  unsigned int       ftz;

  PetscFunctionBegin;
  N   = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
  SlepcSetFlushToZero(&ftz);

  /* query work size */
  PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M,&ld,piv,&work1,&query,&info));
  PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
  PetscMalloc5(lwork,&work,n,&piv,n*n,&Told,n*n,&M,n*n,&invM);
  PetscArraycpy(M,T,n*n);

  if (inv) { /* start recurrence with I instead of A */
    PetscArrayzero(T,n*n);
    for (i=0;i<n;i++) T[i+i*ld] += 1.0;
  }

  for (it=0;it<DBMAXIT && !converged;it++) {

    if (scale) { /* g = (abs(det(M)))^(-1/(2*n)) */
      PetscArraycpy(invM,M,n*n);
      PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
      SlepcCheckLapackInfo("getrf",info);
      prod = invM[0];
      for (i=1;i<n;i++) prod *= invM[i+i*ld];
      detM  = PetscAbsScalar(prod);
      g     = PetscPowReal(detM,-1.0/(2.0*n));
      alpha = g;
      PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,T,&one));
      alpha = g*g;
      PetscStackCallBLAS("BLASscal",BLASscal_(&N,&alpha,M,&one));
      PetscLogFlops(2.0*n*n*n/3.0+2.0*n*n);
    }

    PetscArraycpy(Told,T,n*n);
    PetscArraycpy(invM,M,n*n);

    PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,invM,&ld,piv,&info));
    SlepcCheckLapackInfo("getrf",info);
    PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,invM,&ld,piv,work,&lwork,&info));
    SlepcCheckLapackInfo("getri",info);
    PetscLogFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);

    for (i=0;i<n;i++) invM[i+i*ld] += 1.0;
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Told,&ld,invM,&ld,&szero,T,&ld));
    for (i=0;i<n;i++) invM[i+i*ld] -= 1.0;

    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&sone,invM,&one,M,&one));
    PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sp25,M,&one));
    for (i=0;i<n;i++) M[i+i*ld] -= 0.5;
    PetscLogFlops(2.0*n*n*n+2.0*n*n);

    Mres = LAPACKlange_("F",&n,&n,M,&n,rwork);
    for (i=0;i<n;i++) M[i+i*ld] += 1.0;

    if (scale) {
      /* reldiff = norm(T - Told,'fro')/norm(T,'fro') */
      PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,T,&one,Told,&one));
      fnormdiff = LAPACKlange_("F",&n,&n,Told,&n,rwork);
      fnormT    = LAPACKlange_("F",&n,&n,T,&n,rwork);
      PetscLogFlops(7.0*n*n);
      reldiff = fnormdiff/fnormT;
      PetscInfo(fn,"it: %" PetscBLASInt_FMT " reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)(tol*g));
      if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch off scaling */
    }

    if (Mres<=tol) converged = PETSC_TRUE;
  }

  PetscFree5(work,piv,Told,M,invM);
  SlepcResetFlushToZero(&ftz);
  PetscFunctionReturn(0);
}
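
In the notation of the product form of the Denman-Beavers iteration, the loop above computes per iteration (a summary of the code, with T playing the role of X_k and the product variable kept in M):

$$X_{k+1} = \tfrac{1}{2}\,X_k\left(I + M_k^{-1}\right),\qquad
M_{k+1} = \tfrac{1}{2}\left(I + \frac{M_k + M_k^{-1}}{2}\right),$$

with $M_0 = A$ and $X_0 = A$ (or $X_0 = I$ when inv is true), so that $M_k \to I$ and $X_k \to A^{1/2}$ (respectively $A^{-1/2}$). The stopping test monitors $\|M_{k+1}-I\|_F$ (Mres), and the optional determinantal scaling applies $X \leftarrow gX$, $M \leftarrow g^2 M$ with $g = |\det M_k|^{-1/(2n)}$ before each step.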

#define NSMAXIT 50

/*
   Computes the principal square root of the matrix A using the Newton-Schulz iteration.
   A is overwritten with sqrtm(A) or inv(sqrtm(A)) depending on flag inv.
*/
PetscErrorCode FNSqrtmNewtonSchulz(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar  *Y=A,*Yold,*Z,*Zold,*M;
  PetscScalar  szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
  PetscReal    sqrtnrm,tol,Yres=0.0,nrm,rwork[1],done=1.0;
  PetscBLASInt info,i,it,N,one=1,zero=0;
  PetscBool    converged=PETSC_FALSE;
  unsigned int ftz;

  PetscFunctionBegin;
  N   = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
  SlepcSetFlushToZero(&ftz);

  PetscMalloc4(N,&Yold,N,&Z,N,&Zold,N,&M);

  /* scale A so that ||I-A|| < 1 */
  PetscArraycpy(Z,A,N);
  for (i=0;i<n;i++) Z[i+i*ld] -= 1.0;
  nrm = LAPACKlange_("fro",&n,&n,Z,&n,rwork);
  sqrtnrm = PetscSqrtReal(nrm);
  PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,A,&N,&info));
  SlepcCheckLapackInfo("lascl",info);
  tol *= nrm;
  PetscInfo(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
  PetscLogFlops(2.0*n*n);

  /* Z = I */
  PetscArrayzero(Z,N);
  for (i=0;i<n;i++) Z[i+i*ld] = 1.0;

  for (it=0;it<NSMAXIT && !converged;it++) {
    /* Yold = Y, Zold = Z */
    PetscArraycpy(Yold,Y,N);
    PetscArraycpy(Zold,Z,N);

    /* M = (3*I-Zold*Yold) */
    PetscArrayzero(M,N);
    for (i=0;i<n;i++) M[i+i*ld] = sthree;
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&smone,Zold,&ld,Yold,&ld,&sone,M,&ld));

    /* Y = (1/2)*Yold*M, Z = (1/2)*M*Zold */
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,Yold,&ld,M,&ld,&szero,Y,&ld));
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&spfive,M,&ld,Zold,&ld,&szero,Z,&ld));

    /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smone,Y,&one,Yold,&one));
    Yres = LAPACKlange_("fro",&n,&n,Yold,&n,rwork);

    if (Yres<=tol) converged = PETSC_TRUE;
    PetscInfo(fn,"it: %" PetscBLASInt_FMT " res: %g\n",it,(double)Yres);
    PetscLogFlops(6.0*n*n*n+2.0*n*n);
  }

  /* undo scaling */
  if (inv) {
    PetscArraycpy(A,Z,N);
    PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&sqrtnrm,&done,&N,&one,A,&N,&info));
  } else PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&done,&sqrtnrm,&N,&one,A,&N,&info));
  SlepcCheckLapackInfo("lascl",info);

  PetscFree4(Yold,Z,Zold,M);
  SlepcResetFlushToZero(&ftz);
  PetscFunctionReturn(0);
}
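
The coupled recurrence implemented above is the standard Newton-Schulz iteration for the matrix square root, stated here for reference:

$$Y_{k+1} = \tfrac{1}{2}\,Y_k\left(3I - Z_kY_k\right),\qquad
Z_{k+1} = \tfrac{1}{2}\left(3I - Z_kY_k\right)Z_k,$$

with $Y_0 = A$, $Z_0 = I$, giving $Y_k \to A^{1/2}$ and $Z_k \to A^{-1/2}$. It is inversion-free but only locally convergent, requiring $\|I-A\| < 1$; this is the reason for the initial Frobenius-norm scaling of A above, which is undone at the end with the square root of the same factor (applied to Z instead when inv is true).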

#if defined(PETSC_HAVE_CUDA)
#include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
#include <slepccublas.h>

/*
 * Matrix square root by Newton-Schulz iteration. CUDA version.
 * Computes the principal square root of the matrix A using the
 * Newton-Schulz iteration. A is overwritten with sqrtm(A) or
 * inv(sqrtm(A)) depending on flag inv.
 */
PetscErrorCode FNSqrtmNewtonSchulz_CUDA(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar        *d_A,*d_Yold,*d_Z,*d_Zold,*d_M,alpha;
  PetscReal          nrm,sqrtnrm,tol,Yres=0.0;
  const PetscScalar  szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sthree=3.0;
  PetscInt           it;
  PetscBLASInt       N;
  const PetscBLASInt one=1;
  PetscBool          converged=PETSC_FALSE;
  cublasHandle_t     cublasv2handle;

  PetscFunctionBegin;
  PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
  PetscCUBLASGetHandle(&cublasv2handle);
  N   = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

  cudaMalloc((void **)&d_A,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_Yold,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_Z,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_Zold,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);

  PetscLogGpuTimeBegin();

  /* Y = A; */
  cudaMemcpy(d_A,A,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);
  /* Z = I; */
  cudaMemset(d_Z,0,sizeof(PetscScalar)*N);
  set_diagonal(n,d_Z,ld,sone);

  /* scale A so that ||I-A|| < 1 */
  cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Z,one);
  cublasXnrm2(cublasv2handle,N,d_Z,one,&nrm);
  sqrtnrm = PetscSqrtReal(nrm);
  alpha = 1.0/nrm;
  cublasXscal(cublasv2handle,N,&alpha,d_A,one);
  tol *= nrm;
  PetscInfo(fn,"||I-A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);
  PetscLogGpuFlops(2.0*n*n);

  /* Z = I; */
  cudaMemset(d_Z,0,sizeof(PetscScalar)*N);
  set_diagonal(n,d_Z,ld,sone);

  for (it=0;it<NSMAXIT && !converged;it++) {
    /* Yold = Y, Zold = Z */
    cudaMemcpy(d_Yold,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
    cudaMemcpy(d_Zold,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);

    /* M = (3*I - Zold*Yold) */
    cudaMemset(d_M,0,sizeof(PetscScalar)*N);
    set_diagonal(n,d_M,ld,sthree);
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&smone,d_Zold,ld,d_Yold,ld,&sone,d_M,ld);

    /* Y = (1/2) * Yold * M, Z = (1/2) * M * Zold */
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Yold,ld,d_M,ld,&szero,d_A,ld);
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_M,ld,d_Zold,ld,&szero,d_Z,ld);

    /* reldiff = norm(Y-Yold,'fro')/norm(Y,'fro') */
    cublasXaxpy(cublasv2handle,N,&smone,d_A,one,d_Yold,one);
    cublasXnrm2(cublasv2handle,N,d_Yold,one,&Yres);

    if (Yres<=tol) converged = PETSC_TRUE;
    PetscInfo(fn,"it: %" PetscInt_FMT " res: %g\n",it,(double)Yres);
    PetscLogGpuFlops(6.0*n*n*n+2.0*n*n);
  }

  /* undo scaling */
  if (inv) {
    alpha = 1.0/sqrtnrm;
    cublasXscal(cublasv2handle,N,&alpha,d_Z,one);
    cudaMemcpy(A,d_Z,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);
  } else {
    alpha = sqrtnrm;
    cublasXscal(cublasv2handle,N,&alpha,d_A,one);
    cudaMemcpy(A,d_A,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);
  }

  PetscLogGpuTimeEnd();
  cudaFree(d_A);
  cudaFree(d_Yold);
  cudaFree(d_Z);
  cudaFree(d_Zold);
  cudaFree(d_M);
  PetscFunctionReturn(0);
}

#if defined(PETSC_HAVE_MAGMA)
#include <slepcmagma.h>

/*
 * Matrix square root by product form of Denman-Beavers iteration. CUDA version.
 * Computes the principal square root of the matrix T using the product form
 * of the Denman-Beavers iteration.
 * T is overwritten with sqrtm(T) or inv(sqrtm(T)) depending on flag inv.
 */
PetscErrorCode FNSqrtmDenmanBeavers_CUDAm(FN fn,PetscBLASInt n,PetscScalar *T,PetscBLASInt ld,PetscBool inv)
{
  PetscScalar    *d_T,*d_Told,*d_M,*d_invM,*d_work,szero=0.0,sone=1.0,smone=-1.0,spfive=0.5,sneg_pfive=-0.5,sp25=0.25,alpha,prod;
  PetscReal      tol,Mres=0.0,detM,g,reldiff,fnormdiff,fnormT;
  PetscInt       i,it,lwork,nb;
  PetscBLASInt   N,one=1,*piv=NULL;
  PetscBool      converged=PETSC_FALSE,scale=PETSC_FALSE;
  cublasHandle_t cublasv2handle;

  PetscFunctionBegin;
  PetscDeviceInitialize(PETSC_DEVICE_CUDA); /* For CUDA event timers */
  PetscCUBLASGetHandle(&cublasv2handle);
  magma_init();
  N   = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

  /* query work size */
  nb    = magma_get_xgetri_nb(n);
  lwork = nb*n;
  PetscMalloc1(n,&piv);
  cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork);
  cudaMalloc((void **)&d_T,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_Told,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);
  cudaMalloc((void **)&d_invM,sizeof(PetscScalar)*N);

  PetscLogGpuTimeBegin();
  cudaMemcpy(d_M,T,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);
  if (inv) { /* start recurrence with I instead of A */
    cudaMemset(d_T,0,sizeof(PetscScalar)*N);
    set_diagonal(n,d_T,ld,1.0);
  } else cudaMemcpy(d_T,T,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);

  for (it=0;it<DBMAXIT && !converged;it++) {

    if (scale) { /* g = (abs(det(M)))^(-1/(2*n)); */
      cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
      PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_invM,ld,piv);

      /* XXX pending */
      // mult_diagonal(d_invM,n,ld,&detM);
      cudaMemcpy(T,d_invM,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);
      prod = T[0];
      for (i=1;i<n;i++) { prod *= T[i+i*ld]; }
      detM  = PetscAbsScalar(prod);
      g     = PetscPowReal(detM,-1.0/(2.0*n));
      alpha = g;
      cublasXscal(cublasv2handle,N,&alpha,d_T,one);
      alpha = g*g;
      cublasXscal(cublasv2handle,N,&alpha,d_M,one);
      PetscLogGpuFlops(2.0*n*n*n/3.0+2.0*n*n);
    }

    cudaMemcpy(d_Told,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);
    cudaMemcpy(d_invM,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);

    PetscCallMAGMA(magma_xgetrf_gpu,n,n,d_invM,ld,piv);
    PetscCallMAGMA(magma_xgetri_gpu,n,d_invM,ld,piv,d_work,lwork);
    PetscLogGpuFlops(2.0*n*n*n/3.0+4.0*n*n*n/3.0);

    shift_diagonal(n,d_invM,ld,sone);
    cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&spfive,d_Told,ld,d_invM,ld,&szero,d_T,ld);
    shift_diagonal(n,d_invM,ld,smone);

    cublasXaxpy(cublasv2handle,N,&sone,d_invM,one,d_M,one);
    cublasXscal(cublasv2handle,N,&sp25,d_M,one);
    shift_diagonal(n,d_M,ld,sneg_pfive);
    PetscLogGpuFlops(2.0*n*n*n+2.0*n*n);

    cublasXnrm2(cublasv2handle,N,d_M,one,&Mres);
    shift_diagonal(n,d_M,ld,sone);

    if (scale) {
      // reldiff = norm(T - Told,'fro')/norm(T,'fro');
      cublasXaxpy(cublasv2handle,N,&smone,d_T,one,d_Told,one);
      cublasXnrm2(cublasv2handle,N,d_Told,one,&fnormdiff);
      cublasXnrm2(cublasv2handle,N,d_T,one,&fnormT);
      PetscLogGpuFlops(7.0*n*n);
      reldiff = fnormdiff/fnormT;
      PetscInfo(fn,"it: %" PetscInt_FMT " reldiff: %g scale: %g tol*scale: %g\n",it,(double)reldiff,(double)g,(double)(tol*g));
      if (reldiff<1e-2) scale = PETSC_FALSE; /* Switch to no scaling. */
    }

    PetscInfo(fn,"it: %" PetscInt_FMT " Mres: %g\n",it,(double)Mres);
    if (Mres<=tol) converged = PETSC_TRUE;
  }

  cudaMemcpy(T,d_T,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);
  PetscLogGpuTimeEnd();
  PetscFree(piv);
  cudaFree(d_work);
  cudaFree(d_T);
  cudaFree(d_Told);
  cudaFree(d_M);
  cudaFree(d_invM);
  magma_finalize();
  PetscFunctionReturn(0);
}
#endif /* PETSC_HAVE_MAGMA */

#endif /* PETSC_HAVE_CUDA */

#define ITMAX 5
#define SWAP(a,b,t) {t=a;a=b;b=t;}

/*
   Estimate norm(A^m,1) by block 1-norm power method (required workspace is 11*n)
*/
static PetscErrorCode SlepcNormEst1(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
{
  PetscScalar  *X,*Y,*Z,*S,*S_old,*aux,val,sone=1.0,szero=0.0;
  PetscReal    est=0.0,est_old,vals[2]={0.0,0.0},*zvals,maxzval[2],raux;
  PetscBLASInt i,j,t=2,it=0,ind[2],est_j=0,m1;

  PetscFunctionBegin;
  X     = work;
  Y     = work + 2*n;
  Z     = work + 4*n;
  S     = work + 6*n;
  S_old = work + 8*n;
  zvals = (PetscReal*)(work + 10*n);

  for (i=0;i<n;i++) { /* X has columns of unit 1-norm */
    X[i] = 1.0/n;
    PetscRandomGetValue(rand,&val);
    if (PetscRealPart(val) < 0.5) X[i+n] = -1.0/n;
    else X[i+n] = 1.0/n;
  }
  for (i=0;i<t*n;i++) S[i] = 0.0;
  ind[0] = 0; ind[1] = 0;
  est_old = 0;
  while (1) {
    it++;
    for (j=0;j<m;j++) { /* Y = A^m*X */
      PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&t,&n,&sone,A,&n,X,&n,&szero,Y,&n));
      if (j<m-1) SWAP(X,Y,aux);
    }
    for (j=0;j<t;j++) { /* vals[j] = norm(Y(:,j),1) */
      vals[j] = 0.0;
      for (i=0;i<n;i++) vals[j] += PetscAbsScalar(Y[i+j*n]);
    }
    if (vals[0]<vals[1]) {
      SWAP(vals[0],vals[1],raux);
      m1 = 1;
    } else m1 = 0;
    est = vals[0];
    if (est>est_old || it==2) est_j = ind[m1];
    if (it>=2 && est<=est_old) {
      est = est_old;
      break;
    }
    est_old = est;
    if (it>ITMAX) break;
    SWAP(S,S_old,aux);
    for (i=0;i<t*n;i++) { /* S = sign(Y) */
      S[i] = (PetscRealPart(Y[i]) < 0.0)? -1.0: 1.0;
    }
    for (j=0;j<m;j++) { /* Z = (A^T)^m*S */
      PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&n,&t,&n,&sone,A,&n,S,&n,&szero,Z,&n));
      if (j<m-1) SWAP(S,Z,aux);
    }
    maxzval[0] = -1; maxzval[1] = -1;
    ind[0] = 0; ind[1] = 0;
    for (i=0;i<n;i++) { /* zvals[i] = norm(Z(i,:),inf) */
      zvals[i] = PetscMax(PetscAbsScalar(Z[i+0*n]),PetscAbsScalar(Z[i+1*n]));
      if (zvals[i]>maxzval[0]) {
        maxzval[0] = zvals[i];
        ind[0] = i;
      } else if (zvals[i]>maxzval[1]) {
        maxzval[1] = zvals[i];
        ind[1] = i;
      }
    }
    if (it>=2 && maxzval[0]==zvals[est_j]) break;
    for (i=0;i<t*n;i++) X[i] = 0.0;
    for (j=0;j<t;j++) X[ind[j]+j*n] = 1.0;
  }
  *nrm = est;
  /* Flop count is roughly (it * 2*m * t*gemv) = 4*it*m*t*n*n */
  PetscLogFlops(4.0*it*m*t*n*n);
  PetscFunctionReturn(0);
}
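
The estimator above is a block 1-norm power method with t = 2 columns (in the spirit of the Higham-Tisseur block algorithm): each sweep forms $Y = A^mX$ for an $n\times 2$ matrix $X$ with unit 1-norm columns, takes

$$\mathrm{est} = \max_j \|Y(:,j)\|_1 \;\le\; \|A^m\|_1,$$

as the current lower bound, and then uses $Z = (A^*)^m\,\mathrm{sign}(Y)$ to select the two unit coordinate vectors most likely to improve the estimate in the next sweep. Note that $A^m$ is never formed; each sweep needs only $2m$ products of $A$ or $A^*$ with an $n\times 2$ block, consistent with the flop count above.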

#define SMALLN 100

/*
   Estimate norm(A^m,1) (required workspace is 2*n*n)
*/
PetscErrorCode SlepcNormAm(PetscBLASInt n,PetscScalar *A,PetscInt m,PetscScalar *work,PetscRandom rand,PetscReal *nrm)
{
  PetscScalar  *v=work,*w=work+n*n,*aux,sone=1.0,szero=0.0;
  PetscReal    rwork[1],tmp;
  PetscBLASInt i,j,one=1;
  PetscBool    isrealpos=PETSC_TRUE;

  PetscFunctionBegin;
  if (n<SMALLN) { /* compute matrix power explicitly */
    if (m==1) {
      *nrm = LAPACKlange_("O",&n,&n,A,&n,rwork);
      PetscLogFlops(1.0*n*n);
    } else { /* m>=2 */
      PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,A,&n,&szero,v,&n));
      for (j=0;j<m-2;j++) {
        PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,A,&n,v,&n,&szero,w,&n));
        SWAP(v,w,aux);
      }
      *nrm = LAPACKlange_("O",&n,&n,v,&n,rwork);
      PetscLogFlops(2.0*n*n*n*(m-1)+1.0*n*n);
    }
  } else {
    for (i=0;i<n;i++)
      for (j=0;j<n;j++)
#if defined(PETSC_USE_COMPLEX)
        if (PetscRealPart(A[i+j*n])<0.0 || PetscImaginaryPart(A[i+j*n])!=0.0) { isrealpos = PETSC_FALSE; break; }
#else
        if (A[i+j*n]<0.0) { isrealpos = PETSC_FALSE; break; }
#endif
    if (isrealpos) { /* for positive matrices only */
      for (i=0;i<n;i++) v[i] = 1.0;
      for (j=0;j<m;j++) { /* w = A'*v */
        PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&n,&sone,A,&n,v,&one,&szero,w,&one));
        SWAP(v,w,aux);
      }
      PetscLogFlops(2.0*n*n*m);
      *nrm = 0.0;
      for (i=0;i<n;i++) if ((tmp = PetscAbsScalar(v[i])) > *nrm) *nrm = tmp; /* norm(v,inf) */
    } else SlepcNormEst1(n,A,m,work,rand,nrm);
  }
  PetscFunctionReturn(0);
}
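
A minimal calling sketch, again assuming a SLEPc build and visibility of SlepcNormAm through the private header (the driver below is illustrative, not part of the library). It estimates ||A^m||_1 for a nonnegative matrix, for which the routine takes the cheap path that applies A' to the vector of ones m times; the workspace must hold 2*n*n scalars, as stated in the comment above.

/* normam_ex.c - illustrative driver; error checking omitted for brevity */
#include <slepc/private/fnimpl.h>

int main(int argc,char **argv)
{
  PetscBLASInt n=200;
  PetscInt     i,m=3;
  PetscScalar  *A,*work;
  PetscReal    nrm;
  PetscRandom  rand;

  SlepcInitialize(&argc,&argv,NULL,NULL);
  PetscRandomCreate(PETSC_COMM_SELF,&rand);
  PetscMalloc2(n*n,&A,2*n*n,&work);
  for (i=0;i<n*n;i++) A[i] = 1.0/(n*n);  /* rank-one nonnegative matrix: ||A^3||_1 = 1/n^3 */
  SlepcNormAm(n,A,m,work,rand,&nrm);
  PetscPrintf(PETSC_COMM_SELF,"estimated norm: %g\n",(double)nrm);
  PetscFree2(A,work);
  PetscRandomDestroy(&rand);
  SlepcFinalize();
  return 0;
}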