Actual source code: fnsqrt.c
slepc-3.16.0 2021-09-30
/*
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
   SLEPc - Scalable Library for Eigenvalue Problem Computations
   Copyright (c) 2002-2021, Universitat Politecnica de Valencia, Spain

   This file is part of SLEPc.
   SLEPc is distributed under a 2-clause BSD license (see LICENSE).
   - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
*/
/*
   Square root function  sqrt(x)
*/
#include <slepc/private/fnimpl.h>
#include <slepcblaslapack.h>
PetscErrorCode FNEvaluateFunction_Sqrt(FN fn,PetscScalar x,PetscScalar *y)
{
#if !defined(PETSC_USE_COMPLEX)
  if (x<0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Function not defined in the requested value");
#endif
  *y = PetscSqrtScalar(x);
  return(0);
}
PetscErrorCode FNEvaluateDerivative_Sqrt(FN fn,PetscScalar x,PetscScalar *y)
{
  if (x==0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Derivative not defined in the requested value");
#if !defined(PETSC_USE_COMPLEX)
  if (x<0.0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_OUTOFRANGE,"Derivative not defined in the requested value");
#endif
  *y = 1.0/(2.0*PetscSqrtScalar(x));
  return(0);
}
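/*
   Usage sketch (not part of this file): how the two scalar kernels above are
   reached through the public FN interface. FNCreate(), FNSetType(), FNSQRT,
   FNEvaluateFunction(), FNEvaluateDerivative() and FNDestroy() are standard
   SLEPc calls; the variable names and values are illustrative only.

     FN          fn;
     PetscScalar y,yp;

     FNCreate(PETSC_COMM_SELF,&fn);
     FNSetType(fn,FNSQRT);             // dispatches to the kernels in this file
     FNEvaluateFunction(fn,2.0,&y);    // y  = sqrt(2)
     FNEvaluateDerivative(fn,2.0,&yp); // yp = 1/(2*sqrt(2))
     FNDestroy(&fn);
*/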
PetscErrorCode FNEvaluateFunctionMat_Sqrt_Schur(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&T);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmSchur(fn,n,T,n,PETSC_FALSE);
  MatDenseRestoreArray(B,&T);
  return(0);
}
PetscErrorCode FNEvaluateFunctionMatVec_Sqrt_Schur(FN fn,Mat A,Vec v)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;
  Mat          B;

  FN_AllocateWorkMat(fn,A,&B);
  MatDenseGetArray(B,&T);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmSchur(fn,n,T,n,PETSC_TRUE);
  MatDenseRestoreArray(B,&T);
  MatGetColumnVector(B,v,0);
  FN_FreeWorkMat(fn,&B);
  return(0);
}
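/*
   Note: the MatVec variant above forms the full sqrtm(A) in a work matrix and
   then extracts its first column with MatGetColumnVector(B,v,0), i.e. it
   returns v = sqrtm(A)*e_1.
*/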
PetscErrorCode FNEvaluateFunctionMat_Sqrt_DBP(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&T);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmDenmanBeavers(fn,n,T,n,PETSC_FALSE);
  MatDenseRestoreArray(B,&T);
  return(0);
}
PetscErrorCode FNEvaluateFunctionMat_Sqrt_NS(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&Ba);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmNewtonSchulz(fn,n,Ba,n,PETSC_FALSE);
  MatDenseRestoreArray(B,&Ba);
  return(0);
}
#define MAXIT 50
/*
   Computes the principal square root of the matrix A using the
   Sadeghi iteration. A is overwritten with sqrtm(A).
*/
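/*
   For reference, the recurrence implemented in the loop below, reconstructed
   from the BLAS/LAPACK calls, is (with M_0 = A, X_0 = I):

     G_k     = (5/16)*I + (1/16)*M_k*(15*I - 5*M_k + M_k^2)
     X_{k+1} = X_k*G_k
     M_{k+1} = M_k*inv(G_k^2)

   so that M_k -> I and X_k -> sqrtm(A) as k grows; ||I - M_k||_F is the
   stopping criterion.
*/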
PetscErrorCode FNSqrtmSadeghi(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld)
{
  PetscScalar  *M,*M2,*G,*X=A,*work,work1,sqrtnrm;
  PetscScalar  szero=0.0,sone=1.0,smfive=-5.0,s1d16=1.0/16.0;
  PetscReal    tol,Mres=0.0,nrm,rwork[1],done=1.0;
  PetscBLASInt N,i,it,*piv=NULL,info,lwork=0,query=-1,one=1,zero=0;
  PetscBool    converged=PETSC_FALSE;
  unsigned int ftz;

  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;
  SlepcSetFlushToZero(&ftz);

  /* query work size */
  PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,A,&ld,piv,&work1,&query,&info));
  PetscBLASIntCast((PetscInt)PetscRealPart(work1),&lwork);
  PetscMalloc5(N,&M,N,&M2,N,&G,lwork,&work,n,&piv);
  PetscArraycpy(M,A,N);

  /* scale M */
  nrm = LAPACKlange_("fro",&n,&n,M,&n,rwork);
  if (nrm>1.0) {
    sqrtnrm = PetscSqrtReal(nrm);
    PetscStackCallBLAS("LAPACKlascl",LAPACKlascl_("G",&zero,&zero,&nrm,&done,&N,&one,M,&N,&info));
    SlepcCheckLapackInfo("lascl",info);
    tol *= nrm;
  }
  PetscInfo2(fn,"||A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);

  /* X = I */
  PetscArrayzero(X,N);
  for (i=0;i<n;i++) X[i+i*ld] = 1.0;

  for (it=0;it<MAXIT && !converged;it++) {

    /* G = (5/16)*I + (1/16)*M*(15*I-5*M+M*M) */
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,M,&ld,M,&ld,&szero,M2,&ld));
    PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&N,&smfive,M,&one,M2,&one));
    for (i=0;i<n;i++) M2[i+i*ld] += 15.0;
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&s1d16,M,&ld,M2,&ld,&szero,G,&ld));
    for (i=0;i<n;i++) G[i+i*ld] += 5.0/16.0;

    /* X = X*G */
    PetscArraycpy(M2,X,N);
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,M2,&ld,G,&ld,&szero,X,&ld));

    /* M = M*inv(G*G) */
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,G,&ld,G,&ld,&szero,M2,&ld));
    PetscStackCallBLAS("LAPACKgetrf",LAPACKgetrf_(&n,&n,M2,&ld,piv,&info));
    SlepcCheckLapackInfo("getrf",info);
    PetscStackCallBLAS("LAPACKgetri",LAPACKgetri_(&n,M2,&ld,piv,work,&lwork,&info));
    SlepcCheckLapackInfo("getri",info);
    PetscArraycpy(G,M,N);
    PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&n,&n,&n,&sone,G,&ld,M2,&ld,&szero,M,&ld));

    /* check ||I-M|| */
    PetscArraycpy(M2,M,N);
    for (i=0;i<n;i++) M2[i+i*ld] -= 1.0;
    Mres = LAPACKlange_("fro",&n,&n,M2,&n,rwork);
    if (PetscIsNanReal(Mres)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number");
    if (Mres<=tol) converged = PETSC_TRUE;
    PetscInfo2(fn,"it: %D res: %g\n",it,(double)Mres);
    PetscLogFlops(8.0*n*n*n+2.0*n*n+2.0*n*n*n/3.0+4.0*n*n*n/3.0+2.0*n*n*n+2.0*n*n);
  }

  if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",MAXIT);

  /* undo scaling */
  if (nrm>1.0) PetscStackCallBLAS("BLASscal",BLASscal_(&N,&sqrtnrm,A,&one));

  PetscFree5(M,M2,G,work,piv);
  SlepcResetFlushToZero(&ftz);
  return(0);
}
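/*
   Usage sketch (not part of this file): evaluating sqrtm(A) on a dense matrix
   through the FN interface. FNCreate(), FNSetType(), FNSetMethod() and
   FNEvaluateFunctionMat() are standard SLEPc calls; A and B are assumed to be
   square sequential dense matrices of equal size created elsewhere, and the
   method index follows the table registered in FNCreate_Sqrt() at the bottom
   of this file.

     FN  fn;
     Mat A,B;   // illustrative: created elsewhere as MATSEQDENSE

     FNCreate(PETSC_COMM_SELF,&fn);
     FNSetType(fn,FNSQRT);
     FNSetMethod(fn,3);              // 3 = Sadeghi iteration (see FNCreate_Sqrt)
     FNEvaluateFunctionMat(fn,A,B);  // B = sqrtm(A)
     FNDestroy(&fn);
*/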
#if defined(PETSC_HAVE_CUDA)
#include "../src/sys/classes/fn/impls/cuda/fnutilcuda.h"
#include <slepccublas.h>

#if defined(PETSC_HAVE_MAGMA)
#include <slepcmagma.h>
/*
 * Matrix square root by Sadeghi iteration. CUDA version.
 * Computes the principal square root of the matrix A using the
 * Sadeghi iteration. A is overwritten with sqrtm(A).
 */
PetscErrorCode FNSqrtmSadeghi_CUDAm(FN fn,PetscBLASInt n,PetscScalar *A,PetscBLASInt ld)
{
  PetscScalar        *d_X,*d_M,*d_M2,*d_G,*d_work,alpha;
  const PetscScalar  szero=0.0,sone=1.0,smfive=-5.0,s15=15.0,s1d16=1.0/16.0;
  PetscReal          tol,Mres=0.0,nrm,sqrtnrm;
  PetscInt           it,nb,lwork;
  PetscBLASInt       info,*piv,N;
  const PetscBLASInt one=1,zero=0;
  PetscBool          converged=PETSC_FALSE;
  cublasHandle_t     cublasv2handle;
  PetscErrorCode     ierr;
  cublasStatus_t     cberr;
  cudaError_t        cerr;
  magma_int_t        mierr;
  PetscCUDAInitializeCheck(); /* For CUDA event timers */
  PetscCUBLASGetHandle(&cublasv2handle);
  magma_init();
  N = n*n;
  tol = PetscSqrtReal((PetscReal)n)*PETSC_MACHINE_EPSILON/2;

  PetscMalloc1(n,&piv);
  cerr = cudaMalloc((void **)&d_X,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_M,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_M2,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = cudaMalloc((void **)&d_G,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);

  nb = magma_get_xgetri_nb(n);
  lwork = nb*n;
  cerr = cudaMalloc((void **)&d_work,sizeof(PetscScalar)*lwork);CHKERRCUDA(cerr);
  PetscLogGpuTimeBegin();

  /* M = A */
  cerr = cudaMemcpy(d_M,A,sizeof(PetscScalar)*N,cudaMemcpyHostToDevice);CHKERRCUDA(cerr);

  /* scale M */
  cberr = cublasXnrm2(cublasv2handle,N,d_M,one,&nrm);CHKERRCUBLAS(cberr);
  if (nrm>1.0) {
    sqrtnrm = PetscSqrtReal(nrm);
    alpha = 1.0/nrm;
    cberr = cublasXscal(cublasv2handle,N,&alpha,d_M,one);CHKERRCUBLAS(cberr);
    tol *= nrm;
  }
  PetscInfo2(fn,"||A||_F = %g, new tol: %g\n",(double)nrm,(double)tol);

  /* X = I */
  cerr = cudaMemset(d_X,zero,sizeof(PetscScalar)*N);CHKERRCUDA(cerr);
  cerr = set_diagonal(n,d_X,ld,sone);CHKERRCUDA(cerr);
  for (it=0;it<MAXIT && !converged;it++) {

    /* G = (5/16)*I + (1/16)*M*(15*I-5*M+M*M) */
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_M,ld,d_M,ld,&szero,d_M2,ld);CHKERRCUBLAS(cberr);
    cberr = cublasXaxpy(cublasv2handle,N,&smfive,d_M,one,d_M2,one);CHKERRCUBLAS(cberr);
    cerr = shift_diagonal(n,d_M2,ld,s15);CHKERRCUDA(cerr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&s1d16,d_M,ld,d_M2,ld,&szero,d_G,ld);CHKERRCUBLAS(cberr);
    cerr = shift_diagonal(n,d_G,ld,5.0/16.0);CHKERRCUDA(cerr);

    /* X = X*G */
    cerr = cudaMemcpy(d_M2,d_X,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_M2,ld,d_G,ld,&szero,d_X,ld);CHKERRCUBLAS(cberr);

    /* M = M*inv(G*G) */
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_G,ld,d_G,ld,&szero,d_M2,ld);CHKERRCUBLAS(cberr);
    /* magma */
    mierr = magma_xgetrf_gpu(n,n,d_M2,ld,piv,&info);CHKERRMAGMA(mierr);
    if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetrf: Illegal value on argument %d",PetscAbsInt(info));
    if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetrf: Matrix is singular. U(%d,%d) is zero",info,info);
    mierr = magma_xgetri_gpu(n,d_M2,ld,piv,d_work,lwork,&info);CHKERRMAGMA(mierr);
    if (info < 0) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"LAPACKgetri: Illegal value on argument %d",PetscAbsInt(info));
    if (info > 0) SETERRQ2(PETSC_COMM_SELF,PETSC_ERR_MAT_LU_ZRPVT,"LAPACKgetri: Matrix is singular. U(%d,%d) is zero",info,info);
    /* magma */
    cerr = cudaMemcpy(d_G,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    cberr = cublasXgemm(cublasv2handle,CUBLAS_OP_N,CUBLAS_OP_N,n,n,n,&sone,d_G,ld,d_M2,ld,&szero,d_M,ld);CHKERRCUBLAS(cberr);

    /* check ||I-M|| */
    cerr = cudaMemcpy(d_M2,d_M,sizeof(PetscScalar)*N,cudaMemcpyDeviceToDevice);CHKERRCUDA(cerr);
    cerr = shift_diagonal(n,d_M2,ld,-1.0);CHKERRCUDA(cerr);
    cberr = cublasXnrm2(cublasv2handle,N,d_M2,one,&Mres);CHKERRCUBLAS(cberr);
    if (PetscIsNanReal(Mres)) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_FP,"The computed norm is not-a-number");
    if (Mres<=tol) converged = PETSC_TRUE;
    PetscInfo2(fn,"it: %D res: %g\n",it,(double)Mres);
    PetscLogGpuFlops(8.0*n*n*n+2.0*n*n+2.0*n*n*n/3.0+4.0*n*n*n/3.0+2.0*n*n*n+2.0*n*n);
  }
  if (Mres>tol) SETERRQ1(PETSC_COMM_SELF,PETSC_ERR_LIB,"SQRTM not converged after %d iterations",MAXIT);

  /* undo scaling (use a PetscScalar copy of sqrtnrm for cublasXscal) */
  if (nrm>1.0) {
    alpha = sqrtnrm;
    cberr = cublasXscal(cublasv2handle,N,&alpha,d_X,one);CHKERRCUBLAS(cberr);
  }
  cerr = cudaMemcpy(A,d_X,sizeof(PetscScalar)*N,cudaMemcpyDeviceToHost);CHKERRCUDA(cerr);
  PetscLogGpuTimeEnd();

  cerr = cudaFree(d_X);CHKERRCUDA(cerr);
  cerr = cudaFree(d_M);CHKERRCUDA(cerr);
  cerr = cudaFree(d_M2);CHKERRCUDA(cerr);
  cerr = cudaFree(d_G);CHKERRCUDA(cerr);
  cerr = cudaFree(d_work);CHKERRCUDA(cerr);
  PetscFree(piv);

  magma_finalize();
  return(0);
}
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
PetscErrorCode FNEvaluateFunctionMat_Sqrt_Sadeghi(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&Ba);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmSadeghi(fn,n,Ba,n);
  MatDenseRestoreArray(B,&Ba);
  return(0);
}
#if defined(PETSC_HAVE_CUDA)
PetscErrorCode FNEvaluateFunctionMat_Sqrt_NS_CUDA(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&Ba);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmNewtonSchulz_CUDA(fn,n,Ba,n,PETSC_FALSE);
  MatDenseRestoreArray(B,&Ba);
  return(0);
}
#if defined(PETSC_HAVE_MAGMA)
PetscErrorCode FNEvaluateFunctionMat_Sqrt_DBP_CUDAm(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *T;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&T);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmDenmanBeavers_CUDAm(fn,n,T,n,PETSC_FALSE);
  MatDenseRestoreArray(B,&T);
  return(0);
}
PetscErrorCode FNEvaluateFunctionMat_Sqrt_Sadeghi_CUDAm(FN fn,Mat A,Mat B)
{
  PetscBLASInt n=0;
  PetscScalar  *Ba;
  PetscInt     m;

  if (A!=B) { MatCopy(A,B,SAME_NONZERO_PATTERN); }
  MatDenseGetArray(B,&Ba);
  MatGetSize(A,&m,NULL);
  PetscBLASIntCast(m,&n);
  FNSqrtmSadeghi_CUDAm(fn,n,Ba,n);
  MatDenseRestoreArray(B,&Ba);
  return(0);
}
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
PetscErrorCode FNView_Sqrt(FN fn,PetscViewer viewer)
{
  PetscBool  isascii;
  char       str[50];
  const char *methodname[] = {
                  "Schur method for the square root",
                  "Denman-Beavers (product form)",
                  "Newton-Schulz iteration",
                  "Sadeghi iteration"
#if defined(PETSC_HAVE_CUDA)
                 ,"Newton-Schulz iteration CUDA"
#if defined(PETSC_HAVE_MAGMA)
                 ,"Denman-Beavers (product form) CUDA/MAGMA",
                  "Sadeghi iteration CUDA/MAGMA"
#endif
#endif
  };
  const int  nmeth=sizeof(methodname)/sizeof(methodname[0]);

  PetscObjectTypeCompare((PetscObject)viewer,PETSCVIEWERASCII,&isascii);
  if (isascii) {
    if (fn->beta==(PetscScalar)1.0) {
      if (fn->alpha==(PetscScalar)1.0) {
        PetscViewerASCIIPrintf(viewer,"  Square root: sqrt(x)\n");
      } else {
        SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
        PetscViewerASCIIPrintf(viewer,"  Square root: sqrt(%s*x)\n",str);
      }
    } else {
      SlepcSNPrintfScalar(str,sizeof(str),fn->beta,PETSC_TRUE);
      if (fn->alpha==(PetscScalar)1.0) {
        PetscViewerASCIIPrintf(viewer,"  Square root: %s*sqrt(x)\n",str);
      } else {
        PetscViewerASCIIPrintf(viewer,"  Square root: %s",str);
        PetscViewerASCIIUseTabs(viewer,PETSC_FALSE);
        SlepcSNPrintfScalar(str,sizeof(str),fn->alpha,PETSC_TRUE);
        PetscViewerASCIIPrintf(viewer,"*sqrt(%s*x)\n",str);
        PetscViewerASCIIUseTabs(viewer,PETSC_TRUE);
      }
    }
    if (fn->method<nmeth) {
      PetscViewerASCIIPrintf(viewer,"  computing matrix functions with: %s\n",methodname[fn->method]);
    }
  }
  return(0);
}
SLEPC_EXTERN PetscErrorCode FNCreate_Sqrt(FN fn)
{
  fn->ops->evaluatefunction          = FNEvaluateFunction_Sqrt;
  fn->ops->evaluatederivative        = FNEvaluateDerivative_Sqrt;
  fn->ops->evaluatefunctionmat[0]    = FNEvaluateFunctionMat_Sqrt_Schur;
  fn->ops->evaluatefunctionmat[1]    = FNEvaluateFunctionMat_Sqrt_DBP;
  fn->ops->evaluatefunctionmat[2]    = FNEvaluateFunctionMat_Sqrt_NS;
  fn->ops->evaluatefunctionmat[3]    = FNEvaluateFunctionMat_Sqrt_Sadeghi;
#if defined(PETSC_HAVE_CUDA)
  fn->ops->evaluatefunctionmat[4]    = FNEvaluateFunctionMat_Sqrt_NS_CUDA;
#if defined(PETSC_HAVE_MAGMA)
  fn->ops->evaluatefunctionmat[5]    = FNEvaluateFunctionMat_Sqrt_DBP_CUDAm;
  fn->ops->evaluatefunctionmat[6]    = FNEvaluateFunctionMat_Sqrt_Sadeghi_CUDAm;
#endif /* PETSC_HAVE_MAGMA */
#endif /* PETSC_HAVE_CUDA */
  fn->ops->evaluatefunctionmatvec[0] = FNEvaluateFunctionMatVec_Sqrt_Schur;
  fn->ops->view                      = FNView_Sqrt;
  return(0);
}
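/*
   Note: the index into the evaluatefunctionmat[] table above is the value
   given to FNSetMethod() (presumably also settable at run time through the
   -fn_method option processed by FNSetFromOptions()), so method 0 is the
   default Schur approach, 3 is the Sadeghi iteration, and entries 4-6 are
   only compiled in for CUDA/MAGMA-enabled builds.
*/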