Actual source code: bvblas.c
slepc-3.17.1 2022-04-11
1: /*
2: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
3: SLEPc - Scalable Library for Eigenvalue Problem Computations
4: Copyright (c) 2002-, Universitat Politecnica de Valencia, Spain
6: This file is part of SLEPc.
7: SLEPc is distributed under a 2-clause BSD license (see LICENSE).
8: - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
9: */
10: /*
11: BV private kernels that use the BLAS
12: */
14: #include <slepc/private/bvimpl.h>
15: #include <slepcblaslapack.h>
17: #define BLOCKSIZE 64
19: /*
20: C := alpha*A*B + beta*C
22: A is mxk (ld=m), B is kxn (ld=ldb), C is mxn (ld=m)
23: */
24: PetscErrorCode BVMult_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldb_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *B,PetscScalar beta,PetscScalar *C)
25: {
26: PetscBLASInt m,n,k,ldb;
27: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
28: PetscBLASInt l,bs=BLOCKSIZE;
29: #endif
31: PetscBLASIntCast(m_,&m);
32: PetscBLASIntCast(n_,&n);
33: PetscBLASIntCast(k_,&k);
34: PetscBLASIntCast(ldb_,&ldb);
35: #if defined(PETSC_HAVE_FBLASLAPACK) || defined(PETSC_HAVE_F2CBLASLAPACK)
36: l = m % bs;
37: if (l) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&l,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
38: for (;l<m;l+=bs) {
39: PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&bs,&n,&k,&alpha,(PetscScalar*)A+l,&m,(PetscScalar*)B,&ldb,&beta,C+l,&m));
40: }
41: #else
42: if (m) PetscStackCallBLAS("BLASgemm",BLASgemm_("N","N",&m,&n,&k,&alpha,(PetscScalar*)A,&m,(PetscScalar*)B,&ldb,&beta,C,&m));
43: #endif
44: PetscLogFlops(2.0*m*n*k);
45: PetscFunctionReturn(0);
46: }
48: /*
49: y := alpha*A*x + beta*y
51: A is nxk (ld=n)
52: */
53: PetscErrorCode BVMultVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,const PetscScalar *x,PetscScalar beta,PetscScalar *y)
54: {
55: PetscBLASInt n,k,one=1;
57: PetscBLASIntCast(n_,&n);
58: PetscBLASIntCast(k_,&k);
59: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("N",&n,&k,&alpha,A,&n,x,&one,&beta,y,&one));
60: PetscLogFlops(2.0*n*k);
61: PetscFunctionReturn(0);
62: }
64: /*
65: A(:,s:e-1) := A*B(:,s:e-1)
67: A is mxk (ld=m), B is kxn (ld=ldb) n=e-s
68: */
69: PetscErrorCode BVMultInPlace_BLAS_Private(BV bv,PetscInt m_,PetscInt k_,PetscInt ldb_,PetscInt s,PetscInt e,PetscScalar *A,const PetscScalar *B,PetscBool btrans)
70: {
71: PetscScalar *pb,zero=0.0,one=1.0;
72: PetscBLASInt m,n,k,l,ldb,bs=BLOCKSIZE;
73: PetscInt j,n_=e-s;
74: const char *bt;
76: PetscBLASIntCast(m_,&m);
77: PetscBLASIntCast(n_,&n);
78: PetscBLASIntCast(k_,&k);
79: PetscBLASIntCast(ldb_,&ldb);
80: BVAllocateWork_Private(bv,BLOCKSIZE*n_);
81: if (PetscUnlikely(btrans)) {
82: pb = (PetscScalar*)B+s;
83: bt = "C";
84: } else {
85: pb = (PetscScalar*)B+s*ldb;
86: bt = "N";
87: }
88: l = m % bs;
89: if (l) {
90: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&k,&one,A,&m,pb,&ldb,&zero,bv->work,&l));
91: for (j=0;j<n;j++) PetscArraycpy(A+(s+j)*m,bv->work+j*l,l);
92: }
93: for (;l<m;l+=bs) {
94: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&k,&one,A+l,&m,pb,&ldb,&zero,bv->work,&bs));
95: for (j=0;j<n;j++) PetscArraycpy(A+(s+j)*m+l,bv->work+j*bs,bs);
96: }
97: PetscLogFlops(2.0*m*n*k);
98: PetscFunctionReturn(0);
99: }
101: /*
102: V := V*B
104: V is mxn (ld=m), B is nxn (ld=k)
105: */
106: PetscErrorCode BVMultInPlace_Vecs_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,Vec *V,const PetscScalar *B,PetscBool btrans)
107: {
108: PetscScalar zero=0.0,one=1.0,*out,*pout;
109: const PetscScalar *pin;
110: PetscBLASInt m = 0,n,k,l,bs=BLOCKSIZE;
111: PetscInt j;
112: const char *bt;
114: PetscBLASIntCast(m_,&m);
115: PetscBLASIntCast(n_,&n);
116: PetscBLASIntCast(k_,&k);
117: BVAllocateWork_Private(bv,2*BLOCKSIZE*n_);
118: out = bv->work+BLOCKSIZE*n_;
119: if (btrans) bt = "C";
120: else bt = "N";
121: l = m % bs;
122: if (l) {
123: for (j=0;j<n;j++) {
124: VecGetArrayRead(V[j],&pin);
125: PetscArraycpy(bv->work+j*l,pin,l);
126: VecRestoreArrayRead(V[j],&pin);
127: }
128: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&l,&n,&n,&one,bv->work,&l,(PetscScalar*)B,&k,&zero,out,&l));
129: for (j=0;j<n;j++) {
130: VecGetArray(V[j],&pout);
131: PetscArraycpy(pout,out+j*l,l);
132: VecRestoreArray(V[j],&pout);
133: }
134: }
135: for (;l<m;l+=bs) {
136: for (j=0;j<n;j++) {
137: VecGetArrayRead(V[j],&pin);
138: PetscArraycpy(bv->work+j*bs,pin+l,bs);
139: VecRestoreArrayRead(V[j],&pin);
140: }
141: PetscStackCallBLAS("BLASgemm",BLASgemm_("N",bt,&bs,&n,&n,&one,bv->work,&bs,(PetscScalar*)B,&k,&zero,out,&bs));
142: for (j=0;j<n;j++) {
143: VecGetArray(V[j],&pout);
144: PetscArraycpy(pout+l,out+j*bs,bs);
145: VecRestoreArray(V[j],&pout);
146: }
147: }
148: PetscLogFlops(2.0*n*n*k);
149: PetscFunctionReturn(0);
150: }
152: /*
153: B := alpha*A + beta*B
155: A,B are nxk (ld=n)
156: */
157: PetscErrorCode BVAXPY_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,PetscScalar alpha,const PetscScalar *A,PetscScalar beta,PetscScalar *B)
158: {
159: PetscBLASInt m,one=1;
161: PetscBLASIntCast(n_*k_,&m);
162: if (beta!=(PetscScalar)1.0) {
163: PetscStackCallBLAS("BLASscal",BLASscal_(&m,&beta,B,&one));
164: PetscLogFlops(m);
165: }
166: PetscStackCallBLAS("BLASaxpy",BLASaxpy_(&m,&alpha,A,&one,B,&one));
167: PetscLogFlops(2.0*m);
168: PetscFunctionReturn(0);
169: }
171: /*
172: C := A'*B
174: A' is mxk (ld=k), B is kxn (ld=k), C is mxn (ld=ldc)
175: */
176: PetscErrorCode BVDot_BLAS_Private(BV bv,PetscInt m_,PetscInt n_,PetscInt k_,PetscInt ldc_,const PetscScalar *A,const PetscScalar *B,PetscScalar *C,PetscBool mpi)
177: {
178: PetscScalar zero=0.0,one=1.0,*CC;
179: PetscBLASInt m,n,k,ldc,j;
180: PetscMPIInt len;
182: PetscBLASIntCast(m_,&m);
183: PetscBLASIntCast(n_,&n);
184: PetscBLASIntCast(k_,&k);
185: PetscBLASIntCast(ldc_,&ldc);
186: if (mpi) {
187: if (ldc==m) {
188: BVAllocateWork_Private(bv,m*n);
189: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&ldc));
190: else PetscArrayzero(bv->work,m*n);
191: PetscMPIIntCast(m*n,&len);
192: MPIU_Allreduce(bv->work,C,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
193: } else {
194: BVAllocateWork_Private(bv,2*m*n);
195: CC = bv->work+m*n;
196: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,bv->work,&m));
197: else PetscArrayzero(bv->work,m*n);
198: PetscMPIIntCast(m*n,&len);
199: MPIU_Allreduce(bv->work,CC,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
200: for (j=0;j<n;j++) PetscArraycpy(C+j*ldc,CC+j*m,m);
201: }
202: } else {
203: if (k) PetscStackCallBLAS("BLASgemm",BLASgemm_("C","N",&m,&n,&k,&one,(PetscScalar*)A,&k,(PetscScalar*)B,&k,&zero,C,&ldc));
204: }
205: PetscLogFlops(2.0*m*n*k);
206: PetscFunctionReturn(0);
207: }
209: /*
210: y := A'*x
212: A is nxk (ld=n)
213: */
214: PetscErrorCode BVDotVec_BLAS_Private(BV bv,PetscInt n_,PetscInt k_,const PetscScalar *A,const PetscScalar *x,PetscScalar *y,PetscBool mpi)
215: {
216: PetscScalar zero=0.0,done=1.0;
217: PetscBLASInt n,k,one=1;
218: PetscMPIInt len;
220: PetscBLASIntCast(n_,&n);
221: PetscBLASIntCast(k_,&k);
222: if (mpi) {
223: BVAllocateWork_Private(bv,k);
224: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,bv->work,&one));
225: else PetscArrayzero(bv->work,k);
226: PetscMPIIntCast(k,&len);
227: MPIU_Allreduce(bv->work,y,len,MPIU_SCALAR,MPIU_SUM,PetscObjectComm((PetscObject)bv));
228: } else {
229: if (n) PetscStackCallBLAS("BLASgemv",BLASgemv_("C",&n,&k,&done,A,&n,x,&one,&zero,y,&one));
230: }
231: PetscLogFlops(2.0*n*k);
232: PetscFunctionReturn(0);
233: }
235: /*
236: Scale n scalars
237: */
238: PetscErrorCode BVScale_BLAS_Private(BV bv,PetscInt n_,PetscScalar *A,PetscScalar alpha)
239: {
240: PetscBLASInt n,one=1;
242: if (PetscUnlikely(alpha == (PetscScalar)0.0)) PetscArrayzero(A,n_);
243: else if (alpha!=(PetscScalar)1.0) {
244: PetscBLASIntCast(n_,&n);
245: PetscStackCallBLAS("BLASscal",BLASscal_(&n,&alpha,A,&one));
246: PetscLogFlops(n);
247: }
248: PetscFunctionReturn(0);
249: }