Grid 0.7.0
MultiRHSBlockCGLinalg.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: MultiRHSBlockCGLinalg.h
6
7 Copyright (C) 2024
8
9Author: Peter Boyle <pboyle@bnl.gov>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#pragma once
29
31
32
33/* Need helper object for BLAS accelerated mrhs blockCG */
34template<class Field>
36{
37public:
38
39 typedef typename Field::scalar_type scalar;
40 typedef typename Field::scalar_object scalar_object;
41 typedef typename Field::vector_object vector_object;
42
43 deviceVector<scalar> BLAS_X; // nrhs x vol -- the sources
44 deviceVector<scalar> BLAS_Y; // nrhs x vol -- the result
45 deviceVector<scalar> BLAS_C; // nrhs x nrhs -- the coefficients
46 deviceVector<scalar> BLAS_Cred; // nrhs x nrhs x oSites -- reduction buffer
50
53
54 void Deallocate(void)
55 {
56 Xdip.resize(0);
57 Ydip.resize(0);
58 Cdip.resize(0);
59 BLAS_Cred.resize(0);
60 BLAS_C.resize(0);
61 BLAS_X.resize(0);
62 BLAS_Y.resize(0);
63 }
64 void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m , const std::vector<Field> &X,const std::vector<Field> &Y,RealD scale=1.0)
65 {
66 std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
67 for(int r=0;r<AP.size();r++){
68 Y_copy[r] = Y[r];
69 }
70 MulMatrix(AP,m,X);
71 for(int r=0;r<AP.size();r++){
72 AP[r] = scale*AP[r]+Y_copy[r];
73 }
74 }
75 void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m , const std::vector<Field> &X)
76 {
77 typedef typename Field::scalar_type scomplex;
78 GridBase *grid;
79 uint64_t vol;
80 uint64_t words;
81
82 int nrhs = Y.size();
83 grid = X[0].Grid();
84 vol = grid->lSites();
85 words = sizeof(scalar_object)/sizeof(scalar);
86 int64_t vw = vol * words;
87
88 RealD t0 = usecond();
89 BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
90 BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
91 BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
92 RealD t1 = usecond();
93
95 // Copy in the multi-rhs sources
97 for(int r=0;r<nrhs;r++){
98 int64_t offset = r*vw;
99 autoView(x_v,X[r],AcceleratorRead);
100 acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
101 }
102
103 // Assumes Eigen storage contiguous
104 acceleratorCopyToDevice(&m(0,0),&BLAS_C[0],BLAS_C.size()*sizeof(scalar));
105
106 /*
107 * in Fortran column major notation (cuBlas order)
108 *
109 * Xxr = [X1(x)][..][Xn(x)]
110 * Yxr = [Y1(x)][..][Ym(x)]
111 * Y = X . C
112 */
116
117 scalar * Xh = & BLAS_X[0];
118 scalar * Yh = & BLAS_Y[0];
119 scalar * Ch = & BLAS_C[0];
120
121 acceleratorPut(Xd[0],Xh);
122 acceleratorPut(Yd[0],Yh);
123 acceleratorPut(Cd[0],Ch);
124
125 RealD t2 = usecond();
126 GridBLAS BLAS;
128 // Y = X*C (transpose?)
131 vw,nrhs,nrhs,
132 scalar(1.0),
133 Xd,
134 Cd,
135 scalar(0.0), // wipe out Y
136 Yd);
137 BLAS.synchronise();
138 RealD t3 = usecond();
139
140 // Copy back Y = m X
141 for(int r=0;r<nrhs;r++){
142 int64_t offset = r*vw;
143 autoView(y_v,Y[r],AcceleratorWrite);
144 acceleratorCopyDeviceToDevice(&BLAS_Y[offset],&y_v[0],sizeof(scalar_object)*vol);
145 }
146 RealD t4 = usecond();
147 std::cout <<GridLogPerformance << "MulMatrix alloc took "<< t1-t0<<" us"<<std::endl;
148 std::cout <<GridLogPerformance<< "MulMatrix preamble took "<< t2-t1<<" us"<<std::endl;
149 std::cout <<GridLogPerformance<< "MulMatrix blas took "<< t3-t2<<" us"<<std::endl;
150 std::cout <<GridLogPerformance<< "MulMatrix copy took "<< t4-t3<<" us"<<std::endl;
151 std::cout <<GridLogPerformance<< "MulMatrix total "<< t4-t0<<" us"<<std::endl;
152 }
153
154 void InnerProductMatrix(Eigen::MatrixXcd &m , const std::vector<Field> &X, const std::vector<Field> &Y)
155 {
156#if 0
157 int nrhs;
158 GridBase *grid;
159 uint64_t vol;
160 uint64_t words;
161
162 nrhs = X.size();
163 assert(X.size()==Y.size());
164 conformable(X[0],Y[0]);
165
166 grid = X[0].Grid();
167 vol = grid->lSites();
168 words = sizeof(scalar_object)/sizeof(scalar);
169 int64_t vw = vol * words;
170
171 RealD t0 = usecond();
172 BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
173 BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
174 BLAS_C.resize(nrhs * nrhs);// cost free if size doesn't change
175 RealD t1 = usecond();
176
178 // Copy in the multi-rhs sources
180 for(int r=0;r<nrhs;r++){
181 int64_t offset = r*vw;
182 autoView(x_v,X[r],AcceleratorRead);
183 acceleratorCopyDeviceToDevice(&x_v[0],&BLAS_X[offset],sizeof(scalar_object)*vol);
184 autoView(y_v,Y[r],AcceleratorRead);
185 acceleratorCopyDeviceToDevice(&y_v[0],&BLAS_Y[offset],sizeof(scalar_object)*vol);
186 }
187 RealD t2 = usecond();
188
189 /*
190 * in Fortran column major notation (cuBlas order)
191 *
192 * Xxr = [X1(x)][..][Xn(x)]
193 *
194 * Yxr = [Y1(x)][..][Ym(x)]
195 *
196 * C_rs = X^dag Y
197 */
201
202 scalar * Xh = & BLAS_X[0];
203 scalar * Yh = & BLAS_Y[0];
204 scalar * Ch = & BLAS_C[0];
205
206 acceleratorPut(Xd[0],Xh);
207 acceleratorPut(Yd[0],Yh);
208 acceleratorPut(Cd[0],Ch);
209
210 GridBLAS BLAS;
211
212 RealD t3 = usecond();
214 // C_rs = X^dag Y
217 nrhs,nrhs,vw,
218 ComplexD(1.0),
219 Xd,
220 Yd,
221 ComplexD(0.0), // wipe out C
222 Cd);
223 BLAS.synchronise();
224 RealD t4 = usecond();
225
226 std::vector<scalar> HOST_C(BLAS_C.size()); // nrhs . nrhs -- the coefficients
227 acceleratorCopyFromDevice(&BLAS_C[0],&HOST_C[0],BLAS_C.size()*sizeof(scalar));
228 grid->GlobalSumVector(&HOST_C[0],nrhs*nrhs);
229
230 RealD t5 = usecond();
231 for(int rr=0;rr<nrhs;rr++){
232 for(int r=0;r<nrhs;r++){
233 int off = r+nrhs*rr;
234 m(r,rr)=HOST_C[off];
235 }
236 }
237 RealD t6 = usecond();
238 uint64_t M=nrhs;
239 uint64_t N=nrhs;
240 uint64_t K=vw;
241 RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
242 RealD flops = 8.0*M*N*K;
243 flops = flops/(t4-t3)/1.e3;
244 bytes = bytes/(t4-t3)/1.e3;
245 std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
246 std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
247 std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us"<<std::endl;
248 std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
249 std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
250 std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
251 std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
252 std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t5 "<< t5-t4<<" us"<<std::endl;
253 std::cout <<GridLogPerformance<< "InnerProductMatrix cp t6 "<< t6-t5<<" us"<<std::endl;
254 std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
255#else
256 int nrhs;
257 GridBase *grid;
258 uint64_t vol;
259 uint64_t words;
260
261 nrhs = X.size();
262 assert(X.size()==Y.size());
263 conformable(X[0],Y[0]);
264
265 grid = X[0].Grid();
266 int rd0 = grid->_rdimensions[0] * grid->_rdimensions[1];
267 vol = grid->oSites()/rd0;
268 words = rd0*sizeof(vector_object)/sizeof(scalar);
269 int64_t vw = vol * words;
270 assert(vw == grid->lSites()*sizeof(scalar_object)/sizeof(scalar));
271
272 RealD t0 = usecond();
273 BLAS_X.resize(nrhs * vw); // cost free if size doesn't change
274 BLAS_Y.resize(nrhs * vw); // cost free if size doesn't change
275 BLAS_Cred.resize(nrhs * nrhs * vol);// cost free if size doesn't change
276 RealD t1 = usecond();
277
279 // Copy in the multi-rhs sources -- layout batched BLAS ready
281 for(int r=0;r<nrhs;r++){
282 autoView(x_v,X[r],AcceleratorRead);
283 autoView(y_v,Y[r],AcceleratorRead);
284 scalar *from_x=(scalar *)&x_v[0];
285 scalar *from_y=(scalar *)&y_v[0];
286 scalar *BX = &BLAS_X[0];
287 scalar *BY = &BLAS_Y[0];
288 accelerator_for(ssw,vw,1,{
289 uint64_t ss=ssw/words;
290 uint64_t w=ssw%words;
291 uint64_t offset = w+r*words+ss*nrhs*words; // [ss][rhs][words]
292 BX[offset] = from_x[ssw];
293 BY[offset] = from_y[ssw];
294 });
295 }
296 RealD t2 = usecond();
297
298 /*
299 * in Fortran column major notation (cuBlas order)
300 *
301 * Xxr = [X1(x)][..][Xn(x)]
302 *
303 * Yxr = [Y1(x)][..][Ym(x)]
304 *
305 * C_rs = X^dag Y
306 */
307 Xdip.resize(vol);
308 Ydip.resize(vol);
309 Cdip.resize(vol);
310 std::vector<scalar *> Xh(vol);
311 std::vector<scalar *> Yh(vol);
312 std::vector<scalar *> Ch(vol);
313 for(uint64_t ss=0;ss<vol;ss++){
314
315 Xh[ss] = & BLAS_X[ss*nrhs*words];
316 Yh[ss] = & BLAS_Y[ss*nrhs*words];
317 Ch[ss] = & BLAS_Cred[ss*nrhs*nrhs];
318
319 }
320 acceleratorCopyToDevice(&Xh[0],&Xdip[0],vol*sizeof(scalar *));
321 acceleratorCopyToDevice(&Yh[0],&Ydip[0],vol*sizeof(scalar *));
322 acceleratorCopyToDevice(&Ch[0],&Cdip[0],vol*sizeof(scalar *));
323
324 GridBLAS BLAS;
325
326 RealD t3 = usecond();
328 // C_rs = X^dag Y
331 nrhs,nrhs,words,
332 ComplexD(1.0),
333 Xdip,
334 Ydip,
335 ComplexD(0.0), // wipe out C
336 Cdip);
337 BLAS.synchronise();
338 RealD t4 = usecond();
339
340 std::vector<scalar> HOST_C(BLAS_Cred.size()); // nrhs . nrhs -- the coefficients
341 acceleratorCopyFromDevice(&BLAS_Cred[0],&HOST_C[0],BLAS_Cred.size()*sizeof(scalar));
342
343 RealD t5 = usecond();
344 m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
345 for(int ss=0;ss<vol;ss++){
346 Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
347 m = m + eC;
348 }
349 RealD t6l = usecond();
350 grid->GlobalSumVector((scalar *) &m(0,0),nrhs*nrhs);
351 RealD t6 = usecond();
352 uint64_t M=nrhs;
353 uint64_t N=nrhs;
354 uint64_t K=vw;
355 RealD xybytes = grid->lSites()*sizeof(scalar_object);
356 RealD bytes = 1.0*sizeof(ComplexD)*(M*N*2+N*K+M*K);
357 RealD flops = 8.0*M*N*K;
358 flops = flops/(t4-t3)/1.e3;
359 bytes = bytes/(t4-t3)/1.e3;
360 xybytes = 4*xybytes/(t2-t1)/1.e3;
361 std::cout <<GridLogPerformance<< "InnerProductMatrix m,n,k "<< M<<","<<N<<","<<K<<std::endl;
362 std::cout <<GridLogPerformance<< "InnerProductMatrix alloc t1 "<< t1-t0<<" us"<<std::endl;
363 std::cout <<GridLogPerformance<< "InnerProductMatrix cp t2 "<< t2-t1<<" us "<<xybytes<<" GB/s"<<std::endl;
364 std::cout <<GridLogPerformance<< "InnerProductMatrix setup t3 "<< t3-t2<<" us"<<std::endl;
365 std::cout <<GridLogPerformance<< "InnerProductMatrix blas t4 "<< t4-t3<<" us"<<std::endl;
366 std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< flops<<" GF/s"<<std::endl;
367 std::cout <<GridLogPerformance<< "InnerProductMatrix blas "<< bytes<<" GB/s"<<std::endl;
368 std::cout <<GridLogPerformance<< "InnerProductMatrix cp t5 "<< t5-t4<<" us"<<std::endl;
369 std::cout <<GridLogPerformance<< "InnerProductMatrix lsum t6l "<< t6l-t5<<" us"<<std::endl;
370 std::cout <<GridLogPerformance<< "InnerProductMatrix gsum t6 "<< t6-t6l<<" us"<<std::endl;
371 std::cout <<GridLogPerformance<< "InnerProductMatrix took "<< t6-t0<<" us"<<std::endl;
372#endif
373 }
374};
375
void acceleratorPut(T &dev, const T &host)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorCopyDeviceToDevice(void *from, void *to, size_t bytes)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
@ GridBLAS_OP_N
Definition BatchedBlas.h:67
@ GridBLAS_OP_C
Definition BatchedBlas.h:67
void conformable(const Lattice< obj1 > &lhs, const Lattice< obj2 > &rhs)
#define autoView(l_v, l, mode)
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN")
@ AcceleratorRead
@ AcceleratorWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
std::complex< RealD > ComplexD
Definition Simd.h:79
double RealD
Definition Simd.h:61
double usecond(void)
Definition Timer.h:50
static INTERNAL_PRECISION K
Definition Zolotarev.cc:230
void GlobalSumVector(RealF *, int N)
void synchronise(void)
void gemmBatched(int m, int n, int k, ComplexD alpha, deviceVector< ComplexD * > &Amk, deviceVector< ComplexD * > &Bkn, ComplexD beta, deviceVector< ComplexD * > &Cmn)
int lSites(void) const
int oSites(void) const
Coordinate _rdimensions
Field::vector_object vector_object
Field::scalar_object scalar_object
void InnerProductMatrix(Eigen::MatrixXcd &m, const std::vector< Field > &X, const std::vector< Field > &Y)
deviceVector< scalar > BLAS_Cred
deviceVector< scalar * > Cdip
deviceVector< scalar > BLAS_Y
deviceVector< scalar * > Xdip
deviceVector< scalar * > Ydip
deviceVector< scalar > BLAS_C
void MaddMatrix(std::vector< Field > &AP, Eigen::MatrixXcd &m, const std::vector< Field > &X, const std::vector< Field > &Y, RealD scale=1.0)
deviceVector< scalar > BLAS_X
void MulMatrix(std::vector< Field > &Y, Eigen::MatrixXcd &m, const std::vector< Field > &X)