39 typedef typename Field::scalar_type
scalar;
64 void MaddMatrix(std::vector<Field> &AP, Eigen::MatrixXcd &m ,
const std::vector<Field> &X,
const std::vector<Field> &Y,
RealD scale=1.0)
66 std::vector<Field> Y_copy(AP.size(),AP[0].Grid());
67 for(
int r=0;r<AP.size();r++){
71 for(
int r=0;r<AP.size();r++){
72 AP[r] = scale*AP[r]+Y_copy[r];
75 void MulMatrix(std::vector<Field> &Y, Eigen::MatrixXcd &m ,
const std::vector<Field> &X)
77 typedef typename Field::scalar_type scomplex;
86 int64_t vw = vol * words;
91 BLAS_C.resize(nrhs * nrhs);
97 for(
int r=0;r<nrhs;r++){
98 int64_t offset = r*vw;
141 for(
int r=0;r<nrhs;r++){
142 int64_t offset = r*vw;
148 std::cout <<
GridLogPerformance<<
"MulMatrix preamble took "<< t2-t1<<
" us"<<std::endl;
154 void InnerProductMatrix(Eigen::MatrixXcd &m ,
const std::vector<Field> &X,
const std::vector<Field> &Y)
163 assert(X.size()==Y.size());
169 int64_t vw = vol * words;
174 BLAS_C.resize(nrhs * nrhs);
180 for(
int r=0;r<nrhs;r++){
181 int64_t offset = r*vw;
226 std::vector<scalar> HOST_C(
BLAS_C.size());
231 for(
int rr=0;rr<nrhs;rr++){
232 for(
int r=0;r<nrhs;r++){
243 flops = flops/(t4-t3)/1.e3;
244 bytes = bytes/(t4-t3)/1.e3;
245 std::cout <<
GridLogPerformance<<
"InnerProductMatrix m,n,k "<< M<<
","<<N<<
","<<
K<<std::endl;
246 std::cout <<
GridLogPerformance<<
"InnerProductMatrix alloc t1 "<< t1-t0<<
" us"<<std::endl;
247 std::cout <<
GridLogPerformance<<
"InnerProductMatrix cp t2 "<< t2-t1<<
" us"<<std::endl;
248 std::cout <<
GridLogPerformance<<
"InnerProductMatrix setup t3 "<< t3-t2<<
" us"<<std::endl;
249 std::cout <<
GridLogPerformance<<
"InnerProductMatrix blas t4 "<< t4-t3<<
" us"<<std::endl;
250 std::cout <<
GridLogPerformance<<
"InnerProductMatrix blas "<< flops<<
" GF/s"<<std::endl;
251 std::cout <<
GridLogPerformance<<
"InnerProductMatrix blas "<< bytes<<
" GB/s"<<std::endl;
252 std::cout <<
GridLogPerformance<<
"InnerProductMatrix gsum t5 "<< t5-t4<<
" us"<<std::endl;
253 std::cout <<
GridLogPerformance<<
"InnerProductMatrix cp t6 "<< t6-t5<<
" us"<<std::endl;
254 std::cout <<
GridLogPerformance<<
"InnerProductMatrix took "<< t6-t0<<
" us"<<std::endl;
262 assert(X.size()==Y.size());
269 int64_t vw = vol * words;
281 for(
int r=0;r<nrhs;r++){
289 uint64_t ss=ssw/words;
290 uint64_t w=ssw%words;
291 uint64_t offset = w+r*words+ss*nrhs*words;
292 BX[offset] = from_x[ssw];
293 BY[offset] = from_y[ssw];
310 std::vector<scalar *> Xh(vol);
311 std::vector<scalar *> Yh(vol);
312 std::vector<scalar *> Ch(vol);
313 for(uint64_t ss=0;ss<vol;ss++){
315 Xh[ss] = &
BLAS_X[ss*nrhs*words];
316 Yh[ss] = &
BLAS_Y[ss*nrhs*words];
340 std::vector<scalar> HOST_C(
BLAS_Cred.size());
344 m = Eigen::MatrixXcd::Zero(nrhs,nrhs);
345 for(
int ss=0;ss<vol;ss++){
346 Eigen::Map<Eigen::MatrixXcd> eC((std::complex<double> *)&HOST_C[ss*nrhs*nrhs],nrhs,nrhs);
358 flops = flops/(t4-t3)/1.e3;
359 bytes = bytes/(t4-t3)/1.e3;
360 xybytes = 4*xybytes/(t2-t1)/1.e3;
361 std::cout <<
GridLogPerformance<<
"InnerProductMatrix m,n,k "<< M<<
","<<N<<
","<<
K<<std::endl;
362 std::cout <<
GridLogPerformance<<
"InnerProductMatrix alloc t1 "<< t1-t0<<
" us"<<std::endl;
363 std::cout <<
GridLogPerformance<<
"InnerProductMatrix cp t2 "<< t2-t1<<
" us "<<xybytes<<
" GB/s"<<std::endl;
364 std::cout <<
GridLogPerformance<<
"InnerProductMatrix setup t3 "<< t3-t2<<
" us"<<std::endl;
365 std::cout <<
GridLogPerformance<<
"InnerProductMatrix blas t4 "<< t4-t3<<
" us"<<std::endl;
366 std::cout <<
GridLogPerformance<<
"InnerProductMatrix blas "<< flops<<
" GF/s"<<std::endl;
367 std::cout <<
GridLogPerformance<<
"InnerProductMatrix blas "<< bytes<<
" GB/s"<<std::endl;
368 std::cout <<
GridLogPerformance<<
"InnerProductMatrix cp t5 "<< t5-t4<<
" us"<<std::endl;
369 std::cout <<
GridLogPerformance<<
"InnerProductMatrix lsum t6l "<< t6l-t5<<
" us"<<std::endl;
370 std::cout <<
GridLogPerformance<<
"InnerProductMatrix gsum t6 "<< t6-t6l<<
" us"<<std::endl;
371 std::cout <<
GridLogPerformance<<
"InnerProductMatrix took "<< t6-t0<<
" us"<<std::endl;
void acceleratorPut(T &dev, const T &host)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorCopyDeviceToDevice(void *from, void *to, size_t bytes)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
#define autoView(l_v, l, mode)
GridLogger GridLogPerformance(1, "Performance", GridLogColours, "GREEN")
#define NAMESPACE_BEGIN(A)
std::complex< RealD > ComplexD
static INTERNAL_PRECISION K
void GlobalSumVector(RealF *, int N)
void gemmBatched(int m, int n, int k, ComplexD alpha, deviceVector< ComplexD * > &Amk, deviceVector< ComplexD * > &Bkn, ComplexD beta, deviceVector< ComplexD * > &Cmn)
Field::vector_object vector_object
Field::scalar_object scalar_object
void InnerProductMatrix(Eigen::MatrixXcd &m, const std::vector< Field > &X, const std::vector< Field > &Y)
deviceVector< scalar > BLAS_Cred
deviceVector< scalar * > Cdip
deviceVector< scalar > BLAS_Y
deviceVector< scalar * > Xdip
deviceVector< scalar * > Ydip
deviceVector< scalar > BLAS_C
void MaddMatrix(std::vector< Field > &AP, Eigen::MatrixXcd &m, const std::vector< Field > &X, const std::vector< Field > &Y, RealD scale=1.0)
deviceVector< scalar > BLAS_X
void MulMatrix(std::vector< Field > &Y, Eigen::MatrixXcd &m, const std::vector< Field > &X)
Field::scalar_type scalar