/*
 * GENERIC_STENCIL_LEG(Dir,spProj,Recon): one stencil leg for direction Dir.
 * Visible steps, in order:
 *   - fetch the stencil entry SE with st.GetEntry(ptype, Dir, sF);
 *   - if SE->_is_local, read in[SE->_offset] through coalescedReadPermute
 *     using SE->_permute and the current lane;
 *   - a later line loads chi from buf[SE->_offset] (presumably the non-local
 *     branch - confirm against the full file);
 *   - acceleratorSynchronise(), then Impl::multLink(Uchi, U[sU], chi, Dir, SE, st).
 * The expansion site must provide SE, st, ptype, sF, sU, lane, in, buf, U, chi
 * and Uchi.
 * NOTE(review): this excerpt skips source lines (numbering jumps 61->64->66->67
 * and stops before the macro ends), so the use of `tmp`, spProj and Recon and
 * the closing braces are not shown here.
 */
57#define GENERIC_STENCIL_LEG(Dir,spProj,Recon) \
58 SE = st.GetEntry(ptype, Dir, sF); \
59 if (SE->_is_local) { \
60 int perm= SE->_permute; \
61 auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
64 chi = coalescedRead(buf[SE->_offset],lane); \
66 acceleratorSynchronise(); \
67 Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
/*
 * GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon): stencil leg that only acts when
 * the stencil entry is local (SE->_is_local is true).
 * Visible steps: st.GetEntry(ptype, Dir, sF); inside the local branch a
 * permuted coalesced read of in[SE->_offset], Impl::multLink into Uchi, and
 * Recon(result, Uchi); finally acceleratorSynchronise().
 * NOTE(review): source lines 75 and 78 are absent from this excerpt, so the
 * spProj step and the brace closing the `if` are not shown - confirm against
 * the full file.
 */
70#define GENERIC_STENCIL_LEG_INT(Dir,spProj,Recon) \
71 SE = st.GetEntry(ptype, Dir, sF); \
72 if (SE->_is_local) { \
73 int perm= SE->_permute; \
74 auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
76 Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
77 Recon(result, Uchi); \
79 acceleratorSynchronise();
/*
 * GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon): stencil leg that only acts when
 * the stencil entry is NOT local (!SE->_is_local).
 * Visible steps: st.GetEntry(ptype, Dir, sF); inside the non-local branch chi
 * is read from buf[SE->_offset] for the current lane, Impl::multLink writes
 * Uchi, and Recon(result, Uchi) is applied; finally acceleratorSynchronise().
 * spProj is not referenced on any visible line.
 * NOTE(review): source lines 87-88 are absent from this excerpt, so whatever
 * else the branch does before closing is not shown.
 */
81#define GENERIC_STENCIL_LEG_EXT(Dir,spProj,Recon) \
82 SE = st.GetEntry(ptype, Dir, sF); \
83 if (!SE->_is_local ) { \
84 auto chi = coalescedRead(buf[SE->_offset],lane); \
85 Impl::multLink(Uchi, U[sU], chi, Dir, SE, st); \
86 Recon(result, Uchi); \
89 acceleratorSynchronise();
/*
 * GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon): body of a single-direction hop.
 * Unlike the GENERIC_STENCIL_LEG* macros it does not call st.GetEntry itself;
 * SE must already be set at the expansion site.
 * Visible steps: for a local entry, a permuted coalesced read of
 * in[SE->_offset]; a later line loads chi from buf[SE->_offset] (presumably the
 * non-local branch - confirm); acceleratorSynchronise(); then
 * Impl::multLink(Uchi, U[sU], chi, dir, SE, st).
 * Note that multLink is passed the lower-case variable `dir` from the
 * enclosing scope, not the macro parameter `Dir`.
 * NOTE(review): source lines 95-96, 98 and 101 onward are absent from this
 * excerpt, so the use of `tmp`, spProj and Recon is not shown.
 */
91#define GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon) \
92 if (SE->_is_local ) { \
93 int perm= SE->_permute; \
94 auto tmp = coalescedReadPermute(in[SE->_offset],ptype,perm,lane); \
97 chi = coalescedRead(buf[SE->_offset],lane); \
99 acceleratorSynchronise(); \
100 Impl::multLink(Uchi, U[sU], chi, dir, SE, st); \
/*
 * GENERIC_DHOPDIR_LEG(Dir,spProj,Recon): expands GENERIC_DHOPDIR_LEG_BODY only
 * when the in-scope variable `gamma` equals Dir.
 * NOTE(review): the lines that close the `if` are not part of this excerpt.
 */
103#define GENERIC_DHOPDIR_LEG(Dir,spProj,Recon) \
104 if (gamma == Dir) { \
105 GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,Recon); \
114 SiteHalfSpinor *buf,
int sF,
115 int sU,
const FermionFieldView &in, FermionFieldView &out)
125 const int Nsimd = SiteHalfSpinor::Nsimd();
140 SiteHalfSpinor *buf,
int sF,
141 int sU,
const FermionFieldView &in, FermionFieldView &out)
152 const int Nsimd = SiteHalfSpinor::Nsimd();
169 SiteHalfSpinor *buf,
int sF,
170 int sU,
const FermionFieldView &in, FermionFieldView &out)
180 const int Nsimd = SiteHalfSpinor::Nsimd();
197 SiteHalfSpinor *buf,
int sF,
198 int sU,
const FermionFieldView &in, FermionFieldView &out)
202 const int Nsimd = SiteHalfSpinor::Nsimd();
227 SiteHalfSpinor *buf,
int sF,
228 int sU,
const FermionFieldView &in, FermionFieldView &out)
238 const int Nsimd = SiteHalfSpinor::Nsimd();
251 out_t = out_t + result;
258 SiteHalfSpinor *buf,
int sF,
259 int sU,
const FermionFieldView &in, FermionFieldView &out)
269 const int Nsimd = SiteHalfSpinor::Nsimd();
282 out_t = out_t + result;
/*
 * DhopDirMacro(Dir,spProj,spRecon): generates the definition of the member
 * function template WilsonKernels<Impl>::DhopDir<Dir> (name formed by token
 * pasting DhopDir##Dir).
 * Visible body: defines the per-lane working types from coalescedRead(buf[0])
 * and coalescedRead(in[0]); declares chi and Uchi; obtains the lane with
 * acceleratorSIMTlane(SiteHalfSpinor::Nsimd()); fetches the stencil entry with
 * st.GetEntry(ptype, dir, sF), where `dir` is the run-time function argument;
 * expands GENERIC_DHOPDIR_LEG_BODY; and stores `result` into out[sF] with
 * coalescedWrite for that lane.
 * NOTE(review): source lines 291, 295, 297-298, 301 and 305 onward are absent
 * from this excerpt, so the declarations of `result`, SE and ptype and the
 * function's braces are not shown.
 */
287#define DhopDirMacro(Dir,spProj,spRecon) \
288 template <class Impl> accelerator_inline \
289 void WilsonKernels<Impl>::DhopDir##Dir(StencilView &st, DoubledGaugeFieldView &U,SiteHalfSpinor *buf, int sF, \
290 int sU, const FermionFieldView &in, FermionFieldView &out, int dir) \
292 typedef decltype(coalescedRead(buf[0])) calcHalfSpinor; \
293 typedef decltype(coalescedRead(in[0])) calcSpinor; \
294 calcHalfSpinor chi; \
296 calcHalfSpinor Uchi; \
299 const int Nsimd = SiteHalfSpinor::Nsimd(); \
300 const int lane=acceleratorSIMTlane(Nsimd); \
302 SE = st.GetEntry(ptype, dir, sF); \
303 GENERIC_DHOPDIR_LEG_BODY(Dir,spProj,spRecon); \
304 coalescedWrite(out[sF], result,lane); \
318 int sU,
const FermionFieldView &in, FermionFieldView &out,
int dir,
int gamma)
327 const int Nsimd = SiteHalfSpinor::Nsimd();
330 SE = st.GetEntry(
ptype, dir, sF);
344 int Nsite,
const FermionField &in, std::vector<FermionField> &out)
358 auto CBp=st.CommBuf();
362 DhopDirXm(st_v,U_v,CBp,sF,sU,in_v,out_Xm,0);
363 DhopDirYm(st_v,U_v,CBp,sF,sU,in_v,out_Ym,1);
364 DhopDirZm(st_v,U_v,CBp,sF,sU,in_v,out_Zm,2);
365 DhopDirTm(st_v,U_v,CBp,sF,sU,in_v,out_Tm,3);
366 DhopDirXp(st_v,U_v,CBp,sF,sU,in_v,out_Xp,4);
367 DhopDirYp(st_v,U_v,CBp,sF,sU,in_v,out_Yp,5);
368 DhopDirZp(st_v,U_v,CBp,sF,sU,in_v,out_Zp,6);
369 DhopDirTp(st_v,U_v,CBp,sF,sU,in_v,out_Tp,7);
376 int Nsite,
const FermionField &in, FermionField &out,
int dirdisp,
int gamma)
385 auto CBp=st.CommBuf();
/*
 * LoopBody(Dir): local helper macro that launches an accelerator_for over
 * ss in [0, Nsite) with Simd::Nsimd() lanes, loops s over [0, Ls), and calls
 * DhopDir<Dir>(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp) for each iteration.
 * NOTE(review): source lines 387, 390-391 and 393 onward are absent from this
 * excerpt, so how sF and sU are derived from ss and s is not shown.
 */
386#define LoopBody(Dir) \
388 accelerator_for(ss,Nsite,Simd::Nsimd(),{ \
389 for(int s=0;s<Ls;s++){ \
392 DhopDir##Dir(st_v,U_v,CBp,sF,sU,in_v,out_v,dirdisp);\
/*
 * External declarations of Intel device-side functions, each marked
 * SYCL_EXTERNAL and __attribute__((overloadable)). Only
 * intel_get_eu_id, intel_get_slice_id and intel_get_subslice_id are referenced
 * by visible code (the MAKE_ID macro); the others are declared but not used in
 * this excerpt.
 * NOTE(review): the meaning of the returned values and of the reg/flag/value
 * arguments is not established anywhere in this file - consult the vendor
 * documentation before relying on them.
 */
416 ulong SYCL_EXTERNAL __attribute__((overloadable)) intel_get_cycle_counter(
void );
417 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_active_channel_mask(
void );
418 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_grf_register( uint reg );
419 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_flag_register( uint flag );
420 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_control_register( uint reg );
421 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_hw_thread_id(
void );
422 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_slice_id(
void );
423 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_subslice_id(
void );
424 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_id(
void );
425 uint SYCL_EXTERNAL __attribute__((overloadable)) intel_get_eu_thread_id(
void );
426 void SYCL_EXTERNAL __attribute__((overloadable)) intel_eu_thread_pause( uint value );
/*
 * MAKE_ID(A): combines three Intel device identifiers into one integer by
 * OR-ing intel_get_eu_id() shifted left by 16, intel_get_slice_id() shifted
 * left by 8, and intel_get_subslice_id(). The argument A is unused.
 * The whole expression is wrapped in parentheses so the expansion stays a
 * single operand when combined with other operators (for example
 * `MAKE_ID() & mask` would otherwise apply the mask to the last term only).
 */
#define MAKE_ID(A) ((intel_get_eu_id()<<16)|(intel_get_slice_id()<<8)|(intel_get_subslice_id()))
/*
 * Fallback definitions of MAKE_ID: the argument is ignored and the expansion
 * is the constant (0).
 * NOTE(review): the preprocessor conditionals that choose between the MAKE_ID
 * variants are not part of this excerpt.
 */
431#define MAKE_ID(A) (0)
436#define MAKE_ID(A) (0)
/*
 * KERNEL_CALL_ID(A): launches WilsonKernels<Impl>::A over NN = Nsite*Ls work
 * items with accelerator_forNB, and in the same kernel body computes the lane,
 * a per-lane index idx = sF*Nsimd+lane and an id from MAKE_ID(); it then waits
 * with accelerator_barrier().
 * Expects Nsite, Ls, st_v, U_v, buf, in_v and out_v at the expansion site.
 * NOTE(review): Nsite*Ls is evaluated in the operands' own type before being
 * stored in the uint64_t; the kernel entry points in this file declare both as
 * int, so a very large product would overflow before widening - confirm.
 * NOTE(review): source lines 444-445 and 451-452 are absent from this excerpt,
 * so how sF/sU are derived from ss and what is done with idx and id are not
 * shown.
 */
441#define KERNEL_CALL_ID(A) \
442 const uint64_t NN = Nsite*Ls; \
443 accelerator_forNB( ss, NN, Simd::Nsimd(), { \
446 WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
447 const int Nsimd = SiteHalfSpinor::Nsimd(); \
448 const int lane=acceleratorSIMTlane(Nsimd); \
449 int idx=sF*Nsimd+lane; \
450 uint64_t id = MAKE_ID(); \
453 accelerator_barrier();
/*
 * KERNEL_CALLNB(A): launches WilsonKernels<Impl>::A over NN = Nsite*Ls work
 * items with accelerator_forNB and issues no barrier in the visible lines
 * (KERNEL_CALL adds the accelerator_barrier()).
 * Declares a local `NN`, so it can be expanded at most once per scope.
 * NOTE(review): source lines 458-459 and 461 onward are absent from this
 * excerpt, so how sF and sU are derived from ss is not shown.
 */
455#define KERNEL_CALLNB(A) \
456 const uint64_t NN = Nsite*Ls; \
457 accelerator_forNB( ss, NN, Simd::Nsimd(), { \
460 WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
/*
 * KERNEL_CALL(A): KERNEL_CALLNB(A) followed by accelerator_barrier().
 * Expands to several statements, so it must be used inside its own braces
 * rather than as the unbraced body of an if/else.
 */
463#define KERNEL_CALL(A) KERNEL_CALLNB(A); accelerator_barrier();
/*
 * KERNEL_CALL_EXT(A): launches WilsonKernels<Impl>::A with accelerator_forNB
 * over sz = st.surface_list.size() work items, having first taken a pointer to
 * the first element of st.surface_list; it then waits with
 * accelerator_barrier().
 * NOTE(review): source lines 469-470 and 472 are absent from this excerpt, so
 * how `ptr` is used to obtain sF and sU is not shown.
 */
465#define KERNEL_CALL_EXT(A) \
466 const uint64_t sz = st.surface_list.size(); \
467 auto ptr = &st.surface_list[0]; \
468 accelerator_forNB( ss, sz, Simd::Nsimd(), { \
471 WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,in_v,out_v); \
473 accelerator_barrier();
476 thread_for( sss, Nsite, { \
480 WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
/*
 * ASM_CALL_SLICE(A): calls WilsonKernels<Impl>::A slice by slice.
 * nt is taken from in.Grid()->LocalDimensions()[4] and nxyz = Nsite/nt; for
 * each t in [0, nt) a thread_for runs sss over [0, nxyz) with the site index
 * ss = t*nxyz+sss, and A is invoked as A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v).
 * NOTE(review): source lines 489-490 and 492 onward are absent from this
 * excerpt, so how sF and sU are derived from ss is not shown.
 */
482#define ASM_CALL_SLICE(A) \
483 auto grid = in.Grid() ; \
484 int nt = grid->LocalDimensions()[4]; \
485 int nxyz = Nsite/nt ; \
486 for(int t=0;t<nt;t++){ \
487 thread_for( sss, nxyz, { \
488 int ss = t*nxyz+sss; \
491 WilsonKernels<Impl>::A(st_v,U_v,buf,sF,sU,Ls,1,in_v,out_v); \
498 int Ls,
int Nsite,
const FermionField &in, FermionField &out,
499 int interior,
int exterior)
506 if( interior && exterior ) {
513 }
else if( interior ) {
519 }
else if( exterior ) {
528 assert(0 &&
" Kernel optimisation case not covered ");
533 int Ls,
int Nsite,
const FermionField &in, FermionField &out,
542 template <
class Impl>
544 int Ls,
int Nsite,
const FermionField &in, FermionField &out,
545 int interior,
int exterior)
552 if( interior && exterior ) {
559 }
else if( interior ) {
565 }
else if( exterior ) {
574 assert(0 &&
" Kernel optimisation case not covered ");
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
void acceleratorFenceComputeStream(void)
#define accelerator_for(iterator, num, nsimd,...)
#define autoView(l_v, l, mode)
#define NAMESPACE_BEGIN(A)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
accelerator_inline void spProjXp(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spReconZm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void accumReconYp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spProjYm(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void accumReconZm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spProjTm(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spProjZp(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spProjTp(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spReconXp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spReconTp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spProjZm(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void spReconTm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spReconYp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void accumReconYm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spProjXm(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void accumReconTp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spProjYp(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
accelerator_inline void accumReconZp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spReconXm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void accumReconXp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void accumReconTm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void accumReconXm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spReconYm(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
accelerator_inline void spReconZp(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
#define GENERIC_STENCIL_LEG_INT(Dir, spProj, Recon)
#define KERNEL_CALL_ID(A)
#define KERNEL_CALL_EXT(A)
#define GENERIC_STENCIL_LEG_EXT(Dir, spProj, Recon)
#define DhopDirMacro(Dir, spProj, spRecon)
#define GENERIC_DHOPDIR_LEG(Dir, spProj, Recon)
#define GENERIC_STENCIL_LEG(Dir, spProj, Recon)
static INTERNAL_PRECISION U
static accelerator_inline void DhopDirXm(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator_inline void DhopDirYp(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator_inline void DhopDirK(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp, int gamma)
static void AsmDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out)
static accelerator_inline void DhopDirTm(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator void GenericDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator void GenericDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static void AsmDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out)
static void DhopDagKernel(int Opt, StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1, int exterior=1)
static accelerator void HandDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator void HandDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator void HandDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator_inline void DhopDirXp(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator void HandDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator void GenericDhopSiteExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static void AsmDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out)
static void DhopDirKernel(StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int dirdisp, int gamma)
static void AsmDhopSiteDagExt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out)
static accelerator_inline void DhopDirTp(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator void GenericDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator_inline void DhopDirZm(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator_inline void DhopDirYm(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator void GenericDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static void DhopDirAll(StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, std::vector< FermionField > &out)
static accelerator_inline void DhopDirZp(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dirdisp)
static accelerator void HandDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static void AsmDhopSiteDagInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out)
static void AsmDhopSiteInt(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, int Ls, int Nsite, const FermionFieldView &in, FermionFieldView &out)
static accelerator void HandDhopSiteDag(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static accelerator void GenericDhopSite(StencilView &st, DoubledGaugeFieldView &U, SiteHalfSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out)
static void DhopKernel(int Opt, StencilImpl &st, DoubledGaugeField &U, SiteHalfSpinor *buf, int Ls, int Nsite, const FermionField &in, FermionField &out, int interior=1, int exterior=1)