// GENERIC_STENCIL_LEG(U,Dir,skew,multLink)
// Statement-sequence macro for a single stencil leg, expanded inline at the use site.
//   U        - indexed as U[sU]; the element is handed to multLink.
//   Dir      - passed to multLink, and added to skew to form the stencil lookup index.
//   skew     - offset added to Dir in the st.GetEntry call.
//   multLink - invoked as multLink(Uchi, U[sU], chi, Dir).
// Steps as written: SE = st.GetEntry(ptype, Dir+skew, sF); when SE->_is_local is set,
// chi is loaded from in[SE->_offset] through coalescedReadPermute using SE->_permute;
// chi is loaded from buf[SE->_offset] through coalescedRead; acceleratorSynchronise()
// is called; finally multLink is applied.
// The macro declares none of SE, st, ptype, sF, in, buf, lane, chi, Uchi or sU; all of
// them must already be in scope where it is expanded.
// NOTE(review): in the text visible here the `if` block is never closed and nothing
// separates the two loads of chi, so the buf read is unguarded. Presumably an
// else-branch and a closing brace were lost; confirm against the complete file.
35#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
36 SE = st.GetEntry(ptype, Dir+skew, sF); \
37 if (SE->_is_local ) { \
38 int perm= SE->_permute; \
39 chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
41 chi = coalescedRead(buf[SE->_offset],lane); \
43 acceleratorSynchronise(); \
44 multLink(Uchi, U[sU], chi, Dir);
46#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
47 SE = st.GetEntry(ptype, Dir+skew, sF); \
48 if (SE->_is_local ) { \
49 int perm= SE->_permute; \
50 chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
51 } else if ( st.same_node[Dir] ) { \
52 chi = coalescedRead(buf[SE->_offset],lane); \
54 if (SE->_is_local || st.same_node[Dir] ) { \
55 multLink(Uchi, U[sU], chi, Dir); \
58#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
59 SE = st.GetEntry(ptype, Dir+skew, sF); \
60 if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
62 chi = coalescedRead(buf[SE->_offset],lane); \
63 multLink(Uchi, U[sU], chi, Dir); \
76 DoubledGaugeFieldView &
U, DoubledGaugeFieldView &UUU,
77 SiteSpinor *buf,
int sF,
int sU,
78 const FermionFieldView &in, FermionFieldView &out,
int dag)
86 const int Nsimd = SiteHalfSpinor::Nsimd();
126 DoubledGaugeFieldView &
U, DoubledGaugeFieldView &UUU,
127 SiteSpinor *buf,
int sF,
int sU,
128 const FermionFieldView &in, FermionFieldView &out,
int dag)
136 const int Nsimd = SiteHalfSpinor::Nsimd();
177 DoubledGaugeFieldView &
U, DoubledGaugeFieldView &UUU,
178 SiteSpinor *buf,
int sF,
int sU,
179 const FermionFieldView &in, FermionFieldView &out,
int dag)
188 const int Nsimd = SiteHalfSpinor::Nsimd();
231 int sF,
int sU,
const FermionFieldView &in, FermionFieldView &out,
int dir,
int disp)
240#define KERNEL_CALLNB(A,improved) \
241 const uint64_t NN = Nsite*Ls; \
242 accelerator_forNB( ss, NN, Simd::Nsimd(), { \
245 ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
248#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier(); // Expands to KERNEL_CALLNB(A,improved) followed by accelerator_barrier(): two unbraced statements, so use it only where a statement sequence is valid (not as the sole body of an unbraced if/else).
251 const uint64_t NN = Nsite*Ls; \
252 thread_for( ss, NN, { \
255 ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
260 DoubledGaugeField &
U, DoubledGaugeField &UUU,
261 const FermionField &in, FermionField &out,
int dag,
int interior,
int exterior)
266 const int Nsimd = SiteHalfSpinor::Nsimd();
273 SiteSpinor * buf = st.CommBuf();
276 if(FGrid->
Nd()==UGrid->
Nd()+1){
279 int Nsite = UGrid->
oSites();
281 if( interior && exterior ) {
287 }
else if( interior ) {
290 }
else if( exterior ) {
294 assert(0 &&
" Kernel optimisation case not covered ");
298 DoubledGaugeField &
U,
299 const FermionField &in, FermionField &out,
int dag,
int interior,
int exterior)
304 const int Nsimd = SiteHalfSpinor::Nsimd();
311 SiteSpinor * buf = st.CommBuf();
314 if(FGrid->
Nd()==UGrid->
Nd()+1){
317 int Nsite = UGrid->
oSites();
319 if( interior && exterior ) {
322 }
else if( interior ) {
325 }
else if( exterior ) {
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
#define autoView(l_v, l, mode)
#define NAMESPACE_BEGIN(A)
#define GENERIC_STENCIL_LEG_EXT(U, Dir, skew, multLink)
#define GENERIC_STENCIL_LEG_INT(U, Dir, skew, multLink)
#define KERNEL_CALL(A, improved)
#define GENERIC_STENCIL_LEG(U, Dir, skew, multLink)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
static INTERNAL_PRECISION U
void DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior, int exterior)
void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
StaggeredKernels(const ImplParams &p=ImplParams())
void DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior, int exterior)
static accelerator_inline void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
FermionOperator< Impl > Base
static accelerator_inline void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)