// LOAD_CHI(ptype,b): load the three components of one site spinor with a permute.
//   ptype - forwarded as the template argument of coalescedReadPermute.
//   b     - indexable container of SiteSpinor; element b[offset] is read.
// Declares a local reference `ref` to b[offset] and assigns Chi_0, Chi_1, Chi_2
// from ref()()(0), ref()()(1), ref()()(2) through coalescedReadPermute, passing
// `perm` and `lane`. The names offset, perm, lane, Chi_0..Chi_2 and SiteSpinor
// must already be in scope where the macro is expanded. Because the expansion
// declares `ref`, it can appear only once per scope.
// NOTE(review): a second LOAD_CHI definition appears further down; the
// conditional that selects between the two is not visible in this listing.
37#define LOAD_CHI(ptype,b) \
38 const SiteSpinor & ref (b[offset]); \
39 Chi_0=coalescedReadPermute<ptype>(ref()()(0),perm,lane); \
40 Chi_1=coalescedReadPermute<ptype>(ref()()(1),perm,lane); \
41 Chi_2=coalescedReadPermute<ptype>(ref()()(2),perm,lane);
// LOAD_CHI_COMMS(b): load the three components of one site spinor without a permute.
//   b - indexable container of SiteSpinor; element b[offset] is read.
// Same shape as LOAD_CHI above, but each component is fetched with
// coalescedRead(component, lane); `perm` is not consulted.
// Expects offset, lane, Chi_0..Chi_2 and SiteSpinor in the expanding scope, and
// declares a local `ref` there.
43#define LOAD_CHI_COMMS(b) \
44 const SiteSpinor & ref (b[offset]); \
45 Chi_0=coalescedRead(ref()()(0),lane); \
46 Chi_1=coalescedRead(ref()()(1),lane); \
47 Chi_2=coalescedRead(ref()()(2),lane);
// PERMUTE_DIR(dir): this variant expands to a lone `;` (an empty statement) and
// ignores `dir`. A non-empty PERMUTE_DIR is defined further down; the conditional
// choosing between them is not visible in this listing.
49#define PERMUTE_DIR(dir) ;
// LOAD_CHI(ptype,b): this variant simply forwards to LOAD_CHI_COMMS(b); the
// `ptype` argument is dropped, so no permute happens during the load itself.
51#define LOAD_CHI(ptype,b) LOAD_CHI_COMMS(b)
// LOAD_CHI_COMMS(b): second definition of this macro. Only its first statement
// (binding `ref` to b[offset]) is visible; the listing skips lines 55-58, so the
// statements that fill Chi_0..Chi_2 are not shown here.
// NOTE(review): presumably reads the components directly rather than through
// coalescedRead - confirm against the full file.
53#define LOAD_CHI_COMMS(b) \
54 const SiteSpinor & ref (b[offset]); \
// PERMUTE_DIR(dir): token-pastes `dir` onto `permute` (e.g. dir=3 -> permute3)
// and calls the resulting function once for each of Chi_0, Chi_1, Chi_2, passing
// the same variable as both arguments.
// NOTE(review): which argument is the output is decided by the permute##dir
// functions, which are declared elsewhere.
59#define PERMUTE_DIR(dir) \
60 permute##dir(Chi_0,Chi_0); \
61 permute##dir(Chi_1,Chi_1); \
62 permute##dir(Chi_2,Chi_2);
// Body of a macro whose #define line is missing from this listing
// (NOTE(review): presumably MULT, by analogy with MULT_ADD below - confirm).
// Steps performed by the visible lines:
//   1. bind `ref` to the matrix U[sU](A);
//   2. read its nine entries ref()(i,j) into U_ij with coalescedRead(..., lane);
//   3. form UChi_i = U_i0*Chi_0 + U_i1*Chi_1 + U_i2*Chi_2 for i = 0,1,2.
// The first term of each row uses `=`, so any previous contents of
// UChi_0..UChi_2 are overwritten. `UChi ## _0` pastes the UChi argument onto the
// suffix, so the accumulator names come from the macro argument.
69 auto & ref(U[sU](A)); \
70 U_00=coalescedRead(ref()(0,0),lane); \
71 U_10=coalescedRead(ref()(1,0),lane); \
72 U_20=coalescedRead(ref()(2,0),lane); \
73 U_01=coalescedRead(ref()(0,1),lane); \
74 U_11=coalescedRead(ref()(1,1),lane); \
75 U_21=coalescedRead(ref()(2,1),lane); \
76 U_02=coalescedRead(ref()(0,2),lane); \
77 U_12=coalescedRead(ref()(1,2),lane); \
78 U_22=coalescedRead(ref()(2,2),lane); \
79 UChi ## _0 = U_00*Chi_0; \
80 UChi ## _1 = U_10*Chi_0;\
81 UChi ## _2 = U_20*Chi_0;\
82 UChi ## _0 += U_01*Chi_1;\
83 UChi ## _1 += U_11*Chi_1;\
84 UChi ## _2 += U_21*Chi_1;\
85 UChi ## _0 += U_02*Chi_2;\
86 UChi ## _1 += U_12*Chi_2;\
87 UChi ## _2 += U_22*Chi_2;
// MULT_ADD(U,A,UChi): accumulate a 3x3 matrix times the 3-vector Chi into UChi.
//   U    - gauge-field container; the matrix used is U[sU](A).
//   A    - index selecting which matrix of the site is used.
//   UChi - name prefix of the accumulators; UChi_0, UChi_1, UChi_2 are updated.
// Reads the nine entries ref()(i,j) into U_ij with coalescedRead(..., lane), then
// adds U_i0*Chi_0 + U_i1*Chi_1 + U_i2*Chi_2 to UChi_i for i = 0,1,2. Every term
// uses `+=`, so the accumulators must hold a meaningful value beforehand.
// Expects sU, lane, U_00..U_22 and Chi_0..Chi_2 in the expanding scope, and
// declares a local `ref` there.
89#define MULT_ADD(U,A,UChi) \
90 auto & ref(U[sU](A)); \
91 U_00=coalescedRead(ref()(0,0),lane); \
92 U_10=coalescedRead(ref()(1,0),lane); \
93 U_20=coalescedRead(ref()(2,0),lane); \
94 U_01=coalescedRead(ref()(0,1),lane); \
95 U_11=coalescedRead(ref()(1,1),lane); \
96 U_21=coalescedRead(ref()(2,1),lane); \
97 U_02=coalescedRead(ref()(0,2),lane); \
98 U_12=coalescedRead(ref()(1,2),lane); \
99 U_22=coalescedRead(ref()(2,2),lane); \
100 UChi ## _0 += U_00*Chi_0; \
101 UChi ## _1 += U_10*Chi_0;\
102 UChi ## _2 += U_20*Chi_0;\
103 UChi ## _0 += U_01*Chi_1;\
104 UChi ## _1 += U_11*Chi_1;\
105 UChi ## _2 += U_21*Chi_1;\
106 UChi ## _0 += U_02*Chi_2;\
107 UChi ## _1 += U_12*Chi_2;\
108 UChi ## _2 += U_22*Chi_2;
// HAND_STENCIL_LEG_BASE(Dir,Perm,skew): common front end of one stencil leg.
// Visible steps: fetch the stencil entry for (ptype, Dir+skew, sF) from `st`,
// copy its _offset, _is_local and _permute fields into the locals offset, local
// and perm, and (on listing line 122) load Chi from `buf` via LOAD_CHI_COMMS.
// NOTE(review): listing lines 116-121 and 123 are missing, so the condition
// guarding the LOAD_CHI_COMMS(buf) call and any other load path are not shown;
// `Perm` is not used by any visible line.
111#define HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
112 SE=st.GetEntry(ptype,Dir+skew,sF); \
113 offset = SE->_offset; \
114 local = SE->_is_local; \
115 perm = SE->_permute; \
122 LOAD_CHI_COMMS(buf); \
// HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even): starts by expanding
// HAND_STENCIL_LEG_BASE(Dir,Perm,skew). The rest of the macro (listing lines
// 127-129) is missing here.
// NOTE(review): presumably followed by the overwriting multiply so that this leg
// initialises the accumulator - confirm against the full file.
125#define HAND_STENCIL_LEG_BEGIN(Dir,Perm,skew,even) \
126 HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
// HAND_STENCIL_LEG(U,Dir,Perm,skew,even): one accumulating stencil leg.
// Expands HAND_STENCIL_LEG_BASE(Dir,Perm,skew) to load Chi, then
// MULT_ADD(U,Dir,even), which adds U[sU](Dir)*Chi to even_0..even_2 (the third
// argument is the accumulator name prefix).
// NOTE(review): listing lines 133 and 135 are missing, so braces or other
// statements around the MULT_ADD call are not shown.
131#define HAND_STENCIL_LEG(U,Dir,Perm,skew,even) \
132 HAND_STENCIL_LEG_BASE(Dir,Perm,skew) \
134 MULT_ADD(U,Dir,even); \
// HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even): stencil leg that contributes only
// when the neighbour is available without off-node data.
// Visible steps: the same stencil-entry lookup as HAND_STENCIL_LEG_BASE; an
// `else if (st.same_node[Dir])` branch that loads Chi from `buf` with
// LOAD_CHI_COMMS; and a MULT_ADD(U,Dir,even) executed only when
// `local || st.same_node[Dir]` holds.
// NOTE(review): listing lines 143-147, 150 and 153 are missing; the first branch
// of the if/else chain (presumably the `local` load from `in`) is not shown.
138#define HAND_STENCIL_LEG_INT(U,Dir,Perm,skew,even) \
139 SE=st.GetEntry(ptype,Dir+skew,sF); \
140 offset = SE->_offset; \
141 local = SE->_is_local; \
142 perm = SE->_permute; \
148 } else if ( st.same_node[Dir] ) { \
149 LOAD_CHI_COMMS(buf); \
151 if (local || st.same_node[Dir] ) { \
152 MULT_ADD(U,Dir,even); \
// HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even): stencil leg that contributes only
// in the case HAND_STENCIL_LEG_INT skips: the entry is not local and
// st.same_node[Dir] is false. In that case Chi is loaded from `buf` with
// LOAD_CHI_COMMS and MULT_ADD(U,Dir,even) accumulates into even_0..even_2.
// The load and the multiply each sit in their own braces, so the `ref` each of
// them declares does not clash with the other. `perm` is not read by the visible
// lines.
// NOTE(review): listing lines 160 and 163 are missing (one statement inside the
// `if` and, presumably, its closing brace).
155#define HAND_STENCIL_LEG_EXT(U,Dir,Perm,skew,even) \
156 SE=st.GetEntry(ptype,Dir+skew,sF); \
157 offset = SE->_offset; \
158 local = SE->_is_local; \
159 if ((!local) && (!st.same_node[Dir]) ) { \
161 { LOAD_CHI_COMMS(buf); } \
162 { MULT_ADD(U,Dir,even); } \
// HAND_DECLARATIONS(Simd): only the header line is visible; the replacement list
// (listing lines 166 onward) is missing here.
// NOTE(review): presumably declares the Chi_*, U_*, even_* and odd_* temporaries
// used by the macros above - confirm against the full file.
165#define HAND_DECLARATIONS(Simd) \
191 DoubledGaugeFieldView &
U,DoubledGaugeFieldView &UUU,
192 SiteSpinor *buf,
int sF,
int sU,
193 const FermionFieldView &in, FermionFieldView &out,
int dag)
// Fragment of a function definition: the declarator line (return type, name and
// leading parameters) and most of the body are missing from this listing.
// NOTE(review): presumably the first of the three hand-unrolled site kernels
// (DhopSiteHand) - confirm against the full file.
// S and V alias the scalar and vector types exposed by the Simd template argument.
195 typedef typename Simd::scalar_type S;
196 typedef typename Simd::vector_type V;
// Lane count is taken from SiteHalfSpinor, not from Simd directly.
199 const int Nsimd = SiteHalfSpinor::Nsimd();
205 calcSiteSpinor result;
// Two alternative ways of filling the three result components follow: the
// negated sum and the plain sum of the even_i and odd_i accumulators. The
// condition choosing between them is not visible here
// (presumably the `dag` parameter - TODO confirm).
236 result()()(0) = - even_0 - odd_0;
237 result()()(1) = - even_1 - odd_1;
238 result()()(2) = - even_2 - odd_2;
240 result()()(0) = even_0 + odd_0;
241 result()()(1) = even_1 + odd_1;
242 result()()(2) = even_2 + odd_2;
252 DoubledGaugeFieldView &
U, DoubledGaugeFieldView &UUU,
253 SiteSpinor *buf,
int sF,
int sU,
254 const FermionFieldView &in, FermionFieldView &out,
int dag)
// Fragment of a function definition: the declarator line and most of the body
// are missing from this listing.
// NOTE(review): presumably the interior-only site kernel (DhopSiteHandInt) -
// confirm against the full file.
// S and V alias the scalar and vector types exposed by the Simd template argument.
256 typedef typename Simd::scalar_type S;
257 typedef typename Simd::vector_type V;
// Lane count is taken from SiteHalfSpinor, not from Simd directly.
259 const int Nsimd = SiteHalfSpinor::Nsimd();
265 calcSiteSpinor result;
// Two alternative ways of filling the three result components follow: the
// negated sum and the plain sum of the even_i and odd_i accumulators. The
// condition choosing between them is not visible here
// (presumably the `dag` parameter - TODO confirm).
300 result()()(0) = - even_0 - odd_0;
301 result()()(1) = - even_1 - odd_1;
302 result()()(2) = - even_2 - odd_2;
304 result()()(0) = even_0 + odd_0;
305 result()()(1) = even_1 + odd_1;
306 result()()(2) = even_2 + odd_2;
316 DoubledGaugeFieldView &
U, DoubledGaugeFieldView &UUU,
317 SiteSpinor *buf,
int sF,
int sU,
318 const FermionFieldView &in, FermionFieldView &out,
int dag)
// Fragment of a function definition: the declarator line and most of the body
// are missing from this listing.
// NOTE(review): presumably the exterior-only site kernel (DhopSiteHandExt) -
// confirm against the full file.
// S and V alias the scalar and vector types exposed by the Simd template argument.
320 typedef typename Simd::scalar_type S;
321 typedef typename Simd::vector_type V;
// Lane count is taken from SiteHalfSpinor, not from Simd directly.
323 const int Nsimd = SiteHalfSpinor::Nsimd();
329 calcSiteSpinor result;
// Two alternative ways of filling the three result components follow: the
// negated sum and the plain sum of the even_i and odd_i accumulators. The
// condition choosing between them is not visible here
// (presumably the `dag` parameter - TODO confirm).
365 result()()(0) = - even_0 - odd_0;
366 result()()(1) = - even_1 - odd_1;
367 result()()(2) = - even_2 - odd_2;
369 result()()(0) = even_0 + odd_0;
370 result()()(1) = even_1 + odd_1;
371 result()()(2) = even_2 + odd_2;
379#undef HAND_DECLARATIONS
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
accelerator_inline void zeroit(Grid_simd2< S, V > &z)
#define NAMESPACE_BEGIN(A)
#define HAND_STENCIL_LEG(U, Dir, Perm, skew, even)
#define HAND_STENCIL_LEG_EXT(U, Dir, Perm, skew, even)
#define HAND_STENCIL_LEG_BEGIN(Dir, Perm, skew, even)
#define HAND_STENCIL_LEG_INT(U, Dir, Perm, skew, even)
#define HAND_DECLARATIONS(Simd)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
static INTERNAL_PRECISION U
static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)