Grid 0.7.0
StaggeredKernelsImplementation.h
Go to the documentation of this file.
1/*************************************************************************************
2
3Grid physics library, www.github.com/paboyle/Grid
4
5Source file: ./lib/qcd/action/fermion/WilsonKernels.cc
6
7Copyright (C) 2015
8
9Author: Azusa Yamaguchi, Peter Boyle
10
11This program is free software; you can redistribute it and/or modify
12it under the terms of the GNU General Public License as published by
13the Free Software Foundation; either version 2 of the License, or
14(at your option) any later version.
15
16This program is distributed in the hope that it will be useful,
17but WITHOUT ANY WARRANTY; without even the implied warranty of
18MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19GNU General Public License for more details.
20
21You should have received a copy of the GNU General Public License along
22with this program; if not, write to the Free Software Foundation, Inc.,
2351 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25See the full license in the file "LICENSE" in the top level distribution
26directory
27*************************************************************************************/
28/* END LEGAL */
30
31#pragma once
32
34
35#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink) \
36 SE = st.GetEntry(ptype, Dir+skew, sF); \
37 if (SE->_is_local ) { \
38 int perm= SE->_permute; \
39 chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
40 } else { \
41 chi = coalescedRead(buf[SE->_offset],lane); \
42 } \
43 acceleratorSynchronise(); \
44 multLink(Uchi, U[sU], chi, Dir);
45
46#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink) \
47 SE = st.GetEntry(ptype, Dir+skew, sF); \
48 if (SE->_is_local ) { \
49 int perm= SE->_permute; \
50 chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);\
51 } else if ( st.same_node[Dir] ) { \
52 chi = coalescedRead(buf[SE->_offset],lane); \
53 } \
54 if (SE->_is_local || st.same_node[Dir] ) { \
55 multLink(Uchi, U[sU], chi, Dir); \
56 }
57
58#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink) \
59 SE = st.GetEntry(ptype, Dir+skew, sF); \
60 if ((!SE->_is_local) && (!st.same_node[Dir]) ) { \
61 nmu++; \
62 chi = coalescedRead(buf[SE->_offset],lane); \
63 multLink(Uchi, U[sU], chi, Dir); \
64 }
65
66template <class Impl>
70// Generic implementation; move to different file?
71// Int, Ext, Int+Ext cases for comms overlap
73template <class Impl>
74template <int Naik> accelerator_inline
76 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
77 SiteSpinor *buf, int sF, int sU,
78 const FermionFieldView &in, FermionFieldView &out, int dag)
79{
80 typedef decltype(coalescedRead(in[0])) calcSpinor;
81 calcSpinor chi;
82 calcSpinor Uchi;
83 StencilEntry *SE;
84 int ptype;
85 int skew;
86 const int Nsimd = SiteHalfSpinor::Nsimd();
87 const int lane=acceleratorSIMTlane(Nsimd);
88
89 // for(int s=0;s<LLs;s++){
90 //
91 // int sF=LLs*sU+s;
92 {
93 skew = 0;
94 GENERIC_STENCIL_LEG(U,Xp,skew,Impl::multLink);
95 GENERIC_STENCIL_LEG(U,Yp,skew,Impl::multLinkAdd);
96 GENERIC_STENCIL_LEG(U,Zp,skew,Impl::multLinkAdd);
97 GENERIC_STENCIL_LEG(U,Tp,skew,Impl::multLinkAdd);
98 GENERIC_STENCIL_LEG(U,Xm,skew,Impl::multLinkAdd);
99 GENERIC_STENCIL_LEG(U,Ym,skew,Impl::multLinkAdd);
100 GENERIC_STENCIL_LEG(U,Zm,skew,Impl::multLinkAdd);
101 GENERIC_STENCIL_LEG(U,Tm,skew,Impl::multLinkAdd);
102 if ( Naik ) {
103 skew=8;
104 GENERIC_STENCIL_LEG(UUU,Xp,skew,Impl::multLinkAdd);
105 GENERIC_STENCIL_LEG(UUU,Yp,skew,Impl::multLinkAdd);
106 GENERIC_STENCIL_LEG(UUU,Zp,skew,Impl::multLinkAdd);
107 GENERIC_STENCIL_LEG(UUU,Tp,skew,Impl::multLinkAdd);
108 GENERIC_STENCIL_LEG(UUU,Xm,skew,Impl::multLinkAdd);
109 GENERIC_STENCIL_LEG(UUU,Ym,skew,Impl::multLinkAdd);
110 GENERIC_STENCIL_LEG(UUU,Zm,skew,Impl::multLinkAdd);
111 GENERIC_STENCIL_LEG(UUU,Tm,skew,Impl::multLinkAdd);
112 }
113 if ( dag ) {
114 Uchi = - Uchi;
115 }
116 coalescedWrite(out[sF], Uchi,lane);
117 }
118};
119
121 // Only contributions from interior of our node
123template <class Impl>
124template <int Naik> accelerator_inline
126 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
127 SiteSpinor *buf, int sF, int sU,
128 const FermionFieldView &in, FermionFieldView &out,int dag)
129{
130 typedef decltype(coalescedRead(in[0])) calcSpinor;
131 calcSpinor chi;
132 calcSpinor Uchi;
133 StencilEntry *SE;
134 int ptype;
135 int skew ;
136 const int Nsimd = SiteHalfSpinor::Nsimd();
137 const int lane=acceleratorSIMTlane(Nsimd);
138
139 // for(int s=0;s<LLs;s++){
140 // int sF=LLs*sU+s;
141 {
142 skew = 0;
143 Uchi=Zero();
144 GENERIC_STENCIL_LEG_INT(U,Xp,skew,Impl::multLinkAdd);
145 GENERIC_STENCIL_LEG_INT(U,Yp,skew,Impl::multLinkAdd);
146 GENERIC_STENCIL_LEG_INT(U,Zp,skew,Impl::multLinkAdd);
147 GENERIC_STENCIL_LEG_INT(U,Tp,skew,Impl::multLinkAdd);
148 GENERIC_STENCIL_LEG_INT(U,Xm,skew,Impl::multLinkAdd);
149 GENERIC_STENCIL_LEG_INT(U,Ym,skew,Impl::multLinkAdd);
150 GENERIC_STENCIL_LEG_INT(U,Zm,skew,Impl::multLinkAdd);
151 GENERIC_STENCIL_LEG_INT(U,Tm,skew,Impl::multLinkAdd);
152 if ( Naik ) {
153 skew=8;
154 GENERIC_STENCIL_LEG_INT(UUU,Xp,skew,Impl::multLinkAdd);
155 GENERIC_STENCIL_LEG_INT(UUU,Yp,skew,Impl::multLinkAdd);
156 GENERIC_STENCIL_LEG_INT(UUU,Zp,skew,Impl::multLinkAdd);
157 GENERIC_STENCIL_LEG_INT(UUU,Tp,skew,Impl::multLinkAdd);
158 GENERIC_STENCIL_LEG_INT(UUU,Xm,skew,Impl::multLinkAdd);
159 GENERIC_STENCIL_LEG_INT(UUU,Ym,skew,Impl::multLinkAdd);
160 GENERIC_STENCIL_LEG_INT(UUU,Zm,skew,Impl::multLinkAdd);
161 GENERIC_STENCIL_LEG_INT(UUU,Tm,skew,Impl::multLinkAdd);
162 }
163 if ( dag ) {
164 Uchi = - Uchi;
165 }
166 coalescedWrite(out[sF], Uchi,lane);
167 }
168};
169
170
172 // Only contributions from exterior of our node
174template <class Impl>
175template <int Naik> accelerator_inline
177 DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
178 SiteSpinor *buf, int sF, int sU,
179 const FermionFieldView &in, FermionFieldView &out,int dag)
180{
181 typedef decltype(coalescedRead(in[0])) calcSpinor;
182 calcSpinor chi;
183 calcSpinor Uchi;
184 StencilEntry *SE;
185 int ptype;
186 int nmu=0;
187 int skew ;
188 const int Nsimd = SiteHalfSpinor::Nsimd();
189 const int lane=acceleratorSIMTlane(Nsimd);
190
191 // for(int s=0;s<LLs;s++){
192 // int sF=LLs*sU+s;
193 {
194 skew = 0;
195 Uchi=Zero();
196 GENERIC_STENCIL_LEG_EXT(U,Xp,skew,Impl::multLinkAdd);
197 GENERIC_STENCIL_LEG_EXT(U,Yp,skew,Impl::multLinkAdd);
198 GENERIC_STENCIL_LEG_EXT(U,Zp,skew,Impl::multLinkAdd);
199 GENERIC_STENCIL_LEG_EXT(U,Tp,skew,Impl::multLinkAdd);
200 GENERIC_STENCIL_LEG_EXT(U,Xm,skew,Impl::multLinkAdd);
201 GENERIC_STENCIL_LEG_EXT(U,Ym,skew,Impl::multLinkAdd);
202 GENERIC_STENCIL_LEG_EXT(U,Zm,skew,Impl::multLinkAdd);
203 GENERIC_STENCIL_LEG_EXT(U,Tm,skew,Impl::multLinkAdd);
204 if ( Naik ) {
205 skew=8;
206 GENERIC_STENCIL_LEG_EXT(UUU,Xp,skew,Impl::multLinkAdd);
207 GENERIC_STENCIL_LEG_EXT(UUU,Yp,skew,Impl::multLinkAdd);
208 GENERIC_STENCIL_LEG_EXT(UUU,Zp,skew,Impl::multLinkAdd);
209 GENERIC_STENCIL_LEG_EXT(UUU,Tp,skew,Impl::multLinkAdd);
210 GENERIC_STENCIL_LEG_EXT(UUU,Xm,skew,Impl::multLinkAdd);
211 GENERIC_STENCIL_LEG_EXT(UUU,Ym,skew,Impl::multLinkAdd);
212 GENERIC_STENCIL_LEG_EXT(UUU,Zm,skew,Impl::multLinkAdd);
213 GENERIC_STENCIL_LEG_EXT(UUU,Tm,skew,Impl::multLinkAdd);
214 }
215 if ( nmu ) {
216 auto _out = coalescedRead(out[sF],lane);
217 if ( dag ) {
218 coalescedWrite(out[sF], _out-Uchi,lane);
219 } else {
220 coalescedWrite(out[sF], _out+Uchi,lane);
221 }
222 }
223 }
224};
225
227// Driving / wrapping routine to select right kernel
229template <class Impl>
230void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor * buf,
231 int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir,int disp)
232{
233 // Disp should be either +1,-1,+3,-3
234 // What about "dag" ?
235 // Because we work out pU . dS/dU
236 // U
237 assert(0);
238}
239
240#define KERNEL_CALLNB(A,improved) \
241 const uint64_t NN = Nsite*Ls; \
242 accelerator_forNB( ss, NN, Simd::Nsimd(), { \
243 int sF = ss; \
244 int sU = ss/Ls; \
245 ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
246 });
247
248#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();
249
250#define ASM_CALL(A) \
251 const uint64_t NN = Nsite*Ls; \
252 thread_for( ss, NN, { \
253 int sF = ss; \
254 int sU = ss/Ls; \
255 ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
256 });
257
258template <class Impl>
260 DoubledGaugeField &U, DoubledGaugeField &UUU,
261 const FermionField &in, FermionField &out, int dag, int interior,int exterior)
262{
263 GridBase *FGrid=in.Grid();
264 GridBase *UGrid=U.Grid();
265 typedef StaggeredKernels<Impl> ThisKernel;
266 const int Nsimd = SiteHalfSpinor::Nsimd();
267 const int lane=acceleratorSIMTlane(Nsimd);
268 autoView( UUU_v , UUU, AcceleratorRead);
269 autoView( U_v , U, AcceleratorRead);
270 autoView( in_v , in, AcceleratorRead);
271 autoView( out_v , out, AcceleratorWrite);
272 autoView( st_v , st, AcceleratorRead);
273 SiteSpinor * buf = st.CommBuf();
274
275 int Ls=1;
276 if(FGrid->Nd()==UGrid->Nd()+1){
277 Ls = FGrid->_rdimensions[0];
278 }
279 int Nsite = UGrid->oSites();
280
281 if( interior && exterior ) {
282 if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,1); return;}
283 if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,1); return;}
284#ifndef GRID_CUDA
285 if (Opt == OptInlineAsm ) { ASM_CALL(DhopSiteAsm); return;}
286#endif
287 } else if( interior ) {
288 if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,1); return;}
289 if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,1); return;}
290 } else if( exterior ) {
291 if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,1); return;}
292 if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,1); return;}
293 }
294 assert(0 && " Kernel optimisation case not covered ");
295}
296template <class Impl>
298 DoubledGaugeField &U,
299 const FermionField &in, FermionField &out, int dag, int interior,int exterior)
300{
301 GridBase *FGrid=in.Grid();
302 GridBase *UGrid=U.Grid();
303 typedef StaggeredKernels<Impl> ThisKernel;
304 const int Nsimd = SiteHalfSpinor::Nsimd();
305 const int lane=acceleratorSIMTlane(Nsimd);
306 autoView( UUU_v , U, AcceleratorRead);
307 autoView( U_v , U, AcceleratorRead);
308 autoView( in_v , in, AcceleratorRead);
309 autoView( out_v , out, AcceleratorWrite);
310 autoView( st_v , st, AcceleratorRead);
311 SiteSpinor * buf = st.CommBuf();
312
313 int Ls=1;
314 if(FGrid->Nd()==UGrid->Nd()+1){
315 Ls = FGrid->_rdimensions[0];
316 }
317 int Nsite = UGrid->oSites();
318
319 if( interior && exterior ) {
320 if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
321 if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0); return;}
322 } else if( interior ) {
323 if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
324 if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0); return;}
325 } else if( exterior ) {
326 if (Opt == OptGeneric ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
327 if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0); return;}
328 }
329}
330
331
332#undef KERNEL_CALLNB
333#undef KERNEL_CALL
334#undef ASM_CALL
335
337
338
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
#define autoView(l_v, l, mode)
@ AcceleratorRead
@ AcceleratorWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static constexpr int Xm
Definition QCD.h:45
static constexpr int Tm
Definition QCD.h:48
static constexpr int Tp
Definition QCD.h:44
static constexpr int Zp
Definition QCD.h:43
static constexpr int Zm
Definition QCD.h:47
static constexpr int Xp
Definition QCD.h:41
static constexpr int Yp
Definition QCD.h:42
static constexpr int Ym
Definition QCD.h:46
#define GENERIC_STENCIL_LEG_EXT(U, Dir, skew, multLink)
#define GENERIC_STENCIL_LEG_INT(U, Dir, skew, multLink)
#define ASM_CALL(A)
#define KERNEL_CALL(A, improved)
#define GENERIC_STENCIL_LEG(U, Dir, skew, multLink)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
static INTERNAL_PRECISION U
Definition Zolotarev.cc:230
int oSites(void) const
Coordinate _rdimensions
int Nd(void) const
void DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior, int exterior)
void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
StaggeredKernels(const ImplParams &p=ImplParams())
void DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior, int exterior)
static accelerator_inline void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
static accelerator_inline void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
FermionOperator< Impl > Base
static accelerator_inline void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition Simd.h:194