Grid 0.7.0
CayleyFermion5Dvec.h
Go to the documentation of this file.
1#if 0
2
3/*************************************************************************************
4
5 Grid physics library, www.github.com/paboyle/Grid
6
7 Source file: ./lib/qcd/action/fermion/CayleyFermion5D.cc
8
9 Copyright (C) 2015
10
11Author: Peter Boyle <pabobyle@ph.ed.ac.uk>
12Author: Peter Boyle <paboyle@ph.ed.ac.uk>
13Author: Peter Boyle <peterboyle@Peters-MacBook-Pro-2.local>
14Author: paboyle <paboyle@ph.ed.ac.uk>
15
16 This program is free software; you can redistribute it and/or modify
17 it under the terms of the GNU General Public License as published by
18 the Free Software Foundation; either version 2 of the License, or
19 (at your option) any later version.
20
21 This program is distributed in the hope that it will be useful,
22 but WITHOUT ANY WARRANTY; without even the implied warranty of
23 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
24 GNU General Public License for more details.
25
26 You should have received a copy of the GNU General Public License along
27 with this program; if not, write to the Free Software Foundation, Inc.,
28 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
29
30 See the full license in the file "LICENSE" in the top level distribution directory
31*************************************************************************************/
32/* END LEGAL */
33
34
37
39
40/*
41 * Dense matrix versions of routines
42 */
template<class Impl>
void
CayleyFermion5D<Impl>::MooeeInvDag(const FermionField &psi, FermionField &chi)
{
  // Apply (Moo)^{-dagger}: delegate to the dense fifth-dimension kernel,
  // selecting the precomputed dagger/inverse matrix pair.
  this->MooeeInternal(psi,chi,DaggerYes,InverseYes);
}
50
template<class Impl>
void
CayleyFermion5D<Impl>::MooeeInv(const FermionField &psi, FermionField &chi)
{
  // Apply (Moo)^{-1}: delegate to the dense fifth-dimension kernel,
  // selecting the precomputed non-dagger/inverse matrix pair.
  this->MooeeInternal(psi,chi,DaggerNo,InverseYes);
}
/*
 * M5D: apply the fifth-dimension tridiagonal Cayley operator
 *
 *   chi(s) = diag[s]*phi(s) + upper[s]*(P- psi)(s+1) + lower[s]*(P+ psi)(s-1)
 *
 * with periodic wrap in s (projectors as spelled out by the reference
 * path inside "#if 0" below: spProj5m on the s+1 neighbour, spProj5p on
 * the s-1 neighbour). This "vec" variant assumes the fifth dimension is
 * SIMD-vectorised: Ls sites are packed as LLs = Ls/nsimd outer sites
 * times nsimd SIMD lanes (asserted below), so neighbour access in s is a
 * lane rotation when it crosses a SIMD-word boundary.
 */
template<class Impl>
void
CayleyFermion5D<Impl>::M5D(const FermionField &psi_i,
                           const FermionField &phi_i,
                           FermionField &chi_i,
                           Vector<Coeff_t> &lower,
                           Vector<Coeff_t> &diag,
                           Vector<Coeff_t> &upper)
{
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
  autoView(psi, psi_i,CpuRead);
  autoView(phi, phi_i,CpuRead);
  autoView(chi, chi_i,CpuWrite);
  int Ls = this->Ls;
  int LLs = grid->_rdimensions[0];   // s-slices per SIMD word (s is dim 0)
  const int nsimd= Simd::Nsimd();

  // Per-outer-slice coefficients, one SIMD lane per packed s value.
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
  Vector<iSinglet<Simd> > d(LLs);

  assert(Ls/LLs==nsimd);
  assert(phi.Checkerboard() == psi.Checkerboard());

  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
  scalar_type * l_p = (scalar_type *)&l[0];
  scalar_type * d_p = (scalar_type *)&d[0];

  // Scatter the scalar coefficient arrays (indexed by global s = o+i*LLs)
  // into SIMD lane order (lane i of outer slice o).
  for(int o=0;o<LLs;o++){ // outer
    for(int i=0;i<nsimd;i++){ //inner
      int s  = o+i*LLs;
      int ss = o*nsimd+i;
      u_p[ss] = upper[s];
      l_p[ss] = lower[s];
      d_p[ss] = diag[s];
    }}

  // The unrolled path below hard-codes three colours (indices 0..2).
  assert(Nc==3);

  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
#if 0
    // Reference implementation, kept as documentation of the contract.
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;

    for(int v=0;v<LLs;v++){

      int vp=(v+1)%LLs;
      int vm=(v+LLs-1)%LLs;

      spProj5m(hp,psi[ss+vp]);
      spProj5p(hm,psi[ss+vm]);

      // Wrapping past the packed-dimension edge crosses SIMD lanes:
      // rotate by one lane forward / backward respectively.
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);

      hp=0.5*hp;
      hm=0.5*hm;

      spRecon5m(fp,hp);
      spRecon5p(fm,hm);

      chi[ss+v] = d[v]*phi[ss+v];
      chi[ss+v] = chi[ss+v] +u[v]*fp;
      chi[ss+v] = chi[ss+v] +l[v]*fm;

    }
#else
    // Optimised path: the projections are inlined by reading spinor
    // components 2,3 (s+1 neighbour) and 0,1 (s-1 neighbour) directly.
    // NOTE(review): the explicit 0.5 of the reference path does not
    // appear here — presumably folded into upper/lower by the caller
    // and/or the projector normalisation; confirm against callers.
    for(int v=0;v<LLs;v++){

      vprefetch(psi[ss+v+LLs]);

      int vp= (v==LLs-1) ? 0     : v+1;
      int vm= (v==0    ) ? LLs-1 : v-1;

      Simd hp_00 = psi[ss+vp]()(2)(0);
      Simd hp_01 = psi[ss+vp]()(2)(1);
      Simd hp_02 = psi[ss+vp]()(2)(2);
      Simd hp_10 = psi[ss+vp]()(3)(0);
      Simd hp_11 = psi[ss+vp]()(3)(1);
      Simd hp_12 = psi[ss+vp]()(3)(2);

      Simd hm_00 = psi[ss+vm]()(0)(0);
      Simd hm_01 = psi[ss+vm]()(0)(1);
      Simd hm_02 = psi[ss+vm]()(0)(2);
      Simd hm_10 = psi[ss+vm]()(1)(0);
      Simd hm_11 = psi[ss+vm]()(1)(1);
      Simd hm_12 = psi[ss+vm]()(1)(2);

      // Lane rotations mirror rotate(hp,hp,1) / rotate(hm,hm,nsimd-1)
      // above; tRotate counts scalar (real) slots, so one complex lane
      // is 2 slots — hence <2> and <2*Nsimd()-2>.
      if ( vp<=v ) {
	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
      }
      if ( vm>=v ) {
	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
      }

      // Can force these to real arithmetic and save 2x.
      // Upper two spin rows: diag*phi + lower*hm; lower two: diag*phi + upper*hp.
      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);
      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);

      // Non-temporal stores: results are not re-read in this kernel.
      vstream(chi[ss+v]()(0)(0),p_00);
      vstream(chi[ss+v]()(0)(1),p_01);
      vstream(chi[ss+v]()(0)(2),p_02);
      vstream(chi[ss+v]()(1)(0),p_10);
      vstream(chi[ss+v]()(1)(1),p_11);
      vstream(chi[ss+v]()(1)(2),p_12);
      vstream(chi[ss+v]()(2)(0),p_20);
      vstream(chi[ss+v]()(2)(1),p_21);
      vstream(chi[ss+v]()(2)(2),p_22);
      vstream(chi[ss+v]()(3)(0),p_30);
      vstream(chi[ss+v]()(3)(1),p_31);
      vstream(chi[ss+v]()(3)(2),p_32);

    }
#endif
  });
}
200
/*
 * M5Ddag: dagger of M5D. Structure is identical to M5D but the chiral
 * projections are swapped (reference path: spProj5p on the s+1
 * neighbour, spProj5m on the s-1 neighbour) and the upper/lower
 * coefficients multiply the opposite spin blocks. See M5D for the
 * SIMD fifth-dimension packing and the lane-rotation logic.
 */
template<class Impl>
void
CayleyFermion5D<Impl>::M5Ddag(const FermionField &psi_i,
                              const FermionField &phi_i,
                              FermionField &chi_i,
                              Vector<Coeff_t> &lower,
                              Vector<Coeff_t> &diag,
                              Vector<Coeff_t> &upper)
{
  chi_i.Checkerboard()=psi_i.Checkerboard();
  GridBase *grid=psi_i.Grid();
  autoView(psi,psi_i,CpuRead);
  autoView(phi,phi_i,CpuRead);
  autoView(chi,chi_i,CpuWrite);
  int Ls = this->Ls;
  int LLs = grid->_rdimensions[0];   // s-slices per SIMD word
  int nsimd= Simd::Nsimd();

  // Per-outer-slice coefficients, one SIMD lane per packed s value.
  Vector<iSinglet<Simd> > u(LLs);
  Vector<iSinglet<Simd> > l(LLs);
  Vector<iSinglet<Simd> > d(LLs);

  assert(Ls/LLs==nsimd);
  assert(phi.Checkerboard() == psi.Checkerboard());

  // just directly address via type pun
  typedef typename Simd::scalar_type scalar_type;
  scalar_type * u_p = (scalar_type *)&u[0];
  scalar_type * l_p = (scalar_type *)&l[0];
  scalar_type * d_p = (scalar_type *)&d[0];

  // Scatter scalar coefficients (global s = o+i*LLs) into lane order.
  for(int o=0;o<LLs;o++){ // outer
    for(int i=0;i<nsimd;i++){ //inner
      int s  = o+i*LLs;
      int ss = o*nsimd+i;
      u_p[ss] = upper[s];
      l_p[ss] = lower[s];
      d_p[ss] = diag[s];
    }}

  thread_loop( (int ss=0;ss<grid->oSites();ss+=LLs),{ // adds LLs
#if 0
    // Reference implementation, kept as documentation of the contract.
    alignas(64) SiteHalfSpinor hp;
    alignas(64) SiteHalfSpinor hm;
    alignas(64) SiteSpinor fp;
    alignas(64) SiteSpinor fm;

    for(int v=0;v<LLs;v++){

      int vp=(v+1)%LLs;
      int vm=(v+LLs-1)%LLs;

      spProj5p(hp,psi[ss+vp]);
      spProj5m(hm,psi[ss+vm]);

      // Wrap in the packed dimension crosses SIMD lanes: rotate by one
      // lane forward / backward respectively.
      if ( vp<=v ) rotate(hp,hp,1);
      if ( vm>=v ) rotate(hm,hm,nsimd-1);

      hp=hp*0.5;
      hm=hm*0.5;
      spRecon5p(fp,hp);
      spRecon5m(fm,hm);

      chi[ss+v] = d[v]*phi[ss+v]+u[v]*fp;
      chi[ss+v] = chi[ss+v]     +l[v]*fm;

    }
#else
    // Optimised path: projections inlined by reading spinor components
    // 0,1 (s+1 neighbour) and 2,3 (s-1 neighbour) directly — the mirror
    // of M5D. See NOTE(review) in M5D about the absent 0.5 factor.
    for(int v=0;v<LLs;v++){

      vprefetch(psi[ss+v+LLs]);

      int vp= (v==LLs-1) ? 0     : v+1;
      int vm= (v==0    ) ? LLs-1 : v-1;

      Simd hp_00 = psi[ss+vp]()(0)(0);
      Simd hp_01 = psi[ss+vp]()(0)(1);
      Simd hp_02 = psi[ss+vp]()(0)(2);
      Simd hp_10 = psi[ss+vp]()(1)(0);
      Simd hp_11 = psi[ss+vp]()(1)(1);
      Simd hp_12 = psi[ss+vp]()(1)(2);

      Simd hm_00 = psi[ss+vm]()(2)(0);
      Simd hm_01 = psi[ss+vm]()(2)(1);
      Simd hm_02 = psi[ss+vm]()(2)(2);
      Simd hm_10 = psi[ss+vm]()(3)(0);
      Simd hm_11 = psi[ss+vm]()(3)(1);
      Simd hm_12 = psi[ss+vm]()(3)(2);

      // tRotate counts scalar (real) slots: 2 slots = one complex lane.
      if ( vp<=v ) {
	hp_00.v = Optimization::Rotate::tRotate<2>(hp_00.v);
	hp_01.v = Optimization::Rotate::tRotate<2>(hp_01.v);
	hp_02.v = Optimization::Rotate::tRotate<2>(hp_02.v);
	hp_10.v = Optimization::Rotate::tRotate<2>(hp_10.v);
	hp_11.v = Optimization::Rotate::tRotate<2>(hp_11.v);
	hp_12.v = Optimization::Rotate::tRotate<2>(hp_12.v);
      }
      if ( vm>=v ) {
	hm_00.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_00.v);
	hm_01.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_01.v);
	hm_02.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_02.v);
	hm_10.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_10.v);
	hm_11.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_11.v);
	hm_12.v = Optimization::Rotate::tRotate<2*Simd::Nsimd()-2>(hm_12.v);
      }

      // Upper two spin rows: diag*phi + upper*hp; lower two: diag*phi + lower*hm.
      Simd p_00  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_00);
      Simd p_01  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_01);
      Simd p_02  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(0)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_02);
      Simd p_10  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(0))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_10);
      Simd p_11  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(1))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_11);
      Simd p_12  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(1)(2))  + switcheroo<Coeff_t>::mult(u[v]()()(),hp_12);

      Simd p_20  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_00);
      Simd p_21  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_01);
      Simd p_22  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(2)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_02);
      Simd p_30  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(0))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_10);
      Simd p_31  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(1))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_11);
      Simd p_32  = switcheroo<Coeff_t>::mult(d[v]()()(), phi[ss+v]()(3)(2))  + switcheroo<Coeff_t>::mult(l[v]()()(),hm_12);

      // Non-temporal stores: results are not re-read in this kernel.
      vstream(chi[ss+v]()(0)(0),p_00);
      vstream(chi[ss+v]()(0)(1),p_01);
      vstream(chi[ss+v]()(0)(2),p_02);
      vstream(chi[ss+v]()(1)(0),p_10);
      vstream(chi[ss+v]()(1)(1),p_11);
      vstream(chi[ss+v]()(1)(2),p_12);
      vstream(chi[ss+v]()(2)(0),p_20);
      vstream(chi[ss+v]()(2)(1),p_21);
      vstream(chi[ss+v]()(2)(2),p_22);
      vstream(chi[ss+v]()(3)(0),p_30);
      vstream(chi[ss+v]()(3)(1),p_31);
      vstream(chi[ss+v]()(3)(2),p_32);
    }
#endif
  });
}
338
339
340#ifdef AVX512
341#include <simd/Intel512common.h>
342#include <simd/Intel512avx.h>
343#include <simd/Intel512single.h>
344#endif
345
/*
 * MooeeInternalAsm: dense LLs*Nsimd x LLs*Nsimd (fifth-dimension) matrix
 * multiply for one 4d site, real-coefficient case. Matp multiplies the
 * upper two spin components (sp=0,1) of psi, Matm the lower two (sp=2,3).
 * A generic C++ path is used unless AVX512 is enabled, in which case a
 * hand-scheduled AVX-512 kernel runs instead.
 */
template<class Impl>
void
CayleyFermion5D<Impl>::MooeeInternalAsm(const FermionField &psi_i, FermionField &chi_i,
					int LLs, int site,
					Vector<iSinglet<Simd> > &Matp,
					Vector<iSinglet<Simd> > &Matm)
{
  autoView(psi , psi_i,CpuRead);
  autoView(chi , chi_i,CpuWrite);
#ifndef AVX512
  {
    SiteHalfSpinor BcastP;
    SiteHalfSpinor BcastM;
    SiteHalfSpinor SiteChiP;
    SiteHalfSpinor SiteChiM;

    // Ls*Ls * 2 * 12 * vol flops
    for(int s1=0;s1<LLs;s1++){
      // Accumulate row s1 of the dense matrix over all (s2, lane) sources.
      for(int s2=0;s2<LLs;s2++){
	for(int l=0; l<Simd::Nsimd();l++){ // simd lane

	  int s=s2+l*LLs;          // global fifth-dim source index
	  int lex=s2+LLs*site;     // lexicographic offset of source slice

	  // Zero the accumulators on the very first source term.
	  if ( s2==0 && l==0) {
	    SiteChiP=Zero();
	    SiteChiM=Zero();
	  }

	  // Broadcast lane l of the source slice across all lanes so a
	  // single SIMD madd updates every output lane at once.
	  for(int sp=0;sp<2;sp++){
	    for(int co=0;co<Nc;co++){
	      vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
	    }}
	  for(int sp=0;sp<2;sp++){
	    for(int co=0;co<Nc;co++){
	      vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
	    }}

	  for(int sp=0;sp<2;sp++){
	    for(int co=0;co<Nc;co++){
	      SiteChiP()(sp)(co)=real_madd(Matp[LLs*s+s1]()()(),BcastP()(sp)(co),SiteChiP()(sp)(co)); // 1100 us.
	      SiteChiM()(sp)(co)=real_madd(Matm[LLs*s+s1]()()(),BcastM()(sp)(co),SiteChiM()(sp)(co)); // each found by commenting out
	    }}

	}}
      {
	// Stream the completed row s1 to chi (non-temporal stores).
	int lex = s1+LLs*site;
	for(int sp=0;sp<2;sp++){
	  for(int co=0;co<Nc;co++){
	    vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
	    vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
	  }}
      }
    }

  }
#else
  {
    // AVX-512 kernel: the twelve chi colour/spin accumulators live in
    // zmm1-zmm12 across the inner loop; broadcasts use zmm13-zmm24.
    // NOTE(review): the asm statements name no clobbers/outputs for
    // these zmm registers — correctness relies on the compiler not
    // allocating them between statements; confirm against the Grid
    // Intel512* macro conventions.
    // pointers
    // MASK_REGS;
#define Chi_00 %%zmm1
#define Chi_01 %%zmm2
#define Chi_02 %%zmm3
#define Chi_10 %%zmm4
#define Chi_11 %%zmm5
#define Chi_12 %%zmm6
#define Chi_20 %%zmm7
#define Chi_21 %%zmm8
#define Chi_22 %%zmm9
#define Chi_30 %%zmm10
#define Chi_31 %%zmm11
#define Chi_32 %%zmm12

#define BCAST0  %%zmm13
#define BCAST1  %%zmm14
#define BCAST2  %%zmm15
#define BCAST3  %%zmm16
#define BCAST4  %%zmm17
#define BCAST5  %%zmm18
#define BCAST6  %%zmm19
#define BCAST7  %%zmm20
#define BCAST8  %%zmm21
#define BCAST9  %%zmm22
#define BCAST10 %%zmm23
#define BCAST11 %%zmm24

    int incr=LLs*LLs*sizeof(iSinglet<Simd>);
    for(int s1=0;s1<LLs;s1++){
      for(int s2=0;s2<LLs;s2++){
	int lex=s2+LLs*site;
	uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
	uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
	uint64_t a2 = (uint64_t)&psi[lex];
	for(int l=0; l<Simd::Nsimd();l++){ // simd lane
	  // First source term: plain multiply initialises the accumulators.
	  if ( (s2+l)==0 ) {
	    asm (
		 VPREFETCH1(0,%2)  VPREFETCH1(0,%1)
		 VPREFETCH1(12,%2) VPREFETCH1(13,%2)
		 VPREFETCH1(14,%2) VPREFETCH1(15,%2)
		 VBCASTCDUP(0,%2,BCAST0)
		 VBCASTCDUP(1,%2,BCAST1)
		 VBCASTCDUP(2,%2,BCAST2)
		 VBCASTCDUP(3,%2,BCAST3)
		 VBCASTCDUP(4,%2,BCAST4)   VMULMEM (0,%0,BCAST0,Chi_00)
		 VBCASTCDUP(5,%2,BCAST5)   VMULMEM (0,%0,BCAST1,Chi_01)
		 VBCASTCDUP(6,%2,BCAST6)   VMULMEM (0,%0,BCAST2,Chi_02)
		 VBCASTCDUP(7,%2,BCAST7)   VMULMEM (0,%0,BCAST3,Chi_10)
		 VBCASTCDUP(8,%2,BCAST8)   VMULMEM (0,%0,BCAST4,Chi_11)
		 VBCASTCDUP(9,%2,BCAST9)   VMULMEM (0,%0,BCAST5,Chi_12)
		 VBCASTCDUP(10,%2,BCAST10) VMULMEM (0,%1,BCAST6,Chi_20)
		 VBCASTCDUP(11,%2,BCAST11) VMULMEM (0,%1,BCAST7,Chi_21)
		 VMULMEM (0,%1,BCAST8,Chi_22)
		 VMULMEM (0,%1,BCAST9,Chi_30)
		 VMULMEM (0,%1,BCAST10,Chi_31)
		 VMULMEM (0,%1,BCAST11,Chi_32)
		 : : "r" (a0), "r" (a1), "r" (a2) );
	  } else {
	    // Subsequent terms: fused multiply-add into the accumulators.
	    asm (
		 VBCASTCDUP(0,%2,BCAST0)   VMADDMEM (0,%0,BCAST0,Chi_00)
		 VBCASTCDUP(1,%2,BCAST1)   VMADDMEM (0,%0,BCAST1,Chi_01)
		 VBCASTCDUP(2,%2,BCAST2)   VMADDMEM (0,%0,BCAST2,Chi_02)
		 VBCASTCDUP(3,%2,BCAST3)   VMADDMEM (0,%0,BCAST3,Chi_10)
		 VBCASTCDUP(4,%2,BCAST4)   VMADDMEM (0,%0,BCAST4,Chi_11)
		 VBCASTCDUP(5,%2,BCAST5)   VMADDMEM (0,%0,BCAST5,Chi_12)
		 VBCASTCDUP(6,%2,BCAST6)   VMADDMEM (0,%1,BCAST6,Chi_20)
		 VBCASTCDUP(7,%2,BCAST7)   VMADDMEM (0,%1,BCAST7,Chi_21)
		 VBCASTCDUP(8,%2,BCAST8)   VMADDMEM (0,%1,BCAST8,Chi_22)
		 VBCASTCDUP(9,%2,BCAST9)   VMADDMEM (0,%1,BCAST9,Chi_30)
		 VBCASTCDUP(10,%2,BCAST10) VMADDMEM (0,%1,BCAST10,Chi_31)
		 VBCASTCDUP(11,%2,BCAST11) VMADDMEM (0,%1,BCAST11,Chi_32)
		 : : "r" (a0), "r" (a1), "r" (a2) );
	  }
	  a0 = a0+incr;  // next matrix row element
	  a1 = a1+incr;
	  a2 = a2+sizeof(typename Simd::scalar_type);  // next source lane
	}}
      {
	// Stream the completed row s1 out of the zmm accumulators.
	int lexa = s1+LLs*site;
	asm (
	     VSTORE(0,%0,Chi_00) VSTORE(1 ,%0,Chi_01) VSTORE(2 ,%0,Chi_02)
	     VSTORE(3,%0,Chi_10) VSTORE(4 ,%0,Chi_11) VSTORE(5 ,%0,Chi_12)
	     VSTORE(6,%0,Chi_20) VSTORE(7 ,%0,Chi_21) VSTORE(8 ,%0,Chi_22)
	     VSTORE(9,%0,Chi_30) VSTORE(10,%0,Chi_31) VSTORE(11,%0,Chi_32)
	     : : "r" ((uint64_t)&chi[lexa]) : "memory" );

      }
    }
  }
#undef Chi_00
#undef Chi_01
#undef Chi_02
#undef Chi_10
#undef Chi_11
#undef Chi_12
#undef Chi_20
#undef Chi_21
#undef Chi_22
#undef Chi_30
#undef Chi_31
#undef Chi_32

#undef BCAST0
#undef BCAST1
#undef BCAST2
#undef BCAST3
#undef BCAST4
#undef BCAST5
#undef BCAST6
#undef BCAST7
#undef BCAST8
#undef BCAST9
#undef BCAST10
#undef BCAST11
#endif
};
522
523// Z-mobius version
524template<class Impl>
525void
526CayleyFermion5D<Impl>::MooeeInternalZAsm(const FermionField &psi_i, FermionField &chi_i,
527 int LLs, int site, Vector<iSinglet<Simd> > &Matp, Vector<iSinglet<Simd> > &Matm)
528{
530#ifndef AVX512
531 {
532 autoView(psi , psi_i,CpuRead);
533 autoView(chi , chi_i,CpuWrite);
534
535 SiteHalfSpinor BcastP;
536 SiteHalfSpinor BcastM;
537 SiteHalfSpinor SiteChiP;
538 SiteHalfSpinor SiteChiM;
539
540 // Ls*Ls * 2 * 12 * vol flops
541 for(int s1=0;s1<LLs;s1++){
542 for(int s2=0;s2<LLs;s2++){
543 for(int l=0; l<Simd::Nsimd();l++){ // simd lane
544
545 int s=s2+l*LLs;
546 int lex=s2+LLs*site;
547
548 if ( s2==0 && l==0) {
549 SiteChiP=Zero();
550 SiteChiM=Zero();
551 }
552
553 for(int sp=0;sp<2;sp++){
554 for(int co=0;co<Nc;co++){
555 vbroadcast(BcastP()(sp )(co),psi[lex]()(sp)(co),l);
556 }}
557 for(int sp=0;sp<2;sp++){
558 for(int co=0;co<Nc;co++){
559 vbroadcast(BcastM()(sp )(co),psi[lex]()(sp+2)(co),l);
560 }}
561
562 for(int sp=0;sp<2;sp++){
563 for(int co=0;co<Nc;co++){
564 SiteChiP()(sp)(co)=SiteChiP()(sp)(co)+ Matp[LLs*s+s1]()()()*BcastP()(sp)(co);
565 SiteChiM()(sp)(co)=SiteChiM()(sp)(co)+ Matm[LLs*s+s1]()()()*BcastM()(sp)(co);
566 }}
567
568
569 }}
570 {
571 int lex = s1+LLs*site;
572 for(int sp=0;sp<2;sp++){
573 for(int co=0;co<Nc;co++){
574 vstream(chi[lex]()(sp)(co), SiteChiP()(sp)(co));
575 vstream(chi[lex]()(sp+2)(co), SiteChiM()(sp)(co));
576 }}
577 }
578 }
579
580 }
581#else
582 {
583 autoView(psi , psi_i,CpuRead);
584 autoView(chi , chi_i,CpuWrite);
585 // pointers
586 // MASK_REGS;
587#define Chi_00 %zmm0
588#define Chi_01 %zmm1
589#define Chi_02 %zmm2
590#define Chi_10 %zmm3
591#define Chi_11 %zmm4
592#define Chi_12 %zmm5
593#define Chi_20 %zmm6
594#define Chi_21 %zmm7
595#define Chi_22 %zmm8
596#define Chi_30 %zmm9
597#define Chi_31 %zmm10
598#define Chi_32 %zmm11
599#define pChi_00 %%zmm0
600#define pChi_01 %%zmm1
601#define pChi_02 %%zmm2
602#define pChi_10 %%zmm3
603#define pChi_11 %%zmm4
604#define pChi_12 %%zmm5
605#define pChi_20 %%zmm6
606#define pChi_21 %%zmm7
607#define pChi_22 %%zmm8
608#define pChi_30 %%zmm9
609#define pChi_31 %%zmm10
610#define pChi_32 %%zmm11
611
612#define BCAST_00 %zmm12
613#define SHUF_00 %zmm13
614#define BCAST_01 %zmm14
615#define SHUF_01 %zmm15
616#define BCAST_02 %zmm16
617#define SHUF_02 %zmm17
618#define BCAST_10 %zmm18
619#define SHUF_10 %zmm19
620#define BCAST_11 %zmm20
621#define SHUF_11 %zmm21
622#define BCAST_12 %zmm22
623#define SHUF_12 %zmm23
624
625#define Mp %zmm24
626#define Mps %zmm25
627#define Mm %zmm26
628#define Mms %zmm27
629#define N 8
630 int incr=LLs*LLs*sizeof(iSinglet<Simd>);
631 for(int s1=0;s1<LLs;s1++){
632 for(int s2=0;s2<LLs;s2++){
633 int lex=s2+LLs*site;
634 uint64_t a0 = (uint64_t)&Matp[LLs*s2+s1]; // should be cacheable
635 uint64_t a1 = (uint64_t)&Matm[LLs*s2+s1];
636 uint64_t a2 = (uint64_t)&psi[lex];
637 for(int l=0; l<Simd::Nsimd();l++){ // simd lane
638 if ( (s2+l)==0 ) {
639 LOAD64(%r8,a0);
640 LOAD64(%r9,a1);
641 LOAD64(%r10,a2);
642 asm (
643 VLOAD(0,%r8,Mp)// i r
644 VLOAD(0,%r9,Mm)
645 VSHUF(Mp,Mps) // r i
646 VSHUF(Mm,Mms)
647 VPREFETCH1(12,%r10) VPREFETCH1(13,%r10)
648 VPREFETCH1(14,%r10) VPREFETCH1(15,%r10)
649
650 VMULIDUP(0*N,%r10,Mps,Chi_00)
651 VMULIDUP(1*N,%r10,Mps,Chi_01)
652 VMULIDUP(2*N,%r10,Mps,Chi_02)
653 VMULIDUP(3*N,%r10,Mps,Chi_10)
654 VMULIDUP(4*N,%r10,Mps,Chi_11)
655 VMULIDUP(5*N,%r10,Mps,Chi_12)
656
657 VMULIDUP(6*N ,%r10,Mms,Chi_20)
658 VMULIDUP(7*N ,%r10,Mms,Chi_21)
659 VMULIDUP(8*N ,%r10,Mms,Chi_22)
660 VMULIDUP(9*N ,%r10,Mms,Chi_30)
661 VMULIDUP(10*N,%r10,Mms,Chi_31)
662 VMULIDUP(11*N,%r10,Mms,Chi_32)
663
664 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00)
665 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01)
666 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
667 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
668 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
669 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
670
671 VMADDSUBRDUP(6*N ,%r10,Mm,Chi_20)
672 VMADDSUBRDUP(7*N ,%r10,Mm,Chi_21)
673 VMADDSUBRDUP(8*N ,%r10,Mm,Chi_22)
674 VMADDSUBRDUP(9*N ,%r10,Mm,Chi_30)
675 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
676 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
677 );
678 } else {
679 LOAD64(%r8,a0);
680 LOAD64(%r9,a1);
681 LOAD64(%r10,a2);
682 asm (
683 VLOAD(0,%r8,Mp)
684 VSHUF(Mp,Mps)
685
686 VLOAD(0,%r9,Mm)
687 VSHUF(Mm,Mms)
688
689 VMADDSUBIDUP(0*N,%r10,Mps,Chi_00) // Mri * Pii +- Cir
690 VMADDSUBIDUP(1*N,%r10,Mps,Chi_01)
691 VMADDSUBIDUP(2*N,%r10,Mps,Chi_02)
692 VMADDSUBIDUP(3*N,%r10,Mps,Chi_10)
693 VMADDSUBIDUP(4*N,%r10,Mps,Chi_11)
694 VMADDSUBIDUP(5*N,%r10,Mps,Chi_12)
695
696 VMADDSUBIDUP(6 *N,%r10,Mms,Chi_20)
697 VMADDSUBIDUP(7 *N,%r10,Mms,Chi_21)
698 VMADDSUBIDUP(8 *N,%r10,Mms,Chi_22)
699 VMADDSUBIDUP(9 *N,%r10,Mms,Chi_30)
700 VMADDSUBIDUP(10*N,%r10,Mms,Chi_31)
701 VMADDSUBIDUP(11*N,%r10,Mms,Chi_32)
702
703 VMADDSUBRDUP(0*N,%r10,Mp,Chi_00) // Cir = Mir * Prr +- ( Mri * Pii +- Cir)
704 VMADDSUBRDUP(1*N,%r10,Mp,Chi_01) // Ci = MiPr + Ci + MrPi ; Cr = MrPr - ( MiPi - Cr)
705 VMADDSUBRDUP(2*N,%r10,Mp,Chi_02)
706 VMADDSUBRDUP(3*N,%r10,Mp,Chi_10)
707 VMADDSUBRDUP(4*N,%r10,Mp,Chi_11)
708 VMADDSUBRDUP(5*N,%r10,Mp,Chi_12)
709
710 VMADDSUBRDUP(6 *N,%r10,Mm,Chi_20)
711 VMADDSUBRDUP(7 *N,%r10,Mm,Chi_21)
712 VMADDSUBRDUP(8 *N,%r10,Mm,Chi_22)
713 VMADDSUBRDUP(9 *N,%r10,Mm,Chi_30)
714 VMADDSUBRDUP(10*N,%r10,Mm,Chi_31)
715 VMADDSUBRDUP(11*N,%r10,Mm,Chi_32)
716 );
717 }
718 a0 = a0+incr;
719 a1 = a1+incr;
720 a2 = a2+sizeof(typename Simd::scalar_type);
721 }}
722 {
723 int lexa = s1+LLs*site;
724 /*
725 SiteSpinor tmp;
726 asm (
727 VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
728 VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
729 VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
730 VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
731 : : "r" ((uint64_t)&tmp) : "memory" );
732 */
733
734 asm (
735 VSTORE(0,%0,pChi_00) VSTORE(1 ,%0,pChi_01) VSTORE(2 ,%0,pChi_02)
736 VSTORE(3,%0,pChi_10) VSTORE(4 ,%0,pChi_11) VSTORE(5 ,%0,pChi_12)
737 VSTORE(6,%0,pChi_20) VSTORE(7 ,%0,pChi_21) VSTORE(8 ,%0,pChi_22)
738 VSTORE(9,%0,pChi_30) VSTORE(10,%0,pChi_31) VSTORE(11,%0,pChi_32)
739 : : "r" ((uint64_t)&chi[lexa]) : "memory" );
740
741 // if ( 1 || (site==0) ) {
742 // std::cout<<site << " s1 "<<s1<<"\n\t"<<tmp << "\n't" << chi[lexa] <<"\n\t"<<tmp-chi[lexa]<<std::endl;
743 // }
744 }
745 }
746 }
747#undef Chi_00
748#undef Chi_01
749#undef Chi_02
750#undef Chi_10
751#undef Chi_11
752#undef Chi_12
753#undef Chi_20
754#undef Chi_21
755#undef Chi_22
756#undef Chi_30
757#undef Chi_31
758#undef Chi_32
759
760#undef BCAST0
761#undef BCAST1
762#undef BCAST2
763#undef BCAST3
764#undef BCAST4
765#undef BCAST5
766#undef BCAST6
767#undef BCAST7
768#undef BCAST8
769#undef BCAST9
770#undef BCAST10
771#undef BCAST11
772
773#endif
774};
775
776
777template<class Impl>
778void
779CayleyFermion5D<Impl>::MooeeInternal(const FermionField &psi, FermionField &chi,int dag, int inv)
780{
782 chi.Checkerboard()=psi.Checkerboard();
783
784 int Ls=this->Ls;
785 int LLs = psi.Grid()->_rdimensions[0];
786 int vol = psi.Grid()->oSites()/LLs;
787
788
791 Vector<iSinglet<Simd> > *_Matp;
792 Vector<iSinglet<Simd> > *_Matm;
793
794 // MooeeInternalCompute(dag,inv,Matp,Matm);
795 if ( inv && dag ) {
796 _Matp = &MatpInvDag;
797 _Matm = &MatmInvDag;
798 }
799 if ( inv && (!dag) ) {
800 _Matp = &MatpInv;
801 _Matm = &MatmInv;
802 }
803 if ( !inv ) {
804 MooeeInternalCompute(dag,inv,Matp,Matm);
805 _Matp = &Matp;
806 _Matm = &Matm;
807 }
808 assert(_Matp->size()==Ls*LLs);
809
810 if ( switcheroo<Coeff_t>::iscomplex() ) {
811 thread_loop( (auto site=0;site<vol;site++),{
812 MooeeInternalZAsm(psi,chi,LLs,site,*_Matp,*_Matm);
813 });
814 } else {
815 thread_loop( (auto site=0;site<vol;site++),{
816 MooeeInternalAsm(psi,chi,LLs,site,*_Matp,*_Matm);
817 });
818 }
819
820}
821
823
824#endif
std::vector< T, uvmAllocator< T > > Vector
#define LOAD64(A, ptr)
Definition BGQQPX.h:43
accelerator_inline void vstream(Grid_simd2< S, V > &out, const Grid_simd2< S, V > &in)
accelerator_inline Grid_simd2< S, V > real_madd(Grid_simd2< S, V > a, Grid_simd2< S, V > b, Grid_simd2< S, V > c)
accelerator_inline void vbroadcast(Grid_simd2< S, V > &ret, const Grid_simd2< S, V > &src, int lane)
accelerator_inline Grid_simd< S, V > rotate(Grid_simd< S, V > b, int nrot)
Invoke< std::enable_if< Condition::value, ReturnType > > EnableIf
#define VLOAD(A, B, C)
#define VSTORE(A, B, C)
#define VBCASTCDUP(OFF, A, DEST)
#define VPREFETCH1(O, A)
#define VMULIDUP(O, P, B, accum)
#define VMADDSUBIDUP(O, P, B, accum)
#define VMULMEM(O, P, B, accum)
#define VSHUF(A, B)
#define VMADDSUBRDUP(O, P, B, accum)
#define VMADDMEM(O, P, B, accum)
#define autoView(l_v, l, mode)
@ CpuRead
@ CpuWrite
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static constexpr int DaggerYes
Definition QCD.h:70
static constexpr int Nc
Definition QCD.h:50
iScalar< iScalar< iScalar< vtype > > > iSinglet
Definition QCD.h:102
static constexpr int DaggerNo
Definition QCD.h:69
static constexpr int InverseYes
Definition QCD.h:72
#define Chi_11
#define pChi_31
#define pChi_20
#define pChi_12
#define pChi_01
#define pChi_21
#define Chi_30
#define pChi_30
#define pChi_11
#define pChi_22
#define Chi_01
#define Chi_22
#define pChi_02
#define Chi_20
#define Chi_02
#define pChi_10
#define Chi_31
#define Chi_10
#define pChi_00
#define Chi_32
#define Chi_12
#define Chi_00
#define Chi_21
#define pChi_32
accelerator_inline void vprefetch(const iScalar< v > &vv)
accelerator_inline void spRecon5m(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
Definition TwoSpinor.h:337
accelerator_inline void spProj5m(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
Definition TwoSpinor.h:146
accelerator_inline void spProj5p(iVector< vtype, Nhs > &hspin, const iVector< vtype, Ns > &fspin)
Definition TwoSpinor.h:140
accelerator_inline void spRecon5p(iVector< vtype, Ns > &fspin, const iVector< vtype, Nhs > &hspin)
Definition TwoSpinor.h:330
virtual void MooeeInvDag(const FermionField &in, FermionField &out)
virtual void M5Ddag(const FermionField &psi, FermionField &chi)
virtual void M5D(const FermionField &psi, FermionField &chi)
virtual void MooeeInv(const FermionField &in, FermionField &out)
int oSites(void) const
Coordinate _rdimensions
Definition Simd.h:194