Grid/dev/StaggeredKernelsImplementation_8h_source.html

/*************************************************************************************


Grid physics library, www.github.com/paboyle/Grid


Source file: ./lib/qcd/action/fermion/StaggeredKernelsImplementation.h


Copyright (C) 2015


Author: Azusa Yamaguchi, Peter Boyle


This program is free software; you can redistribute it and/or modify

it under the terms of the GNU General Public License as published by

the Free Software Foundation; either version 2 of the License, or

(at your option) any later version.


This program is distributed in the hope that it will be useful,

but WITHOUT ANY WARRANTY; without even the implied warranty of

MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the

GNU General Public License for more details.


You should have received a copy of the GNU General Public License along

with this program; if not, write to the Free Software Foundation, Inc.,

51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.


See the full license in the file "LICENSE" in the top level distribution

directory

*************************************************************************************/

/*  END LEGAL */

#include <Grid/qcd/action/fermion/FermionCore.h>


#pragma once


NAMESPACE_BEGIN(Grid);


// GENERIC_STENCIL_LEG(U,Dir,skew,multLink)
//
// Applies one stencil leg unconditionally.
//   U        : indexable gauge-link view; U[sU] is passed to multLink
//   Dir      : direction index, also forwarded to multLink
//   skew     : offset added to Dir to form the stencil point given to GetEntry
//   multLink : callable invoked as multLink(Uchi, U[sU], chi, Dir)
//
// Expects these names in the expanding scope:
//   st, SE, ptype, sF, sU, in, buf, chi, Uchi, lane.
//
// The neighbour is read from `in` (with permute) when the stencil entry is
// marked local, otherwise from `buf`; acceleratorSynchronise() is called
// before the link multiply.
//
// The body must be a single run of backslash-continued lines: a blank line
// after a trailing backslash ends the macro definition.
// The expansion is several statements (not wrapped in do/while), so it must
// not be used as the unbraced body of an if/else/for.
#define GENERIC_STENCIL_LEG(U,Dir,skew,multLink)                  \
  SE = st.GetEntry(ptype, Dir+skew, sF);                          \
  if (SE->_is_local ) {                                           \
    int perm= SE->_permute;                                       \
    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);  \
  } else {                                                        \
    chi = coalescedRead(buf[SE->_offset],lane);                   \
  }                                                               \
  acceleratorSynchronise();                                       \
  multLink(Uchi, U[sU], chi, Dir);


// GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)
//
// Interior variant of a stencil leg: the link multiply is applied only when
// the stencil entry is local or st.same_node[Dir] is set; otherwise the leg
// is skipped and Uchi is left untouched.
//   - local entry       : chi is read from `in` with permute
//   - same_node, !local : chi is read from `buf`
//
// Expects these names in the expanding scope:
//   st, SE, ptype, sF, sU, in, buf, chi, Uchi, lane.
//
// NOTE(review): st.same_node is indexed with Dir, while GetEntry is given
// Dir+skew; confirm this is intended for the skewed (UUU) legs.
//
// The body must be a single run of backslash-continued lines: a blank line
// after a trailing backslash ends the macro definition.
#define GENERIC_STENCIL_LEG_INT(U,Dir,skew,multLink)              \
  SE = st.GetEntry(ptype, Dir+skew, sF);                          \
  if (SE->_is_local ) {                                           \
    int perm= SE->_permute;                                       \
    chi = coalescedReadPermute(in[SE->_offset],ptype,perm,lane);  \
  } else if ( st.same_node[Dir] ) {                               \
    chi = coalescedRead(buf[SE->_offset],lane);                   \
  }                                                               \
  if (SE->_is_local || st.same_node[Dir] ) {                      \
    multLink(Uchi, U[sU], chi, Dir);                              \
  }


// GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)
//
// Exterior variant of a stencil leg: acts only when the stencil entry is
// neither local nor flagged in st.same_node[Dir]. In that case it increments
// `nmu`, reads chi from `buf` and applies multLink(Uchi, U[sU], chi, Dir).
//
// Expects these names in the expanding scope:
//   st, SE, ptype, sF, sU, buf, chi, Uchi, lane, nmu.
//
// The body must be a single run of backslash-continued lines: a blank line
// after a trailing backslash ends the macro definition.
#define GENERIC_STENCIL_LEG_EXT(U,Dir,skew,multLink)              \
  SE = st.GetEntry(ptype, Dir+skew, sF);                          \
  if ((!SE->_is_local) && (!st.same_node[Dir]) ) {                \
    nmu++;                                                        \
    chi = coalescedRead(buf[SE->_offset],lane);                   \
    multLink(Uchi, U[sU], chi, Dir);                              \
  }


// Constructor: forwards the implementation parameters to the Base sub-object
// and does nothing else.
template <class Impl>
StaggeredKernels<Impl>::StaggeredKernels(const ImplParams &p)
  : Base(p)
{
}


// Generic implementation; move to different file?
// Int, Ext, Int+Ext cases for comms overlap
//
// DhopSiteGeneric<Naik>: full (interior + exterior) single-site kernel.
//
// Applies GENERIC_STENCIL_LEG for the eight directions Xp..Tm against U.
// The first leg uses Impl::multLink, which is what gives Uchi its initial
// value; every later leg accumulates with Impl::multLinkAdd. When Naik is
// non-zero the eight directions are applied again against UUU, using stencil
// points offset by 8. The result is negated when `dag` is non-zero and
// written to out[sF].
//
//   st      : stencil view queried via GetEntry
//   U, UUU  : gauge-link views, indexed with sU
//   buf     : buffer read for non-local stencil entries
//   sF, sU  : fermion-site / gauge-site indices
//   in, out : input and output fermion views
//   dag     : non-zero negates the accumulated result
template <class Impl>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGeneric(StencilView &st,
                                             DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                             SiteSpinor *buf, int sF, int sU,
                                             const FermionFieldView &in, FermionFieldView &out, int dag)
{
  using calcSpinor = decltype(coalescedRead(in[0]));

  // chi, Uchi, SE, ptype and lane are referenced by name inside
  // GENERIC_STENCIL_LEG; they must keep exactly these names.
  calcSpinor chi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane  = acceleratorSIMTlane(Nsimd);

  // Stencil point = direction + offset: 0 for the U legs, 8 for the UUU legs.
  constexpr int link_offset = 0;
  constexpr int naik_offset = 8;

  GENERIC_STENCIL_LEG(U, Xp, link_offset, Impl::multLink);
  GENERIC_STENCIL_LEG(U, Yp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG(U, Zp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG(U, Tp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG(U, Xm, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG(U, Ym, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG(U, Zm, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG(U, Tm, link_offset, Impl::multLinkAdd);

  if ( Naik ) {
    GENERIC_STENCIL_LEG(UUU, Xp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Yp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Zp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Tp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Xm, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Ym, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Zm, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG(UUU, Tm, naik_offset, Impl::multLinkAdd);
  }

  if ( dag ) {
    Uchi = - Uchi;
  }
  coalescedWrite(out[sF], Uchi, lane);
}


  // Only contributions from interior of our node

// DhopSiteGenericInt<Naik>: interior-only single-site kernel.
//
// Starts from Uchi = Zero() and applies GENERIC_STENCIL_LEG_INT (always with
// Impl::multLinkAdd) for the eight directions Xp..Tm against U. That macro
// contributes a leg only when its stencil entry is local or
// st.same_node[Dir] is set. When Naik is non-zero the eight directions are
// applied again against UUU with stencil points offset by 8. The sum is
// negated when `dag` is non-zero and then written (overwriting) to out[sF].
//
// Parameters have the same meaning as in DhopSiteGeneric.
template <class Impl>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericInt(StencilView &st,
                                                DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                                SiteSpinor *buf, int sF, int sU,
                                                const FermionFieldView &in, FermionFieldView &out, int dag)
{
  using calcSpinor = decltype(coalescedRead(in[0]));

  // chi, Uchi, SE, ptype and lane are referenced by name inside
  // GENERIC_STENCIL_LEG_INT; they must keep exactly these names.
  calcSpinor chi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane  = acceleratorSIMTlane(Nsimd);

  // Stencil point = direction + offset: 0 for the U legs, 8 for the UUU legs.
  constexpr int link_offset = 0;
  constexpr int naik_offset = 8;

  // Legs may be skipped, so the accumulator has to start from zero.
  Uchi = Zero();

  GENERIC_STENCIL_LEG_INT(U, Xp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Yp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Zp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Tp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Xm, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Ym, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Zm, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_INT(U, Tm, link_offset, Impl::multLinkAdd);

  if ( Naik ) {
    GENERIC_STENCIL_LEG_INT(UUU, Xp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Yp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Zp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Tp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Xm, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Ym, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Zm, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_INT(UUU, Tm, naik_offset, Impl::multLinkAdd);
  }

  if ( dag ) {
    Uchi = - Uchi;
  }
  coalescedWrite(out[sF], Uchi, lane);
}


  // Only contributions from exterior of our node

// DhopSiteGenericExt<Naik>: exterior-only single-site kernel.
//
// Starts from Uchi = Zero() and applies GENERIC_STENCIL_LEG_EXT (always with
// Impl::multLinkAdd) for the eight directions Xp..Tm against U. That macro
// contributes a leg, and increments nmu, only when the stencil entry is
// neither local nor flagged in st.same_node[Dir]. When Naik is non-zero the
// eight directions are applied again against UUU with stencil points offset
// by 8.
//
// Unlike the other two site kernels this one updates out[sF] in place: if at
// least one leg contributed, the current out[sF] is read back and Uchi is
// added to it (subtracted when `dag` is non-zero). If no leg contributed,
// out[sF] is not touched.
//
// Parameters have the same meaning as in DhopSiteGeneric.
template <class Impl>
template <int Naik> accelerator_inline
void StaggeredKernels<Impl>::DhopSiteGenericExt(StencilView &st,
                                                DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                                SiteSpinor *buf, int sF, int sU,
                                                const FermionFieldView &in, FermionFieldView &out, int dag)
{
  using calcSpinor = decltype(coalescedRead(in[0]));

  // chi, Uchi, SE, ptype, nmu and lane are referenced by name inside
  // GENERIC_STENCIL_LEG_EXT; they must keep exactly these names.
  calcSpinor chi;
  calcSpinor Uchi;
  StencilEntry *SE;
  int ptype;
  int nmu = 0;   // number of legs that actually contributed
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane  = acceleratorSIMTlane(Nsimd);

  // Stencil point = direction + offset: 0 for the U legs, 8 for the UUU legs.
  constexpr int link_offset = 0;
  constexpr int naik_offset = 8;

  Uchi = Zero();

  GENERIC_STENCIL_LEG_EXT(U, Xp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Yp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Zp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Tp, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Xm, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Ym, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Zm, link_offset, Impl::multLinkAdd);
  GENERIC_STENCIL_LEG_EXT(U, Tm, link_offset, Impl::multLinkAdd);

  if ( Naik ) {
    GENERIC_STENCIL_LEG_EXT(UUU, Xp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Yp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Zp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Tp, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Xm, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Ym, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Zm, naik_offset, Impl::multLinkAdd);
    GENERIC_STENCIL_LEG_EXT(UUU, Tm, naik_offset, Impl::multLinkAdd);
  }

  // Nothing came from the exterior: leave out[sF] exactly as it is.
  if ( !nmu ) {
    return;
  }

  auto current = coalescedRead(out[sF],lane);
  if ( dag ) {
    coalescedWrite(out[sF], current-Uchi,lane);
  } else {
    coalescedWrite(out[sF], current+Uchi,lane);
  }
}


// Driving / wrapping routine to select right kernel
//
// DhopDirKernel: not implemented. The body is a bare assert(0), so a build
// with assertions enabled aborts on any call; with NDEBUG defined the call
// returns without touching `out`. None of the parameters are used.
template <class Impl>
void StaggeredKernels<Impl>::DhopDirKernel(StencilImpl &st,
                                           DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU,
                                           SiteSpinor * buf,
                                           int sF, int sU,
                                           const FermionFieldView &in, FermionFieldView &out,
                                           int dir,int disp)
{
  // Design notes kept from the original author:
  //   Disp should be either +1,-1,+3,-3
  //   What about "dag" ?
  //   Because we work out pU . dS/dU
  //   U
  assert(0);
}


// Kernel launch helpers used by DhopImproved / DhopNaive.
//
// All three expect these names in the expanding scope:
//   Nsite, Ls, ThisKernel, st_v, U_v, UUU_v, buf, in_v, out_v, dag.
// Each expansion declares a local `NN`, so it must sit in its own block.
//
// The site count is formed in 64 bits: both operands are widened before the
// multiply, so Nsite*Ls is not evaluated (and cannot overflow) in int.
// sF and sU are still int, as the site kernels take them, so the index range
// itself remains limited to int.
//
// Every macro body must be a single run of backslash-continued lines: a blank
// line after a trailing backslash ends the macro definition.

// KERNEL_CALLNB(A,improved): launches ThisKernel::A<improved> over NN sites
// through accelerator_forNB. No accelerator_barrier() is issued here.
#define KERNEL_CALLNB(A,improved)                                                 \
  const uint64_t    NN = static_cast<uint64_t>(Nsite)*static_cast<uint64_t>(Ls);  \
  accelerator_forNB( ss, NN, Simd::Nsimd(), {                                     \
      int sF = ss;                                                                \
      int sU = ss/Ls;                                                             \
      ThisKernel:: template A<improved>(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag); \
    });

// KERNEL_CALL(A,improved): KERNEL_CALLNB followed by accelerator_barrier().
#define KERNEL_CALL(A,improved) KERNEL_CALLNB(A,improved); accelerator_barrier();

// ASM_CALL(A): runs the non-template kernel ThisKernel::A over NN sites
// through thread_for.
#define ASM_CALL(A)                                                               \
  const uint64_t    NN = static_cast<uint64_t>(Nsite)*static_cast<uint64_t>(Ls);  \
  thread_for( ss, NN, {                                                           \
      int sF = ss;                                                                \
      int sU = ss/Ls;                                                             \
      ThisKernel::A(st_v,U_v,UUU_v,buf,sF,sU,in_v,out_v,dag);                     \
  });


// DhopImproved: host-side driver that launches the site kernels with the
// template argument <1>, over every (gauge site, Ls) pair.
//
//   st               : stencil; supplies the view st_v and the comms buffer
//   U, UUU           : the two gauge fields passed to the site kernels
//   in, out          : input / output fermion fields
//   dag              : forwarded unchanged to the site kernel
//   interior/exterior: select the full kernel (both set), the *Int kernel
//                      (interior only) or the *Ext kernel (exterior only)
//
// The kernel flavour is chosen by the static `Opt`. OptInlineAsm is offered
// only for the full kernel and only when GRID_CUDA is not defined. Any
// combination that matches no kernel falls through to the final assert.
template <class Impl>
void StaggeredKernels<Impl>::DhopImproved(StencilImpl &st,
                                          DoubledGaugeField &U, DoubledGaugeField &UUU,
                                          const FermionField &in, FermionField &out,
                                          int dag, int interior,int exterior)
{
  // ThisKernel and the *_v / buf / Ls / Nsite names below are consumed by the
  // KERNEL_CALL / ASM_CALL macros and must keep exactly these names.
  using ThisKernel = StaggeredKernels<Impl>;

  GridBase *FGrid=in.Grid();
  GridBase *UGrid=U.Grid();

  // NOTE(review): Nsimd and lane are not referenced by the code visible in
  // this function; confirm the launch macros do not need them before removing.
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);

  autoView( UUU_v , UUU, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
  autoView( out_v , out, AcceleratorWrite);
  autoView( st_v  ,  st, AcceleratorRead);
  SiteSpinor * buf = st.CommBuf();

  // When the fermion grid has one more dimension than the gauge grid, its
  // leading reduced dimension is taken as Ls; otherwise Ls is 1.
  int Ls = ( FGrid->Nd()==UGrid->Nd()+1 ) ? FGrid->_rdimensions[0] : 1;
  int Nsite = UGrid->oSites();

  if( interior && exterior ) {
    switch (Opt) {
    case OptGeneric:    { KERNEL_CALL(DhopSiteGeneric,1); return; }
    case OptHandUnroll: { KERNEL_CALL(DhopSiteHand,1);    return; }
#ifndef GRID_CUDA
    case OptInlineAsm:  { ASM_CALL(DhopSiteAsm);          return; }
#endif
    default: break;
    }
  } else if( interior ) {
    switch (Opt) {
    case OptGeneric:    { KERNEL_CALL(DhopSiteGenericInt,1); return; }
    case OptHandUnroll: { KERNEL_CALL(DhopSiteHandInt,1);    return; }
    default: break;
    }
  } else if( exterior ) {
    switch (Opt) {
    case OptGeneric:    { KERNEL_CALL(DhopSiteGenericExt,1); return; }
    case OptHandUnroll: { KERNEL_CALL(DhopSiteHandExt,1);    return; }
    default: break;
    }
  }

  assert(0 && " Kernel optimisation case not covered ");
}


// DhopNaive: host-side driver that launches the site kernels with the
// template argument <0>, over every (gauge site, Ls) pair.
//
//   st               : stencil; supplies the view st_v and the comms buffer
//   U                : the gauge field passed to the site kernels
//   in, out          : input / output fermion fields
//   dag              : forwarded unchanged to the site kernel
//   interior/exterior: select the full kernel (both set), the *Int kernel
//                      (interior only) or the *Ext kernel (exterior only)
//
// The kernel flavour is chosen by the static `Opt`; only OptGeneric and
// OptHandUnroll are handled here. As in DhopImproved, a combination that
// matches no kernel now trips an assert instead of returning silently with
// `out` never written.
template <class Impl>
void StaggeredKernels<Impl>::DhopNaive(StencilImpl &st,
                                       DoubledGaugeField &U,
                                       const FermionField &in, FermionField &out, int dag, int interior,int exterior)
{
  GridBase *FGrid=in.Grid();
  GridBase *UGrid=U.Grid();

  // ThisKernel and the *_v / buf / Ls / Nsite names below are consumed by the
  // KERNEL_CALL macro and must keep exactly these names.
  typedef StaggeredKernels<Impl> ThisKernel;

  // NOTE(review): Nsimd and lane are not referenced by the code visible in
  // this function; confirm the launch macros do not need them before removing.
  const int Nsimd = SiteHalfSpinor::Nsimd();
  const int lane=acceleratorSIMTlane(Nsimd);

  // There is no separate UUU field here: UUU_v is a second view of U, opened
  // only because KERNEL_CALL passes a UUU_v argument to the site kernels.
  autoView( UUU_v ,   U, AcceleratorRead);
  autoView( U_v   ,   U, AcceleratorRead);
  autoView( in_v  ,  in, AcceleratorRead);
  autoView( out_v , out, AcceleratorWrite);
  autoView( st_v  ,  st, AcceleratorRead);
  SiteSpinor * buf = st.CommBuf();

  // When the fermion grid has one more dimension than the gauge grid, its
  // leading reduced dimension is taken as Ls; otherwise Ls is 1.
  int Ls=1;
  if(FGrid->Nd()==UGrid->Nd()+1){
    Ls    = FGrid->_rdimensions[0];
  }
  int Nsite = UGrid->oSites();

  if( interior && exterior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGeneric,0); return;}
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHand,0);    return;}
  } else if( interior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericInt,0); return;}
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandInt,0);    return;}
  } else if( exterior ) {
    if (Opt == OptGeneric    ) { KERNEL_CALL(DhopSiteGenericExt,0); return;}
    if (Opt == OptHandUnroll ) { KERNEL_CALL(DhopSiteHandExt,0);    return;}
  }

  // No kernel was launched for this Opt / interior / exterior combination.
  assert(0 && " Kernel optimisation case not covered ");
}


// Undefine the kernel-launch helper macros defined earlier in this file.
#undef KERNEL_CALLNB

#undef KERNEL_CALL

#undef ASM_CALL


NAMESPACE_END(Grid);


acceleratorSIMTlane
accelerator_inline int acceleratorSIMTlane(int Nsimd)
Definition Accelerator.h:614

accelerator_inline
#define accelerator_inline
Definition Accelerator.h:608

FermionCore.h

autoView
#define autoView(l_v, l, mode)
Definition Lattice_view.h:119

AcceleratorRead
@ AcceleratorRead
Definition MemoryManager.h:66

AcceleratorWrite
@ AcceleratorWrite
Definition MemoryManager.h:67

NAMESPACE_BEGIN
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35

NAMESPACE_END
#define NAMESPACE_END(A)
Definition Namespace.h:36

Xm
static constexpr int Xm
Definition QCD.h:45

Tm
static constexpr int Tm
Definition QCD.h:48

Tp
static constexpr int Tp
Definition QCD.h:44

Zp
static constexpr int Zp
Definition QCD.h:43

Zm
static constexpr int Zm
Definition QCD.h:47

Xp
static constexpr int Xp
Definition QCD.h:41

Yp
static constexpr int Yp
Definition QCD.h:42

Ym
static constexpr int Ym
Definition QCD.h:46

GENERIC_STENCIL_LEG_EXT
#define GENERIC_STENCIL_LEG_EXT(U, Dir, skew, multLink)
Definition StaggeredKernelsImplementation.h:58

GENERIC_STENCIL_LEG_INT
#define GENERIC_STENCIL_LEG_INT(U, Dir, skew, multLink)
Definition StaggeredKernelsImplementation.h:46

ASM_CALL
#define ASM_CALL(A)
Definition StaggeredKernelsImplementation.h:250

KERNEL_CALL
#define KERNEL_CALL(A, improved)
Definition StaggeredKernelsImplementation.h:248

GENERIC_STENCIL_LEG
#define GENERIC_STENCIL_LEG(U, Dir, skew, multLink)
Definition StaggeredKernelsImplementation.h:35

coalescedWrite
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87

coalescedRead
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61

ptype
int ptype
Definition WilsonKernelsAsmBody.h:130

U
static INTERNAL_PRECISION U
Definition Zolotarev.cc:230

GridBase
Definition Cartesian_base.h:43

GridBase::oSites
int oSites(void) const
Definition Cartesian_base.h:185

GridBase::_rdimensions
Coordinate _rdimensions
Definition Cartesian_base.h:68

GridBase::Nd
int Nd(void) const
Definition Cartesian_base.h:188

StaggeredKernelsStatic::Opt
static int Opt
Definition StaggeredKernels.h:40

StaggeredKernelsStatic::OptInlineAsm
@ OptInlineAsm
Definition StaggeredKernels.h:38

StaggeredKernelsStatic::OptHandUnroll
@ OptHandUnroll
Definition StaggeredKernels.h:38

StaggeredKernelsStatic::OptGeneric
@ OptGeneric
Definition StaggeredKernels.h:38

StaggeredKernels::DhopNaive
void DhopNaive(StencilImpl &st, DoubledGaugeField &U, const FermionField &in, FermionField &out, int dag, int interior, int exterior)
Definition StaggeredKernelsImplementation.h:297

StaggeredKernels::DhopSiteAsm
void DhopSiteAsm(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsAsm.h:621

StaggeredKernels::DhopSiteHandExt
static accelerator_inline void DhopSiteHandExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsHand.h:315

StaggeredKernels::DhopSiteHand
static accelerator_inline void DhopSiteHand(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsHand.h:190

StaggeredKernels::DhopSiteHandInt
static accelerator_inline void DhopSiteHandInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsHand.h:251

StaggeredKernels::DhopDirKernel
void DhopDirKernel(StencilImpl &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int sF, int sU, const FermionFieldView &in, FermionFieldView &out, int dir, int disp)
Definition StaggeredKernelsImplementation.h:230

StaggeredKernels::StaggeredKernels
StaggeredKernels(const ImplParams &p=ImplParams())
Definition StaggeredKernelsImplementation.h:67

StaggeredKernels::DhopImproved
void DhopImproved(StencilImpl &st, DoubledGaugeField &U, DoubledGaugeField &UUU, const FermionField &in, FermionField &out, int dag, int interior, int exterior)
Definition StaggeredKernelsImplementation.h:259

StaggeredKernels::DhopSiteGenericExt
static accelerator_inline void DhopSiteGenericExt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsImplementation.h:176

StaggeredKernels::DhopSiteGenericInt
static accelerator_inline void DhopSiteGenericInt(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsImplementation.h:125

StaggeredKernels::Base
FermionOperator< Impl > Base
Definition StaggeredKernels.h:48

StaggeredKernels::DhopSiteGeneric
static accelerator_inline void DhopSiteGeneric(StencilView &st, DoubledGaugeFieldView &U, DoubledGaugeFieldView &UUU, SiteSpinor *buf, int LLs, int sU, const FermionFieldView &in, FermionFieldView &out, int dag)
Definition StaggeredKernelsImplementation.h:75

Zero
Definition Simd.h:194

Grid
Definition Deflation.h:31

StencilEntry
Definition Stencil.h:84