Grid 0.7.0
Tensor_SIMT.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/tensors/Tensor_SIMT.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#pragma once
29
30#include <string.h>
31
33
35// Inside a GPU thread
// Exchange two vector objects between paired SIMD lanes inside a GPU thread.
// For split direction `type`, lanes are paired across the bit
// `mask = Nsimd >> (type+1)`: mp0 receives the "inner coordinate 0" half and
// mp1 the "inner coordinate 1" half, each lane sourcing from vp0 or vp1
// according to which side of the pair it sits on.
template<class vobj>
accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type)
{
  typedef decltype(coalescedRead(mp0)) sobj; // scalar view type — NOTE(review): unused below
  unsigned int Nsimd = vobj::Nsimd();
  unsigned int mask = Nsimd >> (type + 1); // bit that distinguishes the two lanes of a pair
  int lane = acceleratorSIMTlane(Nsimd);   // this GPU thread's SIMD lane
  int j0 = lane &(~mask); // inner coor zero: lane index with the pair bit cleared
  int j1 = lane |(mask) ; // inner coor one : lane index with the pair bit set
  const vobj *vpa = &vp0;
  const vobj *vpb = &vp1;
  // Lanes with the pair bit set read from vp1; the others read from vp0.
  const vobj *vp = (lane&mask) ? (vpb) : (vpa);
  auto sa = coalescedRead(vp[0],j0); // lane j0's slice of the chosen source
  auto sb = coalescedRead(vp[0],j1); // lane j1's slice of the chosen source
  coalescedWrite(mp0,sa);
  coalescedWrite(mp1,sb);
}
54
55
56#ifndef GRID_SIMT
58// Trivial mapping of vectors on host
60template<class vobj> accelerator_inline
61vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
62{
63 return vec;
64}
65template<class vobj> accelerator_inline
66vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=0)
67{
68 if ( doperm ) {
69 vobj ret;
70 permute(ret,vec, ptype);
71 return ret;
72 } else {
73 return vec;
74 }
75}
76//'perm_mask' acts as a bitmask
77template<class vobj> accelerator_inline
78vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
79{
80 auto obj = vec, tmp = vec;
81 for (int d=0;d<nd;d++)
82 if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
83 return obj;
84}
85
86template<class vobj> accelerator_inline
87void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
88{
89 vec = extracted;
90}
91template<class vobj> accelerator_inline
92void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
93{
94 vstream(vec, extracted);
95}
96#else //==GRID_SIMT
97
98
99//#ifndef GRID_SYCL
100#if 1
101// Use the scalar as our own complex on GPU ... thrust::complex or std::complex
102template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
103typename vsimd::scalar_type
104coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
105{
106 typedef typename vsimd::scalar_type S;
107 S * __restrict__ p=(S *)&vec;
108 return p[lane];
109}
110template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
111typename vsimd::scalar_type
112coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
113{
114 typedef typename vsimd::scalar_type S;
115
116 S * __restrict__ p=(S *)&vec;
117 int mask = vsimd::Nsimd() >> (ptype + 1);
118 int plane= doperm ? lane ^ mask : lane;
119 return p[plane];
120}
121template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
122void coalescedWrite(vsimd & __restrict__ vec,
123 const typename vsimd::scalar_type & __restrict__ extracted,
124 int lane=acceleratorSIMTlane(vsimd::Nsimd()))
125{
126 typedef typename vsimd::scalar_type S;
127 S * __restrict__ p=(S *)&vec;
128 p[lane]=extracted;
129}
130#else
131// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops
132// Faster for some reason
133template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
134typename vsimd::vector_type::datum
135coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
136{
137 typedef typename vsimd::vector_type::datum S;
138 S * __restrict__ p=(S *)&vec;
139 return p[lane];
140}
141template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
142typename vsimd::vector_type::datum
143coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
144{
145 typedef typename vsimd::vector_type::datum S;
146
147 S * __restrict__ p=(S *)&vec;
148 int mask = vsimd::Nsimd() >> (ptype + 1);
149 int plane= doperm ? lane ^ mask : lane;
150 return p[plane];
151}
152template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
153void coalescedWrite(vsimd & __restrict__ vec,
154 const typename vsimd::vector_type::datum & __restrict__ extracted,
155 int lane=acceleratorSIMTlane(vsimd::Nsimd()))
156{
157 typedef typename vsimd::vector_type::datum S;
158 S * __restrict__ p=(S *)&vec;
159 p[lane]=extracted;
160}
161#endif
162
164// Extract and insert slices on the GPU
166template<class vobj> accelerator_inline
167typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
168{
169 return extractLane(lane,vec);
170}
171template<class vobj> accelerator_inline
172typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
173{
174 int mask = vobj::Nsimd() >> (ptype + 1);
175 int plane= doperm ? lane ^ mask : lane;
176 return extractLane(plane,vec);
177}
178template<class vobj> accelerator_inline
179typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
180{
181 int plane = lane;
182 for (int d=0;d<nd;d++)
183 plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
184 return extractLane(plane,vec);
185}
186template<class vobj> accelerator_inline
187void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
188{
189 insertLane(lane,vec,extracted);
190}
// SIMT build: "non-temporal" write falls back to a plain lane insertion.
// NOTE(review): unlike the coalescedWrite overload above, `extracted` is a
// full vobj (not vobj::scalar_object) and `lane` defaults to 0 rather than
// acceleratorSIMTlane(...) — this only instantiates where insertLane accepts
// that argument; confirm against callers whether this asymmetry is intended.
template<class vobj> accelerator_inline
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
  insertLane(lane,vec,extracted);
}
196#endif
197
198
200
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
accelerator_inline void vstream(Grid_simd2< S, V > &out, const Grid_simd2< S, V > &in)
accelerator_inline void permute(ComplexD &y, ComplexD b, int perm)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
accelerator_inline vobj coalescedReadPermute(const vobj &__restrict__ vec, int ptype, int doperm, int lane=0)
Definition Tensor_SIMT.h:66
accelerator_inline void coalescedWriteNonTemporal(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:92
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline void exchangeSIMT(vobj &mp0, vobj &mp1, const vobj &vp0, const vobj &vp1, Integer type)
Definition Tensor_SIMT.h:38
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
accelerator_inline vobj coalescedReadGeneralPermute(const vobj &__restrict__ vec, int perm_mask, int nd, int lane=0)
Definition Tensor_SIMT.h:78
accelerator_inline void insertLane(int lane, vobj &__restrict__ vec, const typename vobj::scalar_object &__restrict__ extracted)
accelerator_inline vobj::scalar_object extractLane(int lane, const vobj &__restrict__ vec)