Grid 0.7.0
Tensor_SIMT.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/tensors/Tensor_SIMT.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10
11 This program is free software; you can redistribute it and/or modify
12 it under the terms of the GNU General Public License as published by
13 the Free Software Foundation; either version 2 of the License, or
14 (at your option) any later version.
15
16 This program is distributed in the hope that it will be useful,
17 but WITHOUT ANY WARRANTY; without even the implied warranty of
18 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 GNU General Public License for more details.
20
21 You should have received a copy of the GNU General Public License along
22 with this program; if not, write to the Free Software Foundation, Inc.,
23 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
24
25 See the full license in the file "LICENSE" in the top level distribution directory
26*************************************************************************************/
27/* END LEGAL */
28#pragma once
29
30#include <string.h>
31
33
35// Inside a GPU thread
// Exchange two vector objects between paired SIMD lanes inside a GPU thread.
// For split direction `type`, lanes are paired across the bit
// `mask = Nsimd >> (type+1)`: mp0 receives the "inner coordinate 0" half and
// mp1 the "inner coordinate 1" half, each lane sourcing from vp0 or vp1
// according to which side of the pair it sits on.
template<class vobj>
accelerator_inline void exchangeSIMT(vobj &mp0,vobj &mp1,const vobj &vp0,const vobj &vp1,Integer type)
{
  typedef decltype(coalescedRead(mp0)) sobj; // scalar view type — NOTE(review): unused below
  unsigned int Nsimd = vobj::Nsimd();
  unsigned int mask = Nsimd >> (type + 1); // bit that distinguishes the two lanes of a pair
  int lane = acceleratorSIMTlane(Nsimd);   // this GPU thread's SIMD lane
  int j0 = lane &(~mask); // inner coor zero: lane index with the pair bit cleared
  int j1 = lane |(mask) ; // inner coor one : lane index with the pair bit set
  const vobj *vpa = &vp0;
  const vobj *vpb = &vp1;
  // Lanes with the pair bit set read from vp1; the others read from vp0.
  const vobj *vp = (lane&mask) ? (vpb) : (vpa);
  auto sa = coalescedRead(vp[0],j0); // lane j0's slice of the chosen source
  auto sb = coalescedRead(vp[0],j1); // lane j1's slice of the chosen source
  coalescedWrite(mp0,sa);
  coalescedWrite(mp1,sb);
}
54
55
56#ifndef GRID_SIMT
58// Trivial mapping of vectors on host
60template<class vobj> accelerator_inline
61vobj coalescedRead(const vobj & __restrict__ vec,int lane=0)
62{
63 return vec;
64}
65template<class vobj> accelerator_inline
66vobj coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=0)
67{
68 if ( doperm ) {
69 vobj ret;
70 permute(ret,vec, ptype);
71 return ret;
72 } else {
73 return vec;
74 }
75}
76//'perm_mask' acts as a bitmask
77template<class vobj> accelerator_inline
78vobj coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=0)
79{
80 auto obj = vec, tmp = vec;
81 for (int d=0;d<nd;d++)
82 if (perm_mask & (0x1 << d)) { permute(obj,tmp,d); tmp=obj;}
83 return obj;
84}
85
86template<class vobj> accelerator_inline
87void coalescedWrite(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
88{
89 vec = extracted;
90}
91template<class vobj> accelerator_inline
92void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
93{
94 vstream(vec, extracted);
95}
96#else //==GRID_SIMT
97
98
99//#ifndef GRID_SYCL
100#if 1
101// Use the scalar as our own complex on GPU ... thrust::complex or std::complex
102template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
103typename vsimd::scalar_type
104coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
105{
106 typedef typename vsimd::scalar_type S;
107 S * __restrict__ p=(S *)&vec;
108 return p[lane];
109}
110template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
111typename vsimd::scalar_type
112coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
113{
114 typedef typename vsimd::scalar_type S;
115
116 S * __restrict__ p=(S *)&vec;
117 int mask = vsimd::Nsimd() >> (ptype + 1);
118 int plane= doperm ? lane ^ mask : lane;
119 return p[plane];
120}
121template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
122void coalescedWrite(vsimd & __restrict__ vec,
123 const typename vsimd::scalar_type & __restrict__ extracted,
124 int lane=acceleratorSIMTlane(vsimd::Nsimd()))
125{
126 typedef typename vsimd::scalar_type S;
127 S * __restrict__ p=(S *)&vec;
128 p[lane]=extracted;
129}
130#else
131// For SyCL have option to use GpuComplex from inside the vector type in SIMT loops
132// Faster for some reason
133template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
134typename vsimd::vector_type::datum
135coalescedRead(const vsimd & __restrict__ vec,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
136{
137 typedef typename vsimd::vector_type::datum S;
138 S * __restrict__ p=(S *)&vec;
139 return p[lane];
140}
141template<int ptype,class vsimd,IfSimd<vsimd> = 0> accelerator_inline
142typename vsimd::vector_type::datum
143coalescedReadPermute(const vsimd & __restrict__ vec,int doperm,int lane=acceleratorSIMTlane(vsimd::Nsimd()))
144{
145 typedef typename vsimd::vector_type::datum S;
146
147 S * __restrict__ p=(S *)&vec;
148 int mask = vsimd::Nsimd() >> (ptype + 1);
149 int plane= doperm ? lane ^ mask : lane;
150 return p[plane];
151}
152template<class vsimd,IfSimd<vsimd> = 0> accelerator_inline
153void coalescedWrite(vsimd & __restrict__ vec,
154 const typename vsimd::vector_type::datum & __restrict__ extracted,
155 int lane=acceleratorSIMTlane(vsimd::Nsimd()))
156{
157 typedef typename vsimd::vector_type::datum S;
158 S * __restrict__ p=(S *)&vec;
159 p[lane]=extracted;
160}
161#endif
162
164// Extract and insert slices on the GPU
166template<class vobj> accelerator_inline
167typename vobj::scalar_object coalescedRead(const vobj & __restrict__ vec,int lane=acceleratorSIMTlane(vobj::Nsimd()))
168{
169 return extractLane(lane,vec);
170}
171template<class vobj> accelerator_inline
172typename vobj::scalar_object coalescedReadPermute(const vobj & __restrict__ vec,int ptype,int doperm,int lane=acceleratorSIMTlane(vobj::Nsimd()))
173{
174 int mask = vobj::Nsimd() >> (ptype + 1);
175 int plane= doperm ? lane ^ mask : lane;
176 return extractLane(plane,vec);
177}
178template<class vobj> accelerator_inline
179typename vobj::scalar_object coalescedReadGeneralPermute(const vobj & __restrict__ vec,int perm_mask,int nd,int lane=acceleratorSIMTlane(vobj::Nsimd()))
180{
181 int plane = lane;
182 for (int d=0;d<nd;d++)
183 plane = (perm_mask & (0x1 << d)) ? plane ^ (vobj::Nsimd() >> (d + 1)) : plane;
184 return extractLane(plane,vec);
185}
186template<class vobj> accelerator_inline
187void coalescedWrite(vobj & __restrict__ vec,const typename vobj::scalar_object & __restrict__ extracted,int lane=acceleratorSIMTlane(vobj::Nsimd()))
188{
189 insertLane(lane,vec,extracted);
190}
// SIMT build: "non-temporal" write falls back to a plain lane insertion.
// NOTE(review): unlike the coalescedWrite overload above, `extracted` is a
// full vobj (not vobj::scalar_object) and `lane` defaults to 0 rather than
// acceleratorSIMTlane(...) — this only instantiates where insertLane accepts
// that argument; confirm against callers whether this asymmetry is intended.
template<class vobj> accelerator_inline
void coalescedWriteNonTemporal(vobj & __restrict__ vec,const vobj & __restrict__ extracted,int lane=0)
{
  insertLane(lane,vec,extracted);
}
196#endif
197
198
200
accelerator_inline int acceleratorSIMTlane(int Nsimd)
#define accelerator_inline
accelerator_inline void vstream(Grid_simd2< S, V > &out, const Grid_simd2< S, V > &in)
accelerator_inline void permute(ComplexD &y, ComplexD b, int perm)
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
accelerator_inline vobj coalescedReadPermute(const vobj &__restrict__ vec, int ptype, int doperm, int lane=0)
Definition Tensor_SIMT.h:66
accelerator_inline void coalescedWriteNonTemporal(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:92
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
Definition Tensor_SIMT.h:87
accelerator_inline void exchangeSIMT(vobj &mp0, vobj &mp1, const vobj &vp0, const vobj &vp1, Integer type)
Definition Tensor_SIMT.h:38
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
Definition Tensor_SIMT.h:61
accelerator_inline vobj coalescedReadGeneralPermute(const vobj &__restrict__ vec, int perm_mask, int nd, int lane=0)
Definition Tensor_SIMT.h:78
accelerator_inline void insertLane(int lane, vobj &__restrict__ vec, const typename vobj::scalar_object &__restrict__ extracted)
accelerator_inline vobj::scalar_object extractLane(int lane, const vobj &__restrict__ vec)