Grid 0.7.0
Lattice_reduction_gpu.h
Go to the documentation of this file.
2
#ifdef GRID_HIP
extern hipDeviceProp_t *gpu_props;   // per-device properties; defined/filled elsewhere in Grid's HIP init code
#define WARP_SIZE 64                 // AMD wavefront size
#endif
#ifdef GRID_CUDA
extern cudaDeviceProp *gpu_props;    // per-device properties; defined/filled elsewhere in Grid's CUDA init code
#define WARP_SIZE 32                 // NVIDIA warp size
#endif

// Count of thread blocks that have finished phase 1 of reduceKernel.
// The block drawing the last "ticket" (see atomicInc in reduceKernel)
// performs the final cross-block reduction and resets this to zero.
__device__ unsigned int retirementCount = 0;
// Round x up to the next power of two (x itself if already a power of two).
// Uses the classic decrement / bit-smear / increment trick: after smearing,
// every bit below the top set bit is set, so ++x carries into the next power.
template <class Iterator>
unsigned int nextPow2(Iterator x) {
  --x;                                               // so exact powers of two map to themselves
  for (unsigned int s = 1; s <= 16; s <<= 1) {
    x |= x >> s;                                     // smear top bit down by 1,2,4,8,16
  }
  return ++x;
}
24
// Choose a reduction launch configuration for the current device.
//  threads: largest power of two >= warpSize such that the shared-memory
//           scratch (threads * sizeofsobj) fits in a block and threads
//           stays within maxThreadsPerBlock.
//  blocks : multiProcessorCount rounded up to a power of two, to keep all
//           streaming multiprocessors busy.
// Returns 1 on success, 0 if even one warp's worth of sobj exceeds shared
// memory (caller falls back to the multi-pass path).
// Note: parameter n is currently unused; kept for interface stability.
template <class Iterator>
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks) {

  int device;
#ifdef GRID_CUDA
  // check the device query: a sticky earlier error would make every
  // gpu_props[device] access below garbage
  auto err = cudaGetDevice(&device);
  if (err != cudaSuccess) {
    std::cout << GridLogError << "cudaGetDevice failed." << std::endl;
    exit(EXIT_FAILURE);
  }
#endif
#ifdef GRID_HIP
  auto err = hipGetDevice(&device);
  if (err != hipSuccess) {
    std::cout << GridLogError << "hipGetDevice failed." << std::endl;
    exit(EXIT_FAILURE);
  }
#endif

  Iterator warpSize            = gpu_props[device].warpSize;
  Iterator sharedMemPerBlock   = gpu_props[device].sharedMemPerBlock;
  Iterator maxThreadsPerBlock  = gpu_props[device].maxThreadsPerBlock;
  Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;

  // Grid bakes WARP_SIZE in at compile time; a mismatch would break the
  // warp-level reduction in reduceBlock, so refuse to run.
  if (warpSize != WARP_SIZE) {
    std::cout << GridLogError << "The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
    exit(EXIT_FAILURE);
  }

  // let the number of threads in a block be a multiple of 2, starting from warpSize
  threads = warpSize;
  if ( threads*sizeofsobj > sharedMemPerBlock ) {
    std::cout << GridLogError << "The object is too large for the shared memory." << std::endl;
    return 0;
  }
  while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
  // keep all the streaming multiprocessors busy
  blocks = nextPow2(multiProcessorCount);
  return 1;
}
63
// Block-level tree reduction in shared memory.
// On entry each thread holds a partial sum mySum; on exit sdata[0] holds the
// sum over the whole block. sdata must hold blockDim.x elements of sobj.
// NOTE: the extracted source had dropped the synchronization lines between
// the shared-memory writes and reads (a data race); they are restored here.
template <class sobj, class Iterator>
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid) {

  Iterator blockSize = blockDim.x;

  // cannot use overloaded operators for sobj as they are not volatile-qualified
  memcpy((void *)&sdata[tid], (void *)&mySum, sizeof(sobj));
  acceleratorSynchronise();     // publish every lane's partial before reading neighbours

  const Iterator VEC = WARP_SIZE;
  const Iterator vid = tid & (VEC-1);   // lane index within the warp/wavefront

  sobj beta, temp;
  memcpy((void *)&beta, (void *)&mySum, sizeof(sobj));

  // warp-level tree reduction: stride halves each pass
  for (int i = VEC/2; i > 0; i>>=1) {
    if (vid < i) {
      memcpy((void *)&temp, (void *)&sdata[tid+i], sizeof(sobj));
      beta += temp;
      memcpy((void *)&sdata[tid], (void *)&beta, sizeof(sobj));
    }
    acceleratorSynchronise();   // lanes must see partners' updates before the next stride
  }
  acceleratorSynchroniseAll();  // block barrier: all per-warp results now in shared memory

  // thread 0 accumulates one result per warp (every VEC-th slot)
  if (threadIdx.x == 0) {
    beta = Zero();
    for (Iterator i = 0; i < blockSize; i += VEC) {
      memcpy((void *)&temp, (void *)&sdata[i], sizeof(sobj));
      beta += temp;
    }
    memcpy((void *)&sdata[0], (void *)&beta, sizeof(sobj));
  }
  acceleratorSynchroniseAll();  // make sdata[0] visible to every thread before returning
}
99
100
// Phase-1 reduction: each block sums its share of the n scalar lanes of the
// vectorised field g_idata and writes one partial result to g_odata[blockIdx.x].
// n = osites * Nsimd; flat index i addresses SIMD lane (i % nsimd) of
// site (i / nsimd). Requires blockDim.x * sizeof(sobj) bytes of dynamic
// shared memory.
template <class vobj, class sobj, class Iterator>
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
{
  constexpr Iterator nsimd = vobj::Nsimd();

  Iterator blockSize = blockDim.x;

  // force shared memory alignment
  extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
  // it's not possible to have two extern __shared__ arrays with same name
  // but different types in different scopes -- need to cast each time
  sobj *sdata = (sobj *)shmem_pointer;

  // first level of reduction,
  // each thread writes result in mySum
  Iterator tid = threadIdx.x;
  Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
  Iterator gridSize = blockSize*2*gridDim.x;   // grid-stride: each pass covers 2 elements/thread
  sobj mySum = Zero();

  while (i < n) {
    Iterator lane = i % nsimd;
    Iterator ss = i / nsimd;
    auto tmp = extractLane(lane,g_idata[ss]);
    sobj tmpD;
    tmpD=tmp;        // convert the extracted scalar into the accumulation type sobj
    mySum +=tmpD;

    // second element of the pair; guarded against running off the tail
    if (i + blockSize < n) {
      lane = (i+blockSize) % nsimd;
      ss = (i+blockSize) / nsimd;
      tmp = extractLane(lane,g_idata[ss]);
      tmpD = tmp;
      mySum += tmpD;
    }
    i += gridSize;
  }

  // copy mySum to shared memory and perform
  // reduction for all threads in this block
  reduceBlock(sdata, mySum, tid);
  if (tid == 0) g_odata[blockIdx.x] = sdata[0];
}
144
// Single-launch two-phase reduction ("last block standing"):
//  1) every block reduces its share of lat into buffer[blockIdx.x];
//  2) the last block to retire (tracked via the device-global retirementCount)
//     reduces buffer[0..gridDim.x-1] into buffer[0] and resets the counter.
// Requires blockDim.x * sizeof(sobj) bytes of dynamic shared memory.
// NOTE: the extracted source had dropped the fence/barrier lines that make
// the retirement protocol safe; they are restored here.
template <class vobj, class sobj,class Iterator>
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n) {

  Iterator blockSize = blockDim.x;

  // perform reduction for this block and
  // write result to global memory buffer
  reduceBlocks(lat, buffer, n);

  if (gridDim.x > 1) {

    const Iterator tid = threadIdx.x;
    __shared__ bool amLast;
    // force shared memory alignment
    extern __shared__ __align__(COALESCE_GRANULARITY) unsigned char shmem_pointer[];
    // it's not possible to have two extern __shared__ arrays with same name
    // but different types in different scopes -- need to cast each time
    sobj *smem = (sobj *)shmem_pointer;

    // wait until all outstanding memory instructions in this thread are finished:
    // buffer[blockIdx.x] must be globally visible before this block retires
    acceleratorFence();

    if (tid==0) {
      unsigned int ticket = atomicInc(&retirementCount, gridDim.x);
      // true if this block is the last block to be done
      amLast = (ticket == gridDim.x-1);
    }

    // each thread must read the correct value of amLast
    acceleratorSynchroniseAll();

    if (amLast) {
      // reduce buffer[0], ..., buffer[gridDim.x-1]
      Iterator i = tid;
      sobj mySum = Zero();

      while (i < gridDim.x) {
        mySum += buffer[i];
        i += blockSize;
      }

      reduceBlock(smem, mySum, tid);

      if (tid==0) {
        buffer[0] = smem[0];
        // reset count variable so the next launch starts from zero
        retirementCount = 0;
      }
    }
  }
}
196
198// Possibly promote to double and sum
// Single-launch reduction of osites vobj's, promoting to the double-precision
// scalar object type. Only valid when sizeof(sobj) fits the shared-memory
// budget (asserted); sumD_gpu routes oversized objects to sumD_gpu_large.
// NOTE: the extracted source had dropped the kernel launch + barrier, so
// `result` was copied from a never-written buffer (and smemSize was unused);
// the launch is restored here.
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
{
  typedef typename vobj::scalar_objectD sobj;

  Integer nsimd= vobj::Nsimd();
  Integer size = osites*nsimd;   // total scalar lanes to reduce

  Integer numThreads, numBlocks;
  int ok = getNumBlocksAndThreads(size, sizeof(sobj), numThreads, numBlocks);
  assert(ok);   // object must fit in shared memory on this path

  Integer smemSize = numThreads * sizeof(sobj);   // dynamic shared memory per block
  // Move out of UVM
  // Turns out I had messed up the synchronise after move to compute stream
  // as running this on the default stream fools the synchronise
  deviceVector<sobj> buffer(numBlocks);
  sobj *buffer_v = &buffer[0];
  sobj result;
  // NOTE(review): launched on Grid's computeStream per the comment above —
  // confirm the stream handle name against the accelerator layer.
  reduceKernel<<< numBlocks, numThreads, smemSize, computeStream >>>(lat, buffer_v, size);
  accelerator_barrier();   // kernel must complete before the device->host copy
  acceleratorCopyFromDevice(buffer_v,&result,sizeof(result));
  return result;
}
225
// Fallback for objects too large to reduce in one pass: reduce the vobj one
// vector word at a time, so each sub-reduction only needs an iScalar<vector>
// per thread of shared memory. Promotes to the double-precision scalar object.
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
{
  typedef typename vobj::vector_type vector;
  typedef typename vobj::scalar_typeD scalarD;
  typedef typename vobj::scalar_objectD sobj;
  sobj ret;
  scalarD *ret_p = (scalarD *)&ret;   // view the result object as a flat array of scalars

  // number of SIMD vector words making up one vobj
  const int words = sizeof(vobj)/sizeof(vector);

  deviceVector<vector> buffer(osites);
  vector *dat = (vector *)lat;
  vector *buf = &buffer[0];
  iScalar<vector> *tbuf =(iScalar<vector> *) &buffer[0];
  for(int w=0;w<words;w++) {

    // gather word w of every site into the contiguous scratch buffer
    accelerator_for(ss,osites,1,{
      buf[ss] = dat[ss*words+w];
    });

    // reduce that word across all sites; store into slot w of the result
    ret_p[w] = sumD_gpu_small(tbuf,osites);
  }
  return ret;
}
251
// Sum osites vobj's in double precision, dispatching to the single-pass
// path when the promoted scalar object fits in shared memory, and to the
// word-by-word fallback otherwise.
template <class vobj>
inline typename vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
{
  typedef typename vobj::scalar_objectD sobj;

  // Probe the launch configuration; a zero return means sobj is too big
  // for a one-shot shared-memory reduction.
  Integer nsimd = vobj::Nsimd();
  Integer lanes = osites * nsimd;
  Integer numThreads, numBlocks;
  const int fits = getNumBlocksAndThreads(lanes, sizeof(sobj), numThreads, numBlocks);

  sobj ret;
  if (fits) {
    ret = sumD_gpu_small(lat, osites);
  } else {
    ret = sumD_gpu_large(lat, osites);
  }
  return ret;
}
270
272// Return as same precision as input performing reduction in double precision though
// Sum in double precision, then lower the result back to the input's
// native precision via the scalar_object assignment.
template <class vobj>
inline typename vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
{
  typedef typename vobj::scalar_object sobj;
  sobj lowered;                        // destination in input precision
  lowered = sumD_gpu(lat, osites);     // precision-lowering assignment
  return lowered;
}
282
// Large-object variant of sum_gpu: always take the word-by-word reduction
// path, lowering the double-precision result to the input's precision.
template <class vobj>
inline typename vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
{
  typedef typename vobj::scalar_object sobj;
  sobj lowered;                            // destination in input precision
  lowered = sumD_gpu_large(lat, osites);   // precision-lowering assignment
  return lowered;
}
291
accelerator_inline void acceleratorSynchroniseAll(void)
accelerator_inline void acceleratorSynchronise(void)
accelerator_inline void acceleratorFence(void)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
#define accelerator_barrier(dummy)
std::vector< T, devAllocator< T > > deviceVector
#define COALESCE_GRANULARITY
vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n)
vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
unsigned int nextPow2(Iterator x)
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks)
__device__ unsigned int retirementCount
vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid)
GridLogger GridLogError(1, "Error", GridLogColours, "RED")
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
uint32_t Integer
Definition Simd.h:58
accelerator_inline vobj::scalar_object extractLane(int lane, const vobj &__restrict__ vec)
Definition Simd.h:194