4extern hipDeviceProp_t *gpu_props;
8extern cudaDeviceProp *gpu_props;
14template <
class Iterator>
25template <
class Iterator>
30 cudaGetDevice(&device);
33 auto r=hipGetDevice(&device);
36 Iterator warpSize = gpu_props[device].warpSize;
37 Iterator sharedMemPerBlock = gpu_props[device].sharedMemPerBlock;
38 Iterator maxThreadsPerBlock = gpu_props[device].maxThreadsPerBlock;
39 Iterator multiProcessorCount = gpu_props[device].multiProcessorCount;
47 if (warpSize != WARP_SIZE) {
48 std::cout <<
GridLogError <<
"The warp size of the GPU in use does not match the warp size set when compiling Grid." << std::endl;
54 if ( threads*sizeofsobj > sharedMemPerBlock ) {
55 std::cout <<
GridLogError <<
"The object is too large for the shared memory." << std::endl;
58 while( 2*threads*sizeofsobj < sharedMemPerBlock && 2*threads <= maxThreadsPerBlock ) threads *= 2;
60 blocks =
nextPow2(multiProcessorCount);
64template <
class sobj,
class Iterator>
65__device__
void reduceBlock(
volatile sobj *sdata, sobj mySum,
const Iterator tid) {
67 Iterator blockSize = blockDim.x;
70 memcpy((
void *)&sdata[tid], (
void *)&mySum,
sizeof(sobj));
73 const Iterator VEC = WARP_SIZE;
74 const Iterator vid = tid & (VEC-1);
77 memcpy((
void *)&beta, (
void *)&mySum,
sizeof(sobj));
79 for (
int i = VEC/2; i > 0; i>>=1) {
81 memcpy((
void *)&temp, (
void *)&sdata[tid+i],
sizeof(sobj));
83 memcpy((
void *)&sdata[tid], (
void *)&beta,
sizeof(sobj));
89 if (threadIdx.x == 0) {
91 for (Iterator i = 0; i < blockSize; i += VEC) {
92 memcpy((
void *)&temp, (
void *)&sdata[i],
sizeof(sobj));
95 memcpy((
void *)&sdata[0], (
void *)&beta,
sizeof(sobj));
101template <
class vobj,
class sobj,
class Iterator>
102__device__
void reduceBlocks(
const vobj *g_idata, sobj *g_odata, Iterator n)
104 constexpr Iterator nsimd = vobj::Nsimd();
106 Iterator blockSize = blockDim.x;
112 sobj *sdata = (sobj *)shmem_pointer;
116 Iterator tid = threadIdx.x;
117 Iterator i = blockIdx.x*(blockSize*2) + threadIdx.x;
118 Iterator gridSize = blockSize*2*gridDim.x;
122 Iterator lane = i % nsimd;
123 Iterator ss = i / nsimd;
129 if (i + blockSize < n) {
130 lane = (i+blockSize) % nsimd;
131 ss = (i+blockSize) / nsimd;
142 if (tid == 0) g_odata[blockIdx.x] = sdata[0];
145template <
class vobj,
class sobj,
class Iterator>
146__global__
void reduceKernel(
const vobj *lat, sobj *buffer, Iterator n) {
148 Iterator blockSize = blockDim.x;
156 const Iterator tid = threadIdx.x;
157 __shared__
bool amLast;
162 sobj *smem = (sobj *)shmem_pointer;
170 amLast = (ticket == gridDim.x-1);
181 while (i < gridDim.x) {
203 typedef typename vobj::scalar_objectD sobj;
204 typedef decltype(lat) Iterator;
213 Integer smemSize = numThreads *
sizeof(sobj);
218 sobj *buffer_v = &buffer[0];
229 typedef typename vobj::vector_type vector;
230 typedef typename vobj::scalar_typeD scalarD;
231 typedef typename vobj::scalar_objectD sobj;
233 scalarD *ret_p = (scalarD *)&ret;
235 const int words =
sizeof(vobj)/
sizeof(vector);
238 vector *dat = (vector *)lat;
239 vector *buf = &buffer[0];
241 for(
int w=0;w<words;w++) {
244 buf[ss] = dat[ss*words+w];
255 typedef typename vobj::scalar_objectD sobj;
277 typedef typename vobj::scalar_object sobj;
286 typedef typename vobj::scalar_object sobj;
accelerator_inline void acceleratorSynchroniseAll(void)
accelerator_inline void acceleratorSynchronise(void)
accelerator_inline void acceleratorFence(void)
#define accelerator_for(iterator, num, nsimd,...)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
#define accelerator_barrier(dummy)
std::vector< T, devAllocator< T > > deviceVector
#define COALESCE_GRANULARITY
vobj::scalar_objectD sumD_gpu_small(const vobj *lat, Integer osites)
vobj::scalar_object sum_gpu_large(const vobj *lat, Integer osites)
vobj::scalar_objectD sumD_gpu(const vobj *lat, Integer osites)
__device__ void reduceBlocks(const vobj *g_idata, sobj *g_odata, Iterator n)
__global__ void reduceKernel(const vobj *lat, sobj *buffer, Iterator n)
vobj::scalar_objectD sumD_gpu_large(const vobj *lat, Integer osites)
unsigned int nextPow2(Iterator x)
int getNumBlocksAndThreads(const Iterator n, const size_t sizeofsobj, Iterator &threads, Iterator &blocks)
__device__ unsigned int retirementCount
vobj::scalar_object sum_gpu(const vobj *lat, Integer osites)
__device__ void reduceBlock(volatile sobj *sdata, sobj mySum, const Iterator tid)
GridLogger GridLogError(1, "Error", GridLogColours, "RED")
#define NAMESPACE_BEGIN(A)