7#define gpuError_t cudaError_t
8#define gpuSuccess cudaSuccess
10#elif defined(GRID_HIP)
12#include <hipcub/hipcub.hpp>
14#define gpuError_t hipError_t
15#define gpuSuccess hipSuccess
23#if defined(GRID_CUDA) || defined(GRID_HIP)
// NOTE(review): this chunk is a lossy extraction -- interior lines of the
// function are missing, so the comments below describe only what is visible.
// Slice-sum reduction for "small" vobj via cub/hipcub segmented reduce:
// one reduction segment per reduced-dimension slice r in [0, rd).
25inline void sliceSumReduction_cub_small(
const vobj *Data,
26 std::vector<vobj> &lvSum,
// Number of sites in one segment (one slice of the e1 x e2 subvolume).
34 size_t subvol_size = e1*e2;
36 auto rb_p = &reduction_buffer[0];
// cub convention: a first Reduce call with a NULL workspace pointer only
// computes the required temp_storage_bytes (the "(setup)" pass below).
41 void *temp_storage_array = NULL;
42 size_t temp_storage_bytes = 0;
// rd+1 segment boundaries at multiples of subvol_size.
46 std::vector<int> offsets(rd+1,0);
48 for (
int i = 0; i < offsets.size(); i++) {
49 offsets[i] = i*subvol_size;
// Workspace-size query pass; failures are reported on GridLogError.
61 gpuError_t gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p,d_out, rd, d_offsets, d_offsets+1, ::gpucub::Sum(), zero_init, computeStream);
62 if (gpuErr!=gpuSuccess) {
63 std::cout <<
GridLogError <<
"Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce (setup)! Error: " << gpuErr <<std::endl;
// Gather pass: pack sites into the reduction buffer (kernel body partially
// elided by the extraction); ss indexes the source site for (n, b).
73 accelerator_for2dNB( s,subvol_size, r,rd, Nsimd,{
78 int ss= so+n*stride+b;
// Actual segmented reduction into d_out: one summed vobj per slice.
85 gpuErr = gpucub::DeviceSegmentedReduce::Reduce(temp_storage_array, temp_storage_bytes, rb_p, d_out, rd, d_offsets, d_offsets+1,::gpucub::Sum(), zero_init, computeStream);
86 if (gpuErr!=gpuSuccess) {
87 std::cout <<
GridLogError <<
"Lattice_slicesum_gpu.h: Encountered error during gpucub::DeviceSegmentedReduce::Reduce! Error: " << gpuErr <<std::endl;
105#if defined(GRID_SYCL)
// SYCL variant of the small-object slice-sum: one sycl::reduction kernel per
// slice r, accumulating into USM shared memory readable from the host.
// NOTE(review): lossy extraction -- interior lines are missing.
107inline void sliceSumReduction_sycl_small(
const vobj *Data,
108 std::vector <vobj> &lvSum,
116 size_t subvol_size = e1*e2;
// One accumulator per reduced slice, allocated in shared (host+device) USM.
118 vobj *mysum = (vobj *) malloc_shared(rd*
sizeof(vobj),*theGridAccelerator);
// Zero-initialise every accumulator before launching the reductions.
121 for (
int r = 0; r<rd; r++) {
122 mysum[r] = vobj_zero;
127 auto rb_p = &reduction_buffer[0];
137 int ss= so+n*stride+b;
// Launch one reduction kernel per slice r over its subvol_size entries.
143 for (
int r = 0; r < rd; r++) {
144 theGridAccelerator->submit([&](sycl::handler &cgh) {
145 auto Reduction = sycl::reduction(&mysum[r],std::plus<>());
146 cgh.parallel_for(sycl::range<1>{subvol_size},
148 [=](sycl::id<1> item,
auto &
sum) {
150 sum += rb_p[r*subvol_size+s];
// Block until all rd reduction kernels complete before reading mysum.
156 theGridAccelerator->wait();
// Copy-out loop (body elided by the extraction), then release the USM block.
157 for (
int r = 0; r < rd; r++) {
160 free(mysum,*theGridAccelerator);
// Large-object path (presumably sliceSumReduction_large -- the signature
// line is missing from this extraction; TODO confirm against the full file).
166 std::vector<vobj> &lvSum,
// Decompose each vobj into `words` SIMD vectors and reduce word-by-word,
// so the small-object kernels can be reused for arbitrarily large vobj.
174 typedef typename vobj::vector_type vector;
175 const int words =
sizeof(vobj)/
sizeof(vector);
176 const int osites = rd*e1*e2;
// Reinterpret the vobj arrays as flat arrays of their component vectors.
178 vector *dat = (vector *)Data;
179 vector *buf = &buffer[0];
180 std::vector<vector> lvSum_small(rd);
181 vector *lvSum_ptr = (vector *)&lvSum[0];
// For each word w: gather the w-th vector of every site into buf...
183 for (
int w = 0; w < words; w++) {
185 buf[ss] = dat[ss*words+w];
// ...reduce it with the backend-appropriate small-object kernel...
188 #if defined(GRID_CUDA) || defined(GRID_HIP)
189 sliceSumReduction_cub_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
190 #elif defined(GRID_SYCL)
191 sliceSumReduction_sycl_small(buf,lvSum_small,rd,e1,e2,stride, ostride,Nsimd);
// ...then scatter the per-slice partial sums back into lvSum's word slots.
194 for (
int r = 0; r < rd; r++) {
195 lvSum_ptr[w+words*r]=lvSum_small[r];
// GPU dispatcher (presumably sliceSumReduction_gpu -- signature elided in
// this extraction; TODO confirm): picks the small-object fast path when the
// vobj fits a compile-time size threshold, else (elided) the large path.
202 std::vector<vobj> &lvSum,
// 256-byte threshold decided at compile time via if constexpr.
211 if constexpr (
sizeof(vobj) <= 256) {
213 #if defined(GRID_CUDA) || defined(GRID_HIP)
214 sliceSumReduction_cub_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
215 #elif defined (GRID_SYCL)
216 sliceSumReduction_sycl_small(&Data_v[0], lvSum, rd, e1, e2, stride, ostride, Nsimd);
// CPU fallback (presumably sliceSumReduction_cpu -- signature elided in this
// extraction; TODO confirm): direct nested accumulation over the e1 x e2
// subvolume for each reduced slice r (outer r loop elided by the extraction).
228 std::vector<vobj> &lvSum,
241 for(
int n=0;n<e1;n++){
242 for(
int b=0;b<e2;b++){
// Site index within the lattice view for this (n, b) pair.
243 int ss= so+n*stride+b;
244 lvSum[r]=lvSum[r]+Data_v[ss];
251 std::vector<vobj> &lvSum,
259#if defined(GRID_CUDA) || defined(GRID_HIP) || defined(GRID_SYCL)
void * acceleratorAllocDevice(size_t bytes)
acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes)
#define accelerator_for(iterator, num, nsimd,...)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd,...)
void acceleratorFreeDevice(void *ptr)
#define accelerator_barrier(dummy)
acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes)
std::vector< T, devAllocator< T > > deviceVector
accelerator_inline void zeroit(Grid_simd2< S, V > &z)
vobj::scalar_object sum(const vobj *arg, Integer osites)
void sliceSumReduction(const Lattice< vobj > &Data, std::vector< vobj > &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
void sliceSumReduction_cpu(const Lattice< vobj > &Data, std::vector< vobj > &lvSum, const int &rd, const int &e1, const int &e2, const int &stride, const int &ostride, const int &Nsimd)
void sliceSumReduction_large(const vobj *Data, std::vector< vobj > &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
void sliceSumReduction_gpu(const Lattice< vobj > &Data, std::vector< vobj > &lvSum, const int rd, const int e1, const int e2, const int stride, const int ostride, const int Nsimd)
#define autoView(l_v, l, mode)
GridLogger GridLogError(1, "Error", GridLogColours, "RED")
#define NAMESPACE_BEGIN(A)
accelerator_inline void coalescedWrite(vobj &__restrict__ vec, const vobj &__restrict__ extracted, int lane=0)
accelerator_inline vobj coalescedRead(const vobj &__restrict__ vec, int lane=0)
#define thread_for(i, num,...)