#ifdef HAVE_MALLOC_MALLOC_H
#include <malloc/malloc.h>
#endif
#ifdef HAVE_MM_MALLOC_H
#include <mm_malloc.h>
#endif

// Fallback for platforms without memalign: the alignment request is not honoured.
inline void *memalign(size_t align, size_t bytes) { return malloc(bytes); }
#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline
extern cudaStream_t copyStream;
extern cudaStream_t computeStream;
inline void acceleratorMem(void)
{
  size_t free_t,total_t,used_t;
  cudaMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<<" total "<<total_t<<std::endl;
}
inline void cuda_mem(void) { acceleratorMem(); }
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    int nt=acceleratorThreads();					\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane) mutable {		\
      __VA_ARGS__;							\
    };									\
    dim3 cu_threads(nsimd,acceleratorThreads(),1);			\
    dim3 cu_blocks ((num1+nt-1)/nt,num2,1);				\
    LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
  }
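// Usage sketch (illustrative; x, y and a are hypothetical device-visible
// buffers/scalars). The launch is asynchronous, so an accelerator_barrier()
// must follow before the result is consumed:
//
//   accelerator_for2dNB(ss, nsite, b, nblock, nsimd, {
//     y[ss+nsite*b] = a*x[ss+nsite*b];
//   });
//
// With nsite=1000, nblock=1, nsimd=32 and acceleratorThreads()==8 this gives
// cu_threads=(32,8,1) and cu_blocks=(125,1,1): threadIdx.x walks SIMD lanes,
// threadIdx.y/blockIdx.x walk ss, and threadIdx.z/blockIdx.y walk b.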
#define accelerator_for6dNB(iter1, num1,				\
			    iter2, num2,				\
			    iter3, num3,				\
			    iter4, num4,				\
			    iter5, num5,				\
			    iter6, num6, ... )				\
  {									\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,				\
       Iterator iter3,Iterator iter4,				\
       Iterator iter5,Iterator iter6) mutable {			\
      __VA_ARGS__;							\
    };									\
    dim3 cu_blocks (num1,num2,num3);					\
    dim3 cu_threads(num4,num5,num6);					\
    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }
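// Note: iter1..iter3 map to the block grid while iter4..iter6 map to thread
// dimensions, so num4*num5*num6 is limited by the device's maximum block
// size (1024 threads on current NVIDIA hardware). Every dimension is
// launched exactly, with no round-up.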
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
  // Invert the block/thread decomposition chosen in accelerator_for2dNB
  // back to (iter1,iter2,lane).
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < num1) && (y<num2) && (z<num3) ) {
    Lambda(x,y,z);
  }
}
template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
		  uint64_t num4, uint64_t num5, uint64_t num6,
		  lambda Lambda)
{
  uint64_t iter1 = blockIdx.x;
  uint64_t iter2 = blockIdx.y;
  uint64_t iter3 = blockIdx.z;
  uint64_t iter4 = threadIdx.x;
  uint64_t iter5 = threadIdx.y;
  uint64_t iter6 = threadIdx.z;

  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
    && (iter4 < num4) && (iter5<num5) && (iter6<num6) )
  {
    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
  }
}
#define accelerator_barrier(dummy)					\
  {									\
    cudaStreamSynchronize(computeStream);				\
    cudaError err = cudaGetLastError();					\
    if ( cudaSuccess != err ) {						\
      printf("accelerator_barrier(): Cuda error %s \n",		\
	     cudaGetErrorString( err ));				\
      printf("File %s Line %d\n",__FILE__,__LINE__);			\
      fflush(stdout);							\
      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess);		\
    }									\
  }
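// The *NB launch macros only enqueue work on computeStream; this barrier is
// the single point where completion is guaranteed and asynchronous kernel
// errors surface, e.g.
//
//   accelerator_forNB(ss, num, nsimd, { /* kernel A */ });
//   accelerator_forNB(ss, num, nsimd, { /* kernel B */ });
//   accelerator_barrier(dummy);  // both kernels complete, errors checked here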
inline void *acceleratorAllocHost(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMallocHost((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
}
inline void *acceleratorAllocShared(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
}
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
}
// Pinned allocations from cudaMallocHost must be released with cudaFreeHost.
inline void acceleratorFreeHost(void *ptr){ cudaFreeHost(ptr);};
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
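// Device-to-device transfers are queued on copyStream, separate from
// computeStream, so copies can overlap kernel execution; a sketch using the
// synchronisation entry points declared in this header:
//
//   acceleratorCopyDeviceToDeviceAsynch(src,dst,bytes);    // enqueue on copyStream
//   accelerator_forNB(ss, num, nsimd, { /* compute */ });  // overlaps the copy
//   acceleratorCopySynchronise();                          // dst now safe to read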
#define GRID_SYCL_LEVEL_ZERO_IPC
#define SYCL_REDUCTION_DETERMINISTIC
#include <sycl/sycl.hpp>
#include <sycl/usm.hpp>
#include <level_zero/ze_api.h>
#include <sycl/ext/oneapi/backend/level_zero.hpp>
inline void acceleratorMem(void)
{
  std::cout << " SYCL acceleratorMem not implemented"<<std::endl;
}
extern sycl::queue *theGridAccelerator;
extern sycl::queue *theCopyAccelerator;
#ifdef __SYCL_DEVICE_ONLY__

#define accelerator_inline strong_inline
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
  // SIMD lane = fastest-moving local dimension of the nd_range.
  return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
}
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  theGridAccelerator->submit([&](sycl::handler &cgh) {			\
      unsigned long nt=acceleratorThreads();				\
      unsigned long unum1 = num1;					\
      unsigned long unum2 = num2;					\
      unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
      sycl::range<3> local {nt,1,nsimd};				\
      sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd};		\
      cgh.parallel_for(							\
	  sycl::nd_range<3>(global,local),				\
	  [=] (sycl::nd_item<3> item)					\
	  [[sycl::reqd_sub_group_size(16)]]				\
	  {								\
	    auto iter1 = item.get_global_id(0);				\
	    auto iter2 = item.get_global_id(1);				\
	    auto lane = item.get_global_id(2);				\
	    { if (iter1 < unum1){ __VA_ARGS__ } };			\
	  });								\
    });
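// SYCL requires the global range to be a whole multiple of the local range,
// hence the round-up of unum1 to a multiple of nt; surplus work-items are
// masked by the (iter1 < unum1) guard. E.g. num1=1001 with nt=8 launches a
// global x-range of 1008 and the last 7 work-items do nothing.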
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
inline int acceleratorEventIsComplete(acceleratorEvent_t ev)
{
  return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
}
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
inline int acceleratorIsCommunicable(void *ptr)
{
  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
  if ( uvm == sycl::usm::alloc::shared ) return 1; // comparison, not assignment
  else                                   return 0;
}
#include <hip/hip_runtime.h>

#ifdef __HIP_DEVICE_COMPILE__

#define accelerator        __host__ __device__
#define accelerator_inline __host__ __device__ inline
inline void acceleratorMem(void)
{
  size_t free_t,total_t,used_t;
  auto discard = hipMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<<" total "<<total_t<<std::endl;
}
extern hipStream_t copyStream;
extern hipStream_t computeStream;
accelerator_inline int acceleratorSIMTlane(int Nsimd) {
  // SIMD lane = fastest-moving HIP thread index.
  return hipThreadIdx_x;
}
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... )	\
  {									\
    typedef uint64_t Iterator;						\
    auto lambda = [=] accelerator					\
      (Iterator iter1,Iterator iter2,Iterator lane ) mutable {		\
      __VA_ARGS__;							\
    };									\
    int nt=acceleratorThreads();					\
    dim3 hip_threads(nsimd, nt, 1);					\
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1);				\
    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){		\
      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads,		\
			 0,computeStream,				\
			 num1,num2,nsimd, lambda);			\
    } else {								\
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads,		\
			 0,computeStream,				\
			 num1,num2,nsimd, lambda);			\
    }									\
  }
template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  // Guard against the round-up in the block count.
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
}
template<typename lambda> __global__
__launch_bounds__(1024,1)
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
}
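// Design note: two instantiations exist because __launch_bounds__ is a
// compile-time promise. Blocks of at most 64 work-items fit one AMD
// wavefront, and the tighter bound lets the compiler budget registers for
// higher occupancy; larger blocks take the 1024-thread variant.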
#define accelerator_barrier(dummy)					\
  {									\
    auto tmp=hipStreamSynchronize(computeStream);			\
    auto err = hipGetLastError();					\
    if ( err != hipSuccess ) {						\
      printf("accelerator_barrier(): HIP error %s \n", hipGetErrorString( err )); \
      printf("Line %d\n",__LINE__);					\
    }									\
  }
inline void *acceleratorAllocHost(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipHostMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    fprintf(stderr," hipHostMalloc failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
}
inline void *acceleratorAllocShared(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMallocManaged((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    fprintf(stderr," hipMallocManaged failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
}
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = hipMalloc((void **)&ptr,bytes);
  if( err != hipSuccess ) {
    ptr = (void *) NULL;
    fprintf(stderr," hipMalloc failed for %zu %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
  }
  return ptr;
}
// Pinned allocations from hipHostMalloc must be released with hipHostFree.
inline void acceleratorFreeHost(void *ptr){ auto discard=hipHostFree(ptr);};
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}
  void* from_c = const_cast<void*>(from);
  auto discard=hipMemcpyDtoDAsync(to, from_c, bytes, copyStream);
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
}
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)

#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );

#define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
  accelerator_barrier(dummy);
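// Usage sketch (a, x, y are hypothetical device-visible data): a blocking
// SAXPY over num sites,
//
//   accelerator_for(ss, num, nsimd, {
//     y[ss] = a*x[ss] + y[ss];
//   });
//
// The NB forms omit the trailing accelerator_barrier(), allowing several
// kernels to queue on the compute stream before one synchronisation point.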
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... )	\
  accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
  accelerator_barrier(dummy);

#define GRID_ACCELERATED
#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )

inline void acceleratorMem(void)
{
  std::cout << " system acceleratorMem not implemented"<<std::endl;
}

#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });
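// On a host-only build the same source, e.g.
//   accelerator_for(ss, num, nsimd, { y[ss] = a*x[ss]; });
// expands to a host-threaded thread_for loop: nsimd is accepted but unused,
// and accelerator_barrier() becomes a no-op because thread_for is blocking.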
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocHost(size_t bytes){ return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
#ifdef HAVE_MM_MALLOC_H
////////////////////////////////////////////////////////////////////////
// Summary of the uniform accelerator API provided by this header
// (each backend above supplies one implementation):
//
// Initialisation / queries
//   void     acceleratorInit(void);
//   uint32_t acceleratorThreads(void);
//   void     acceleratorMem(void);
//   int      acceleratorIsCommunicable(void *ptr);
//   extern int acceleratorAbortOnGpuError;
//
// Allocation
//   void *acceleratorAllocHost(size_t bytes);    void acceleratorFreeHost(void *ptr);
//   void *acceleratorAllocShared(size_t bytes);  void acceleratorFreeShared(void *ptr);
//   void *acceleratorAllocDevice(size_t bytes);  void acceleratorFreeDevice(void *ptr);
//   void *acceleratorAllocCpu(size_t bytes);     void acceleratorFreeCpu(void *ptr);
//   void  acceleratorPin(void *ptr, unsigned long bytes);
//
// Data movement
//   void acceleratorCopyToDevice(void *from, void *to, size_t bytes);
//   void acceleratorCopyFromDevice(void *from, void *to, size_t bytes);
//   void acceleratorCopyDeviceToDevice(void *from, void *to, size_t bytes);
//   acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes);
//   acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes);
//   acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from, void *to, size_t bytes);
//   void acceleratorMemSet(void *base, int value, size_t bytes);
//   void acceleratorPut(T &dev, const T &host);
//   void thread_bcopy(const void *from, void *to, size_t bytes);
//
// Synchronisation
//   void acceleratorCopySynchronise(void);
//   int  acceleratorEventIsComplete(acceleratorEvent_t ev);
//   void acceleratorEventWait(acceleratorEvent_t ev);
//   void acceleratorFenceComputeStream(void);
//   accelerator_inline void acceleratorSynchronise(void);
//   accelerator_inline void acceleratorSynchroniseAll(void);
//   accelerator_inline void acceleratorFence(void);
//   accelerator_inline int  acceleratorSIMTlane(int Nsimd);
//
// Macros: accelerator, accelerator_inline, accelerator_barrier(dummy),
//         NAMESPACE_BEGIN(A)