// Environment-variable names, one local-rank / rank pair per launcher family
// (PALS, Open MPI, SLURM, MVAPICH2). Each macro expands to a string literal.
// NOTE(review): presumably these are handed to getenv() by the accelerator
// init routines to discover the process rank before MPI is initialised; the
// lookups themselves are not visible in this excerpt -- confirm.
10#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID"
11#define ENV_RANK_PALS "PALS_RANKID"
12#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
13#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
14#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
15#define ENV_RANK_SLURM "SLURM_PROCID"
16#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
17#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
// CUDA back-end globals.
// gpu_props: heap array allocated with new cudaDeviceProp[nDevices] and filled
//            per device by cudaGetDeviceProperties() in the init code below.
// copyStream / computeStream: stream handles created with cudaStreamCreate()
//            after cudaSetDevice(device) in the init code below.
20cudaDeviceProp *gpu_props;
21cudaStream_t copyStream;
22cudaStream_t computeStream;
26 cudaGetDeviceCount(&nDevices);
27 gpu_props =
new cudaDeviceProp[nDevices];
29 char * localRankStr = NULL;
38 printf(
"OPENMPI detected\n");
39 rank = atoi(localRankStr);
42 printf(
"MVAPICH detected\n");
43 rank = atoi(localRankStr);
46 printf(
"SLURM detected\n");
47 rank = atoi(localRankStr);
50 printf(
"MPI version is unknown - bad things may happen\n");
53 size_t totalDeviceMem=0;
54 for (
int i = 0; i < nDevices; i++) {
56#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit[%d]: " #canMapHostMemory ": " FMT" \n",rank,prop.canMapHostMemory);
57#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
58 cudaGetDeviceProperties(&gpu_props[i], i);
61 totalDeviceMem = prop.totalGlobalMem;
64 printf(
"AcceleratorCudaInit[%d]: ========================\n",rank);
65 printf(
"AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i);
66 printf(
"AcceleratorCudaInit[%d]: ========================\n",rank);
67 printf(
"AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
70 GPU_PROP_FMT(totalGlobalMem,
"%zu");
71 GPU_PROP(managedMemory);
72 GPU_PROP(isMultiGpuBoard);
75 GPU_PROP(pciDeviceID);
76 printf(
"AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]);
88#ifdef GRID_DEFAULT_GPU
92 printf(
"AcceleratorCudaInit: using default device \n");
93 printf(
"AcceleratorCudaInit: assume user either uses\n");
94 printf(
"AcceleratorCudaInit: a) IBM jsrun, or \n");
95 printf(
"AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
96 printf(
"AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
100 printf(
"AcceleratorCudaInit: rank %d setting device to node rank %d\n",
world_rank,rank);
101 printf(
"AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
104 cudaSetDevice(device);
105 cudaStreamCreate(&copyStream);
106 cudaStreamCreate(&computeStream);
110 cudaDeviceGetPCIBusId(busid, len, device);
111 printf(
"local rank %d device %d bus id: %s\n", rank, device, busid);
114 if (
world_rank == 0 ) printf(
"AcceleratorCudaInit: ================================================\n");
// HIP back-end globals.
// gpu_props: heap array allocated with new hipDeviceProp_t[nDevices] and filled
//            per device by hipGetDeviceProperties() in the init code below.
// copyStream / computeStream: stream handles created with hipStreamCreate()
//            after hipSetDevice(device) in the init code below.
119hipDeviceProp_t *gpu_props;
120hipStream_t copyStream;
121hipStream_t computeStream;
125 auto discard = hipGetDeviceCount(&nDevices);
126 gpu_props =
new hipDeviceProp_t[nDevices];
128 char * localRankStr = NULL;
134 rank = atoi(localRankStr);
138 rank = atoi(localRankStr);
145 printf(
"world_rank %d has %d devices\n",
world_rank,nDevices);
146 size_t totalDeviceMem=0;
147 for (
int i = 0; i < nDevices; i++) {
149#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
150#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
152 discard = hipGetDeviceProperties(&gpu_props[i], i);
153 hipDeviceProp_t prop;
155 totalDeviceMem = prop.totalGlobalMem;
157 printf(
"AcceleratorHipInit: ========================\n");
158 printf(
"AcceleratorHipInit: Device Number : %d\n", i);
159 printf(
"AcceleratorHipInit: ========================\n");
160 printf(
"AcceleratorHipInit: Device identifier: %s\n", prop.name);
162 GPU_PROP_FMT(totalGlobalMem,
"%lu");
164 GPU_PROP(isMultiGpuBoard);
175#ifdef GRID_DEFAULT_GPU
177 printf(
"AcceleratorHipInit: using default device \n");
178 printf(
"AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
179 printf(
"AcceleratorHipInit: Configure options --enable-setdevice=no \n");
184 printf(
"AcceleratorHipInit: rank %d setting device to node rank %d\n",
world_rank,rank);
185 printf(
"AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
189 discard = hipSetDevice(device);
190 discard = hipStreamCreate(&copyStream);
191 discard = hipStreamCreate(&computeStream);
195 discard = hipDeviceGetPCIBusId(busid, len, device);
196 printf(
"local rank %d device %d bus id: %s\n", rank, device, busid);
198 if (
world_rank == 0 ) printf(
"AcceleratorHipInit: ================================================\n");
// SYCL back-end globals: two queue pointers, each assigned a separately
// constructed `new sycl::queue(sycl::gpu_selector_v)` in the init code below.
// theGridAccelerator is also the queue whose device name is reported as the
// "Selected device" during initialisation.
// NOTE(review): the name suggests theCopyAccelerator is dedicated to data
// movement -- its uses are not visible in this excerpt; confirm.
205sycl::queue *theGridAccelerator;
206sycl::queue *theCopyAccelerator;
212 theGridAccelerator =
new sycl::queue (sycl::gpu_selector_v);
213 theCopyAccelerator =
new sycl::queue (sycl::gpu_selector_v);
216#ifdef GRID_SYCL_LEVEL_ZERO_IPC
220 char * localRankStr = NULL;
227 rank = atoi(localRankStr);
231 rank = atoi(localRankStr);
235 rank = atoi(localRankStr);
243 if ( rank==0 ) printf(
"AcceleratorSyclInit world_rank %d is host %s \n",
world_rank,
hostname);
245 auto devices = sycl::device::get_devices();
246 for(
int d = 0;d<devices.size();d++){
248#define GPU_PROP_STR(prop) \
249 printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
251#define GPU_PROP_FMT(prop,FMT) \
252 printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
254#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
257 GPU_PROP_STR(vendor);
258 GPU_PROP_STR(version);
274 GPU_PROP(global_mem_size);
279 auto name = theGridAccelerator->get_device().get_info<sycl::info::device::name>();
280 printf(
"AcceleratorSyclInit: Selected device is %s\n",name.c_str());
281 printf(
"AcceleratorSyclInit: ================================================\n");
286#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
void acceleratorInit(void)
#define ENV_LOCAL_RANK_MVAPICH
uint32_t accelerator_threads
uint32_t acceleratorThreads(void)
#define ENV_LOCAL_RANK_PALS
#define ENV_LOCAL_RANK_OMPI
int acceleratorAbortOnGpuError
#define ENV_LOCAL_RANK_SLURM
char hostname[HOST_NAME_MAX+1]
#define NAMESPACE_BEGIN(A)
static uint64_t DeviceMaxBytes