Grid 0.7.0
Accelerator.cc
Go to the documentation of this file.
1#include <Grid/GridCore.h>
2
4int world_rank; // Use to control world rank for print guarding
7uint32_t acceleratorThreads(void) {return accelerator_threads;};
8void acceleratorThreads(uint32_t t) {accelerator_threads = t;};
9
10#define ENV_LOCAL_RANK_PALS "PALS_LOCAL_RANKID"
11#define ENV_RANK_PALS "PALS_RANKID"
12#define ENV_LOCAL_RANK_OMPI "OMPI_COMM_WORLD_LOCAL_RANK"
13#define ENV_RANK_OMPI "OMPI_COMM_WORLD_RANK"
14#define ENV_LOCAL_RANK_SLURM "SLURM_LOCALID"
15#define ENV_RANK_SLURM "SLURM_PROCID"
16#define ENV_LOCAL_RANK_MVAPICH "MV2_COMM_WORLD_LOCAL_RANK"
17#define ENV_RANK_MVAPICH "MV2_COMM_WORLD_RANK"
18
19#ifdef GRID_CUDA
20cudaDeviceProp *gpu_props;
21cudaStream_t copyStream;
22cudaStream_t computeStream;
23void acceleratorInit(void)
24{
25 int nDevices = 1;
26 cudaGetDeviceCount(&nDevices);
27 gpu_props = new cudaDeviceProp[nDevices];
28
29 char * localRankStr = NULL;
30 int rank = 0;
31 world_rank=0;
32 if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
33 if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
34 if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
35 // We extract the local rank initialization using an environment variable
36 if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL) {
37 if (!world_rank)
38 printf("OPENMPI detected\n");
39 rank = atoi(localRankStr);
40 } else if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL) {
41 if (!world_rank)
42 printf("MVAPICH detected\n");
43 rank = atoi(localRankStr);
44 } else if ((localRankStr = getenv(ENV_LOCAL_RANK_SLURM)) != NULL) {
45 if (!world_rank)
46 printf("SLURM detected\n");
47 rank = atoi(localRankStr);
48 } else {
49 if (!world_rank)
50 printf("MPI version is unknown - bad things may happen\n");
51 }
52
53 size_t totalDeviceMem=0;
54 for (int i = 0; i < nDevices; i++) {
55
56#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorCudaInit[%d]: " #canMapHostMemory ": " FMT" \n",rank,prop.canMapHostMemory);
57#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
58 cudaGetDeviceProperties(&gpu_props[i], i);
59 cudaDeviceProp prop;
60 prop = gpu_props[i];
61 totalDeviceMem = prop.totalGlobalMem;
62 if ( world_rank == 0) {
63 if ( i==rank ) {
64 printf("AcceleratorCudaInit[%d]: ========================\n",rank);
65 printf("AcceleratorCudaInit[%d]: Device Number : %d\n", rank,i);
66 printf("AcceleratorCudaInit[%d]: ========================\n",rank);
67 printf("AcceleratorCudaInit[%d]: Device identifier: %s\n",rank, prop.name);
68
69
70 GPU_PROP_FMT(totalGlobalMem,"%zu");
71 GPU_PROP(managedMemory);
72 GPU_PROP(isMultiGpuBoard);
73 GPU_PROP(warpSize);
74 GPU_PROP(pciBusID);
75 GPU_PROP(pciDeviceID);
76 printf("AcceleratorCudaInit[%d]: maxGridSize (%d,%d,%d)\n",rank,prop.maxGridSize[0],prop.maxGridSize[1],prop.maxGridSize[2]);
77 }
78 // GPU_PROP(unifiedAddressing);
79 // GPU_PROP(l2CacheSize);
80 // GPU_PROP(singleToDoublePrecisionPerfRatio);
81 }
82 }
83
84 MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
85#undef GPU_PROP_FMT
86#undef GPU_PROP
87
88#ifdef GRID_DEFAULT_GPU
89 int device = 0;
90 // IBM Jsrun makes cuda Device numbering screwy and not match rank
91 if ( world_rank == 0 ) {
92 printf("AcceleratorCudaInit: using default device \n");
93 printf("AcceleratorCudaInit: assume user either uses\n");
94 printf("AcceleratorCudaInit: a) IBM jsrun, or \n");
95 printf("AcceleratorCudaInit: b) invokes through a wrapping script to set CUDA_VISIBLE_DEVICES, UCX_NET_DEVICES, and numa binding \n");
96 printf("AcceleratorCudaInit: Configure options --enable-setdevice=no \n");
97 }
98#else
99 int device = rank;
100 printf("AcceleratorCudaInit: rank %d setting device to node rank %d\n",world_rank,rank);
101 printf("AcceleratorCudaInit: Configure options --enable-setdevice=yes \n");
102#endif
103
104 cudaSetDevice(device);
105 cudaStreamCreate(&copyStream);
106 cudaStreamCreate(&computeStream);
107 const int len=64;
108 char busid[len];
109 if( rank == world_rank ) {
110 cudaDeviceGetPCIBusId(busid, len, device);
111 printf("local rank %d device %d bus id: %s\n", rank, device, busid);
112 }
113
114 if ( world_rank == 0 ) printf("AcceleratorCudaInit: ================================================\n");
115}
116#endif
117
118#ifdef GRID_HIP
119hipDeviceProp_t *gpu_props;
120hipStream_t copyStream;
121hipStream_t computeStream;
122void acceleratorInit(void)
123{
124 int nDevices = 1;
125 auto discard = hipGetDeviceCount(&nDevices);
126 gpu_props = new hipDeviceProp_t[nDevices];
127
128 char * localRankStr = NULL;
129 int rank = 0;
130 world_rank=0;
131 // We extract the local rank initialization using an environment variable
132 if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
133 {
134 rank = atoi(localRankStr);
135 }
136 if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
137 {
138 rank = atoi(localRankStr);
139 }
140 if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
141 if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
142 if ((localRankStr = getenv(ENV_RANK_SLURM )) != NULL) { world_rank = atoi(localRankStr);}
143
144 if ( world_rank == 0 )
145 printf("world_rank %d has %d devices\n",world_rank,nDevices);
146 size_t totalDeviceMem=0;
147 for (int i = 0; i < nDevices; i++) {
148
149#define GPU_PROP_FMT(canMapHostMemory,FMT) printf("AcceleratorHipInit: " #canMapHostMemory ": " FMT" \n",prop.canMapHostMemory);
150#define GPU_PROP(canMapHostMemory) GPU_PROP_FMT(canMapHostMemory,"%d");
151
152 discard = hipGetDeviceProperties(&gpu_props[i], i);
153 hipDeviceProp_t prop;
154 prop = gpu_props[i];
155 totalDeviceMem = prop.totalGlobalMem;
156 if ( world_rank == 0) {
157 printf("AcceleratorHipInit: ========================\n");
158 printf("AcceleratorHipInit: Device Number : %d\n", i);
159 printf("AcceleratorHipInit: ========================\n");
160 printf("AcceleratorHipInit: Device identifier: %s\n", prop.name);
161
162 GPU_PROP_FMT(totalGlobalMem,"%lu");
163 // GPU_PROP(managedMemory);
164 GPU_PROP(isMultiGpuBoard);
165 GPU_PROP(warpSize);
166 // GPU_PROP(unifiedAddressing);
167 // GPU_PROP(l2CacheSize);
168 // GPU_PROP(singleToDoublePrecisionPerfRatio);
169 }
170 }
171 MemoryManager::DeviceMaxBytes = (8*totalDeviceMem)/10; // Assume 80% ours
172#undef GPU_PROP_FMT
173#undef GPU_PROP
174
175#ifdef GRID_DEFAULT_GPU
176 if ( world_rank == 0 ) {
177 printf("AcceleratorHipInit: using default device \n");
178 printf("AcceleratorHipInit: assume user or srun sets ROCR_VISIBLE_DEVICES and numa binding \n");
179 printf("AcceleratorHipInit: Configure options --enable-setdevice=no \n");
180 }
181 int device = 0;
182#else
183 if ( world_rank == 0 ) {
184 printf("AcceleratorHipInit: rank %d setting device to node rank %d\n",world_rank,rank);
185 printf("AcceleratorHipInit: Configure options --enable-setdevice=yes \n");
186 }
187 int device = rank;
188#endif
189 discard = hipSetDevice(device);
190 discard = hipStreamCreate(&copyStream);
191 discard = hipStreamCreate(&computeStream);
192 const int len=64;
193 char busid[len];
194 if( rank == world_rank ) {
195 discard = hipDeviceGetPCIBusId(busid, len, device);
196 printf("local rank %d device %d bus id: %s\n", rank, device, busid);
197 }
198 if ( world_rank == 0 ) printf("AcceleratorHipInit: ================================================\n");
199}
200#endif
201
202
203#ifdef GRID_SYCL
204
205sycl::queue *theGridAccelerator;
206sycl::queue *theCopyAccelerator;
207void acceleratorInit(void)
208{
209 int nDevices = 1;
210 // sycl::gpu_selector selector;
211 // sycl::device selectedDevice { selector };
212 theGridAccelerator = new sycl::queue (sycl::gpu_selector_v);
213 theCopyAccelerator = new sycl::queue (sycl::gpu_selector_v);
214 // theCopyAccelerator = theGridAccelerator; // Should proceed concurrenlty anyway.
215
216#ifdef GRID_SYCL_LEVEL_ZERO_IPC
217 zeInit(0);
218#endif
219
220 char * localRankStr = NULL;
221 int rank = 0;
222 world_rank=0;
223
224 // We extract the local rank initialization using an environment variable
225 if ((localRankStr = getenv(ENV_LOCAL_RANK_OMPI)) != NULL)
226 {
227 rank = atoi(localRankStr);
228 }
229 if ((localRankStr = getenv(ENV_LOCAL_RANK_MVAPICH)) != NULL)
230 {
231 rank = atoi(localRankStr);
232 }
233 if ((localRankStr = getenv(ENV_LOCAL_RANK_PALS)) != NULL)
234 {
235 rank = atoi(localRankStr);
236 }
237 if ((localRankStr = getenv(ENV_RANK_OMPI )) != NULL) { world_rank = atoi(localRankStr);}
238 if ((localRankStr = getenv(ENV_RANK_MVAPICH)) != NULL) { world_rank = atoi(localRankStr);}
239 if ((localRankStr = getenv(ENV_RANK_PALS )) != NULL) { world_rank = atoi(localRankStr);}
240
241 char hostname[HOST_NAME_MAX+1];
242 gethostname(hostname, HOST_NAME_MAX+1);
243 if ( rank==0 ) printf("AcceleratorSyclInit world_rank %d is host %s \n",world_rank,hostname);
244
245 auto devices = sycl::device::get_devices();
246 for(int d = 0;d<devices.size();d++){
247
248#define GPU_PROP_STR(prop) \
249 printf("AcceleratorSyclInit: " #prop ": %s \n",devices[d].get_info<sycl::info::device::prop>().c_str());
250
251#define GPU_PROP_FMT(prop,FMT) \
252 printf("AcceleratorSyclInit: " #prop ": " FMT" \n",devices[d].get_info<sycl::info::device::prop>());
253
254#define GPU_PROP(prop) GPU_PROP_FMT(prop,"%ld");
255 if ( world_rank == 0) {
256
257 GPU_PROP_STR(vendor);
258 GPU_PROP_STR(version);
259 // GPU_PROP_STR(device_type);
260 /*
261 GPU_PROP(max_compute_units);
262 GPU_PROP(native_vector_width_char);
263 GPU_PROP(native_vector_width_short);
264 GPU_PROP(native_vector_width_int);
265 GPU_PROP(native_vector_width_long);
266 GPU_PROP(native_vector_width_float);
267 GPU_PROP(native_vector_width_double);
268 GPU_PROP(native_vector_width_half);
269 GPU_PROP(address_bits);
270 GPU_PROP(half_fp_config);
271 GPU_PROP(single_fp_config);
272 */
273 // GPU_PROP(double_fp_config);
274 GPU_PROP(global_mem_size);
275 }
276
277 }
278 if ( world_rank == 0 ) {
279 auto name = theGridAccelerator->get_device().get_info<sycl::info::device::name>();
280 printf("AcceleratorSyclInit: Selected device is %s\n",name.c_str());
281 printf("AcceleratorSyclInit: ================================================\n");
282 }
283}
284#endif
285
286#if (!defined(GRID_CUDA)) && (!defined(GRID_SYCL))&& (!defined(GRID_HIP))
287void acceleratorInit(void){}
288#endif
289
int world_rank
Definition Accelerator.cc:4
void acceleratorInit(void)
#define ENV_LOCAL_RANK_MVAPICH
uint32_t accelerator_threads
Definition Accelerator.cc:6
#define ENV_RANK_PALS
#define ENV_RANK_OMPI
uint32_t acceleratorThreads(void)
Definition Accelerator.cc:7
#define ENV_RANK_SLURM
#define ENV_LOCAL_RANK_PALS
#define ENV_RANK_MVAPICH
#define ENV_LOCAL_RANK_OMPI
int acceleratorAbortOnGpuError
Definition Accelerator.cc:5
#define ENV_LOCAL_RANK_SLURM
char hostname[HOST_NAME_MAX+1]
Definition Init.cc:101
#define HOST_NAME_MAX
Definition Init.cc:85
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
static uint64_t DeviceMaxBytes