Grid 0.7.0
Accelerator.h
Go to the documentation of this file.
1/*************************************************************************************
2
3 Grid physics library, www.github.com/paboyle/Grid
4
5 Source file: ./lib/Accelerator.h
6
7 Copyright (C) 2015
8
9Author: Peter Boyle <paboyle@ph.ed.ac.uk>
10Author: paboyle <paboyle@ph.ed.ac.uk>
11
12 This program is free software; you can redistribute it and/or modify
13 it under the terms of the GNU General Public License as published by
14 the Free Software Foundation; either version 2 of the License, or
15 (at your option) any later version.
16
17 This program is distributed in the hope that it will be useful,
18 but WITHOUT ANY WARRANTY; without even the implied warranty of
19 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20 GNU General Public License for more details.
21
22 You should have received a copy of the GNU General Public License along
23 with this program; if not, write to the Free Software Foundation, Inc.,
24 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
25
26 See the full license in the file "LICENSE" in the top level distribution directory
27*************************************************************************************/
28/* END LEGAL */
29#pragma once
30
31#include <string.h>
32
33#ifdef HAVE_MALLOC_MALLOC_H
34#include <malloc/malloc.h>
35#endif
36#ifdef HAVE_MALLOC_H
37#include <malloc.h>
38#endif
39#ifdef HAVE_MM_MALLOC_H
40#include <mm_malloc.h>
41#endif
42#ifdef __APPLE__
43// no memalign
// macOS has no memalign(); emulate it with posix_memalign so the requested
// alignment is actually honoured (the previous fallback silently ignored it
// and returned malloc'd, possibly under-aligned, storage).
inline void *memalign(size_t align, size_t bytes)
{
  void *ptr = NULL;
  // posix_memalign requires a power-of-two multiple of sizeof(void*);
  // degrade gracefully to plain malloc for degenerate alignment requests.
  if (align < sizeof(void *) || (align & (align - 1))) return malloc(bytes);
  if (posix_memalign(&ptr, align, bytes) != 0) return NULL;
  return ptr;
}
45#endif
46
48
50// Accelerator primitives; fall back to threading if not CUDA or SYCL
52//
53// Function attributes
54//
55// accelerator
56// accelerator_inline
57//
58// Parallel looping
59//
60// accelerator_for
61// accelerator_forNB
62// uint32_t accelerator_barrier(); // device synchronise
63//
64// Parallelism control: Number of threads in thread block is acceleratorThreads*Nsimd
65//
66// uint32_t acceleratorThreads(void);
67// void acceleratorThreads(uint32_t);
68//
69// Warp control and info:
70//
71// acceleratorInit;
72// void acceleratorSynchronise(void); // synch warp etc..
73// int acceleratorSIMTlane(int Nsimd);
74//
75// Memory management:
76//
77// int acceleratorIsCommunicable(void *pointer);
78// void *acceleratorAllocShared(size_t bytes);
79// void acceleratorFreeShared(void *ptr);
80//
81// void *acceleratorAllocDevice(size_t bytes);
82// void acceleratorFreeDevice(void *ptr);
83//
84// void *acceleratorCopyToDevice(void *from,void *to,size_t bytes);
85// void *acceleratorCopyFromDevice(void *from,void *to,size_t bytes);
86//
88
// Host-side launch-configuration control and one-time device initialisation
// (definitions live in Accelerator.cc).
uint32_t acceleratorThreads(void);   // query threads-per-block setting
void acceleratorThreads(uint32_t);   // set threads-per-block
void acceleratorInit(void);          // device discovery/selection at start-up
92
94// CUDA acceleration
96
97#ifdef GRID_CUDA
98
99#include <cuda.h>
100
101#ifdef __CUDA_ARCH__
102#define GRID_SIMT
103#endif
104
105#define accelerator __host__ __device__
106#define accelerator_inline __host__ __device__ inline
107
109extern cudaStream_t copyStream;
110extern cudaStream_t computeStream;
111
113#ifdef GRID_SIMT
114 return threadIdx.x;
115#else
116 return 0;
117#endif
118} // CUDA specific
119
// Print a one-line summary of device memory usage (used/free/total, in
// bytes) via cudaMemGetInfo.  Diagnostic only; the CUDA call's return
// status is not checked.
inline void acceleratorMem(void)
{
  size_t free_t,total_t,used_t;
  cudaMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
}
127
// Placeholder hook for CUDA memory diagnostics; intentionally empty here.
inline void cuda_mem(void)
{
}
132
// Non-blocking 2-d accelerator loop (CUDA): runs the body for
// iter1 in [0,num1), iter2 in [0,num2) with nsimd SIMT lanes per site.
// Thread block is (nsimd, acceleratorThreads(), 1); the kernel is launched
// on computeStream, so callers must pair with accelerator_barrier() before
// consuming results.  Empty ranges (num1*num2 == 0) launch nothing.
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
  { \
    if ( num1*num2 ) { \
      int nt=acceleratorThreads(); \
      typedef uint64_t Iterator; \
      auto lambda = [=] accelerator \
	(Iterator iter1,Iterator iter2,Iterator lane) mutable { \
	__VA_ARGS__; \
      }; \
      dim3 cu_threads(nsimd,acceleratorThreads(),1); \
      dim3 cu_blocks ((num1+nt-1)/nt,num2,1); \
      LambdaApply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,nsimd,lambda); \
    } \
  }
147
// Non-blocking 6-d accelerator loop (CUDA): iter1..iter3 map onto the CUDA
// grid, iter4..iter6 onto the thread block (so num4*num5*num6 must fit in a
// single block).  Launched on computeStream; pair with accelerator_barrier().
#define accelerator_for6dNB(iter1, num1, \
			    iter2, num2, \
			    iter3, num3, \
			    iter4, num4, \
			    iter5, num5, \
			    iter6, num6, ... ) \
  { \
    typedef uint64_t Iterator; \
    auto lambda = [=] accelerator \
      (Iterator iter1,Iterator iter2, \
       Iterator iter3,Iterator iter4, \
       Iterator iter5,Iterator iter6) mutable { \
      __VA_ARGS__; \
    }; \
    dim3 cu_blocks (num1,num2,num3); \
    dim3 cu_threads(num4,num5,num6); \
    Lambda6Apply<<<cu_blocks,cu_threads,0,computeStream>>>(num1,num2,num3,num4,num5,num6,lambda); \
  }
166
167
// CUDA kernel trampoline for accelerator_for2dNB: reconstructs
// (iter1,iter2,lane) from the block/thread coordinates.  The SIMD lane is
// threadIdx.x so consecutive lanes of a site are consecutive hardware
// threads (memory coalescing); tail blocks are guarded by the bounds check.
template<typename lambda> __global__
void LambdaApply(uint64_t num1, uint64_t num2, uint64_t num3, lambda Lambda)
{
  // Weird permute is to make lane coalesce for large blocks
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < num1) && (y<num2) && (z<num3) ) {
    Lambda(x,y,z);
  }
}
179
// CUDA kernel trampoline for accelerator_for6dNB: the first three iterators
// come from the grid, the last three from the thread block; all six are
// bounds-checked before invoking the user lambda.
template<typename lambda> __global__
void Lambda6Apply(uint64_t num1, uint64_t num2, uint64_t num3,
		  uint64_t num4, uint64_t num5, uint64_t num6,
		  lambda Lambda)
{
  uint64_t iter1 = blockIdx.x;
  uint64_t iter2 = blockIdx.y;
  uint64_t iter3 = blockIdx.z;
  uint64_t iter4 = threadIdx.x;
  uint64_t iter5 = threadIdx.y;
  uint64_t iter6 = threadIdx.z;

  if ( (iter1 < num1) && (iter2<num2) && (iter3<num3)
    && (iter4 < num4) && (iter5<num5) && (iter6<num6) )
  {
    Lambda(iter1,iter2,iter3,iter4,iter5,iter6);
  }
}
198
// Block until all work queued on computeStream has completed, then surface
// any pending CUDA error with file/line context.  Aborts (via assert) only
// when acceleratorAbortOnGpuError is set; otherwise just reports.
#define accelerator_barrier(dummy) \
  { \
    cudaStreamSynchronize(computeStream); \
    cudaError err = cudaGetLastError(); \
    if ( cudaSuccess != err ) { \
      printf("accelerator_barrier(): Cuda error %s \n", \
	     cudaGetErrorString( err )); \
      printf("File %s Line %d\n",__FILE__,__LINE__); \
      fflush(stdout); \
      if (acceleratorAbortOnGpuError) assert(err==cudaSuccess); \
    } \
  }
211
// Allocate page-locked (pinned) host memory via cudaMallocHost.
// Failure is fatal: the error is printed and assert(0) aborts.
inline void *acceleratorAllocHost(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMallocHost((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocHost failed for %zu %s \n",bytes,cudaGetErrorString(err));
    assert(0);
  }
  return ptr;
}
// Allocate unified (managed) memory, migratable between host and device,
// via cudaMallocManaged.  Failure is fatal: printed then assert(0).
inline void *acceleratorAllocShared(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMallocManaged((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMallocManaged failed for %zu %s \n",bytes,cudaGetErrorString(err));
    assert(0);
  }
  return ptr;
};
// Allocate device-resident memory via cudaMalloc.  Unlike the Host/Shared
// variants above this does NOT assert on failure: it prints and returns
// NULL, so the caller must check the result.
inline void *acceleratorAllocDevice(size_t bytes)
{
  void *ptr=NULL;
  auto err = cudaMalloc((void **)&ptr,bytes);
  if( err != cudaSuccess ) {
    ptr = (void *) NULL;
    printf(" cudaMalloc failed for %zu %s \n",bytes,cudaGetErrorString(err));
  }
  return ptr;
};
244
// Events are modelled as a plain int in the CUDA build (no true async
// event objects; the Asynch copy wrappers below return a dummy 0).
typedef int acceleratorEvent_t;

inline void acceleratorFreeShared(void *ptr){ cudaFree(ptr);};
inline void acceleratorFreeDevice(void *ptr){ cudaFree(ptr);};
// Pinned memory from cudaMallocHost must be released with cudaFreeHost;
// calling cudaFree on a host pointer returns cudaErrorInvalidValue and
// leaks the pinned allocation.
inline void acceleratorFreeHost(void *ptr){ cudaFreeHost(ptr);};
// Blocking host<->device copies and byte-wise memset on the default stream.
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes) { cudaMemcpy(to,from,bytes, cudaMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ cudaMemcpy(to,from,bytes, cudaMemcpyDeviceToHost);}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { cudaMemset(base,value,bytes);}
// NOTE(review): despite the names, the two host<->device "Asynch" wrappers
// below are blocking; the stream argument is accepted but unused.
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  acceleratorCopyToDevice(from,to,bytes);
  return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, cudaStream_t stream = copyStream) {
  acceleratorCopyFromDevice(from,to,bytes);
  return 0;
}
// Device-to-device copy queued on copyStream; genuinely asynchronous —
// pair with acceleratorCopySynchronise() before using the destination.
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
  cudaMemcpyAsync(to,from,bytes, cudaMemcpyDeviceToDevice,copyStream);
  return 0;
}
// Drain all copies queued on the dedicated copy stream.
inline void acceleratorCopySynchronise(void) { cudaStreamSynchronize(copyStream); };
268{
269 //auto discard=cudaStreamSynchronize(ev);
270}
272
273
// Whether a pointer may be handed directly to communications.  The UVM
// managed-memory query is commented out; everything is currently reported
// as communicable.
inline int acceleratorIsCommunicable(void *ptr)
{
  //  int uvm=0;
  //  auto
  //  cuerr = cuPointerGetAttribute( &uvm, CU_POINTER_ATTRIBUTE_IS_MANAGED, (CUdeviceptr) ptr);
  //  assert(cuerr == cudaSuccess );
  //  if(uvm) return 0;
  //  else return 1;
  return 1;
}
284
285#endif
286
288// SyCL acceleration
290
291#ifdef GRID_SYCL
292#define GRID_SYCL_LEVEL_ZERO_IPC
293
295
296// Force deterministic reductions
297#define SYCL_REDUCTION_DETERMINISTIC
298#include <sycl/sycl.hpp>
299#include <sycl/usm.hpp>
300#include <level_zero/ze_api.h>
301#include <sycl/ext/oneapi/backend/level_zero.hpp>
302
304
// Report device memory use.  Not implemented for SYCL: prints a notice only.
inline void acceleratorMem(void)
{
  std::cout <<" SYCL acceleratorMem not implemented"<<std::endl;
}
309
310extern sycl::queue *theGridAccelerator;
311extern sycl::queue *theCopyAccelerator;
312
313#ifdef __SYCL_DEVICE_ONLY__
314#define GRID_SIMT
315#endif
316
317#define accelerator
318#define accelerator_inline strong_inline
319
321#ifdef GRID_SIMT
322 return __spirv::initLocalInvocationId<3, sycl::id<3>>()[2];
323#else
324 return 0;
325#endif
326} // SYCL specific
327
// Non-blocking 2-d accelerator loop (SYCL): submits an nd_range kernel to
// theGridAccelerator.  The thread count nt is floored at 8 and num1 is
// rounded up to a multiple of nt, so the body is guarded by (iter1 < unum1).
// The required sub-group size is pinned to 16 lanes.
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
  theGridAccelerator->submit([&](sycl::handler &cgh) { \
      unsigned long nt=acceleratorThreads(); \
      if(nt < 8)nt=8; \
      unsigned long unum1 = num1; \
      unsigned long unum2 = num2; \
      unsigned long unum1_divisible_by_nt = ((unum1 + nt - 1) / nt) * nt; \
      sycl::range<3> local {nt,1,nsimd}; \
      sycl::range<3> global{unum1_divisible_by_nt,unum2,nsimd}; \
      cgh.parallel_for( \
      sycl::nd_range<3>(global,local), \
      [=] (sycl::nd_item<3> item) /*mutable*/ \
      [[sycl::reqd_sub_group_size(16)]] \
      { \
	auto iter1    = item.get_global_id(0); \
	auto iter2    = item.get_global_id(1); \
	auto lane     = item.get_global_id(2); \
	{ if (iter1 < unum1){ __VA_ARGS__ } }; \
      }); \
    });
348
// Wait for all work on the compute queue.
#define accelerator_barrier(dummy) { theGridAccelerator->wait(); }

// USM allocators: shared (migratable), host (pinned), device-resident;
// all freed through the queue-aware sycl free overload.
inline void *acceleratorAllocShared(size_t bytes){ return malloc_shared(bytes,*theGridAccelerator);};
inline void *acceleratorAllocHost(size_t bytes) { return malloc_host(bytes,*theGridAccelerator);};
inline void *acceleratorAllocDevice(size_t bytes){ return malloc_device(bytes,*theGridAccelerator);};
inline void acceleratorFreeHost(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeShared(void *ptr){free(ptr,*theGridAccelerator);};
inline void acceleratorFreeDevice(void *ptr){free(ptr,*theGridAccelerator);};

// Drain all copies queued on the dedicated copy queue.
inline void acceleratorCopySynchronise(void) {  theCopyAccelerator->wait(); }
359
360
362// Asynch event interface
364typedef sycl::event acceleratorEvent_t;
365
367{
368 ev.wait();
369}
370
372{
373 return (ev.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete);
374}
375
// Asynchronous copies return the sycl::event of the queued memcpy so the
// caller can wait on it; the blocking variants below queue on
// theCopyAccelerator and wait in place.
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes);}
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { return theCopyAccelerator->memcpy(to,from,bytes); }

inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ theCopyAccelerator->memcpy(to,from,bytes); theCopyAccelerator->wait();}
inline void acceleratorMemSet(void *base,int value,size_t bytes) { theCopyAccelerator->memset(base,value,bytes); theCopyAccelerator->wait();}
383
// Whether a pointer may be handed directly to communications.  The USM
// pointer-type query is compiled out; everything is currently reported as
// communicable.
inline int acceleratorIsCommunicable(void *ptr)
{
#if 0
  auto uvm = sycl::usm::get_pointer_type(ptr, theGridAccelerator->get_context());
  // was "uvm = sycl::usm::alloc::shared" — an assignment that would always
  // take the true branch if this code were ever re-enabled
  if ( uvm == sycl::usm::alloc::shared ) return 1;
  else return 0;
#endif
  return 1;

}
394
395
396#endif
397
399// HIP acceleration
401#ifdef GRID_HIP
403#include <hip/hip_runtime.h>
405
406#ifdef __HIP_DEVICE_COMPILE__
407#define GRID_SIMT
408#endif
409
410#define accelerator __host__ __device__
411#define accelerator_inline __host__ __device__ inline
412
// Print a one-line summary of device memory usage (used/free/total, in
// bytes) via hipMemGetInfo.  Diagnostic only.
inline void acceleratorMem(void)
{
  size_t free_t,total_t,used_t;
  auto discard = hipMemGetInfo(&free_t,&total_t);
  used_t=total_t-free_t;
  std::cout << " MemoryManager : GPU used "<<used_t<<" free "<<free_t<< " total "<<total_t<<std::endl;
}
420
421
422extern hipStream_t copyStream;
423extern hipStream_t computeStream;
424/*These routines define mapping from thread grid to loop & vector lane indexing */
426#ifdef GRID_SIMT
427 return hipThreadIdx_x;
428#else
429 return 0;
430#endif
431} // HIP specific
432
// Non-blocking 2-d accelerator loop (HIP).  Blocks of at most 64 threads
// dispatch to LambdaApply64 (compiled with __launch_bounds__(64,1));
// larger blocks use the 1024-thread LambdaApply.  Launched on
// computeStream; pair with accelerator_barrier().
#define accelerator_for2dNB( iter1, num1, iter2, num2, nsimd, ... ) \
  { \
    typedef uint64_t Iterator; \
    auto lambda = [=] accelerator \
      (Iterator iter1,Iterator iter2,Iterator lane ) mutable { \
      { __VA_ARGS__;} \
    }; \
    int nt=acceleratorThreads(); \
    dim3 hip_threads(nsimd, nt, 1); \
    dim3 hip_blocks ((num1+nt-1)/nt,num2,1); \
    if(hip_threads.x * hip_threads.y * hip_threads.z <= 64){ \
      hipLaunchKernelGGL(LambdaApply64,hip_blocks,hip_threads, \
			 0,computeStream, \
			 num1,num2,nsimd, lambda); \
    } else { \
      hipLaunchKernelGGL(LambdaApply,hip_blocks,hip_threads, \
			 0,computeStream, \
			 num1,num2,nsimd, lambda); \
    } \
  }
453
// HIP kernel trampoline for small thread blocks (<=64 threads);
// __launch_bounds__(64,1) lets the compiler budget registers for the small
// block.  Index mapping matches the CUDA path; tail blocks are guarded.
template<typename lambda> __global__
__launch_bounds__(64,1)
void LambdaApply64(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  // Following the same scheme as CUDA for now
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
}
466
// HIP kernel trampoline for blocks up to 1024 threads; same index
// permutation as the CUDA path (lane = threadIdx.x for coalescing).
template<typename lambda> __global__
__launch_bounds__(1024,1)
void LambdaApply(uint64_t numx, uint64_t numy, uint64_t numz, lambda Lambda)
{
  // Following the same scheme as CUDA for now
  uint64_t x = threadIdx.y + blockDim.y*blockIdx.x;
  uint64_t y = threadIdx.z + blockDim.z*blockIdx.y;
  uint64_t z = threadIdx.x;
  if ( (x < numx) && (y<numy) && (z<numz) ) {
    Lambda(x,y,z);
  }
}
479
// Block until all work queued on computeStream has completed, then surface
// any pending HIP error with file/line context and terminate.
// Fixes: the message now names the call actually made (hipStreamSynchronize,
// not hipDeviceSynchronize), and the error path exits with a failure status
// (previously exit(0), which reported success to the shell/MPI launcher).
#define accelerator_barrier(dummy)				\
  {								\
    auto tmp=hipStreamSynchronize(computeStream);		\
    auto err = hipGetLastError();				\
    if ( err != hipSuccess ) {					\
      printf("After hipStreamSynchronize() : HIP error %s \n", hipGetErrorString( err )); \
      puts(__FILE__);						\
      printf("Line %d\n",__LINE__);				\
      exit(1);							\
    }								\
  }
491
492inline void *acceleratorAllocHost(size_t bytes)
493{
494 void *ptr=NULL;
495 auto err = hipHostMalloc((void **)&ptr,bytes);
496 if( err != hipSuccess ) {
497 ptr = (void *) NULL;
498 fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
499 }
500 return ptr;
501};
502inline void *acceleratorAllocShared(size_t bytes)
503{
504 void *ptr=NULL;
505 auto err = hipMallocManaged((void **)&ptr,bytes);
506 if( err != hipSuccess ) {
507 ptr = (void *) NULL;
508 fprintf(stderr," hipMallocManaged failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
509 }
510 return ptr;
511};
inline int acceleratorIsCommunicable(void *ptr){ return 1; } // all HIP allocations treated as communicable
513
514inline void *acceleratorAllocDevice(size_t bytes)
515{
516 void *ptr=NULL;
517 auto err = hipMalloc((void **)&ptr,bytes);
518 if( err != hipSuccess ) {
519 ptr = (void *) NULL;
520 fprintf(stderr," hipMalloc failed for %ld %s \n",bytes,hipGetErrorString(err)); fflush(stderr);
521 }
522 return ptr;
523};
524
525inline void acceleratorFreeHost(void *ptr){ auto discard=hipFree(ptr);};
526inline void acceleratorFreeShared(void *ptr){ auto discard=hipFree(ptr);};
527inline void acceleratorFreeDevice(void *ptr){ auto discard=hipFree(ptr);};
// Blocking host<->device copies and byte-wise memset.
inline void acceleratorCopyToDevice(const void *from,void *to,size_t bytes)  { auto discard=hipMemcpy(to,from,bytes, hipMemcpyHostToDevice);}
inline void acceleratorCopyFromDevice(const void *from,void *to,size_t bytes){ auto discard=hipMemcpy(to,from,bytes, hipMemcpyDeviceToHost);}

inline void acceleratorMemSet(void *base,int value,size_t bytes) { auto discard=hipMemset(base,value,bytes);}

// Events are modelled as a plain int in the HIP build.
typedef int acceleratorEvent_t;

// Device-to-device copy queued on copyStream; genuinely asynchronous —
// pair with acceleratorCopySynchronise() before using the destination.
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) // Asynch
{
  //auto discard=hipMemcpyDtoDAsync(to, from, bytes, copyStream);
  // hipMemcpyDtoDAsync takes a non-const source; strip const for the call
  void* from_c = const_cast<void*>(from);
  auto discard=hipMemcpyDtoDAsync(to, from_c, bytes, copyStream);
  return 0;
}
// NOTE(review): despite the names, the two host<->device "Asynch" wrappers
// below are blocking; the stream argument is accepted but unused.
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  acceleratorCopyToDevice(from,to,bytes);
  return 0;
}
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes, hipStream_t stream = copyStream) {
  acceleratorCopyFromDevice(from,to,bytes);
  return 0;
}
// Drain all copies queued on the dedicated copy stream.
inline void acceleratorCopySynchronise(void) { auto discard=hipStreamSynchronize(copyStream); };
551
553{
554 // auto discard=hipStreamSynchronize(ev);
555}
557
558
559#endif
560
// Prepare a host buffer for fast device copies.  Implemented only for SYCL
// (Level Zero prepare_for_device_copy); a no-op on every other target.
inline void acceleratorPin(void *ptr,unsigned long bytes)
{
#ifdef GRID_SYCL
  sycl::ext::oneapi::experimental::prepare_for_device_copy(ptr,bytes,theCopyAccelerator->get_context());
#endif
}
567
569// Common on all GPU targets
#if defined(GRID_SYCL) || defined(GRID_CUDA) || defined(GRID_HIP)
// FIXME -- the non-blocking nature got broken March 30 2023 by PAB
// 1-d wrapper over the 2-d primitive: iter2 collapses to a single site.
#define accelerator_forNB( iter1, num1, nsimd, ... ) accelerator_for2dNB( iter1, num1, iter2, 1, nsimd, {__VA_ARGS__} );

// Blocking variants: launch, then barrier on the compute stream.
#define accelerator_for( iter, num, nsimd, ... )		\
  accelerator_forNB(iter, num, nsimd, { __VA_ARGS__ } );	\
  accelerator_barrier(dummy);

#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... )	\
  accelerator_for2dNB(iter1, num1, iter2, num2, nsimd, { __VA_ARGS__ } ); \
  accelerator_barrier(dummy);

#define GRID_ACCELERATED

#endif
586
588// CPU Target - No accelerator just thread instead
590
591#if ( (!defined(GRID_SYCL)) && (!defined(GRID_CUDA)) && (!defined(GRID_HIP)) )
592
593#undef GRID_SIMT
594
596
// Report host memory use.  Not implemented: prints a notice only; the
// commented rusage code sketches the intended implementation.
inline void acceleratorMem(void)
{
  /*
    struct rusage rusage;
    getrusage( RUSAGE_SELF, &rusage );
    return (size_t)rusage.ru_maxrss;
  */
  std::cout <<" system acceleratorMem not implemented"<<std::endl;
}
606
// CPU-only target: "accelerator" loops degrade to thread_for loops
// (Threads.h) and device copies degrade to threaded host memcpy.
#define accelerator
#define accelerator_inline strong_inline
#define accelerator_for(iterator,num,nsimd, ... )   thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_forNB(iterator,num,nsimd, ... ) thread_for(iterator, num, { __VA_ARGS__ });
#define accelerator_barrier(dummy)
#define accelerator_for2d(iter1, num1, iter2, num2, nsimd, ... ) thread_for2d(iter1,num1,iter2,num2,{ __VA_ARGS__ });

accelerator_inline int acceleratorSIMTlane(int Nsimd) { return 0; } // single lane on the CPU target

// Host<->"device" copies are plain threaded byte copies; the Asynch
// variants complete immediately and return a dummy event.
// NOTE(review): acceleratorEvent_t is not typedef'd in this CPU branch
// within this header chunk — presumably declared elsewhere; confirm.
inline void acceleratorCopyToDevice(void *from,void *to,size_t bytes)  { thread_bcopy(from,to,bytes); }
inline void acceleratorCopyFromDevice(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); }
inline acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from,void *to,size_t bytes)  { acceleratorCopyToDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from,void *to,size_t bytes) { acceleratorCopyFromDevice(from,to,bytes); return 0; }
inline acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from,void *to,size_t bytes) { thread_bcopy(from,to,bytes); return 0;}

inline void acceleratorCopySynchronise(void) {};

inline int  acceleratorIsCommunicable(void *ptr){ return 1; }
inline void acceleratorMemSet(void *base,int value,size_t bytes) { memset(base,value,bytes);}
628#ifdef HAVE_MM_MALLOC_H
629inline void *acceleratorAllocHost(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
630inline void *acceleratorAllocShared(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
631inline void *acceleratorAllocDevice(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
632inline void acceleratorFreeHost(void *ptr){_mm_free(ptr);};
633inline void acceleratorFreeShared(void *ptr){_mm_free(ptr);};
634inline void acceleratorFreeDevice(void *ptr){_mm_free(ptr);};
635#else
636inline void *acceleratorAllocShared(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
637inline void *acceleratorAllocDevice(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
638inline void acceleratorFreeShared(void *ptr){free(ptr);};
639inline void acceleratorFreeDevice(void *ptr){free(ptr);};
640#endif
641
642#endif // CPU target
643
// Plain aligned host allocation, available on every target.
#ifdef HAVE_MM_MALLOC_H
inline void *acceleratorAllocCpu(size_t bytes){return _mm_malloc(bytes,GRID_ALLOC_ALIGN);};
inline void acceleratorFreeCpu  (void *ptr){_mm_free(ptr);};
#else
inline void *acceleratorAllocCpu(size_t bytes){return memalign(GRID_ALLOC_ALIGN,bytes);};
inline void acceleratorFreeCpu  (void *ptr){free(ptr);};
#endif
651
653// Fencing needed ONLY for SYCL
655
#ifdef GRID_SYCL
// SYCL needs an explicit barrier so later submissions order after earlier
// compute work on the queue.
inline void acceleratorFenceComputeStream(void){ theGridAccelerator->ext_oneapi_submit_barrier(); };
#else
// Ordering within a stream guaranteed on Nvidia & AMD
inline void acceleratorFenceComputeStream(void){ };
#endif
662
664// Synchronise across local threads for divergence resynch
// Re-converge threads within a warp after divergent code.  Only CUDA device
// compilation emits an explicit __syncwarp(); a no-op everywhere else
// (host code, SYCL, HIP).
accelerator_inline void acceleratorSynchronise(void) // Only Nvidia needs
{
#ifdef GRID_SIMT
#ifdef GRID_CUDA
  __syncwarp();
#endif
#endif
  return;
}
676{
677#ifdef GRID_SIMT
678#ifdef GRID_CUDA
679 __syncthreads();
680#endif
681#ifdef GRID_SYCL
682 // No barrier call on SYCL?? // Option get __spir:: stuff to do warp barrier
683#endif
684#ifdef GRID_HIP
685 __syncthreads();
686#endif
687#endif
688 return;
689}
691{
692#ifdef GRID_SIMT
693#ifdef GRID_CUDA
694 __threadfence();
695#endif
696#ifdef GRID_SYCL
697 // FIXMEE
698#endif
699#ifdef GRID_HIP
700 __threadfence();
701#endif
702#endif
703 return;
704}
705
706inline void acceleratorCopyDeviceToDevice(void *from,void *to,size_t bytes)
707{
710}
711
// Blocking copy of a single host object into an existing device-resident
// object of the same type.
template<class T> void acceleratorPut(T& dev,const T&host)
{
  acceleratorCopyToDevice((void *)&host,&dev,sizeof(T));
}
// Blocking fetch of a single device-resident object, returned by value.
template<class T> T acceleratorGet(T& dev)
{
  T host;
  acceleratorCopyFromDevice(&dev,&host,sizeof(T));
  return host;
}
722
723
724
725
int acceleratorAbortOnGpuError
Definition Accelerator.cc:5
void acceleratorInit(void)
int acceleratorEvent_t
void acceleratorPut(T &dev, const T &host)
accelerator_inline int acceleratorSIMTlane(int Nsimd)
void acceleratorMem(void)
#define accelerator_inline
accelerator_inline void acceleratorSynchroniseAll(void)
void acceleratorCopySynchronise(void)
void * acceleratorAllocShared(size_t bytes)
int acceleratorIsCommunicable(void *ptr)
int acceleratorEventIsComplete(acceleratorEvent_t ev)
T acceleratorGet(T &dev)
void acceleratorFenceComputeStream(void)
uint32_t acceleratorThreads(void)
Definition Accelerator.cc:7
void * acceleratorAllocDevice(size_t bytes)
void acceleratorCopyToDevice(void *from, void *to, size_t bytes)
acceleratorEvent_t acceleratorCopyDeviceToDeviceAsynch(void *from, void *to, size_t bytes)
accelerator_inline void acceleratorSynchronise(void)
void acceleratorFreeShared(void *ptr)
acceleratorEvent_t acceleratorCopyFromDeviceAsynch(void *from, void *to, size_t bytes)
accelerator_inline void acceleratorFence(void)
void acceleratorFreeCpu(void *ptr)
void acceleratorPin(void *ptr, unsigned long bytes)
void acceleratorMemSet(void *base, int value, size_t bytes)
void * acceleratorAllocCpu(size_t bytes)
void acceleratorEventWait(acceleratorEvent_t ev)
void acceleratorCopyDeviceToDevice(void *from, void *to, size_t bytes)
void acceleratorCopyFromDevice(void *from, void *to, size_t bytes)
void acceleratorFreeDevice(void *ptr)
acceleratorEvent_t acceleratorCopyToDeviceAsynch(void *from, void *to, size_t bytes)
#define GRID_ALLOC_ALIGN
Definition Config.h:65
#define NAMESPACE_BEGIN(A)
Definition Namespace.h:35
#define NAMESPACE_END(A)
Definition Namespace.h:36
void thread_bcopy(const void *from, void *to, size_t bytes)
Definition Threads.h:87
uint64_t base