Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
CudaAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaAcceleratorRuntime.cc (C) 2000-2026 */
9/* */
10/* Runtime for 'Cuda'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/accelerator_native/CudaAccelerator.h"
15
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
18
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
21
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
35
36#include "arccore/accelerator_native/runtime/Cupti.h"
37
38#include <sstream>
39#include <unordered_map>
40#include <mutex>
41#include <algorithm>
42
43#include <cuda.h>
44
45// For std::memset
46#include <cstring>
47
48#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
49#include <nvtx3/nvToolsExt.h>
50#endif
51
52namespace Arcane::Accelerator::Cuda
53{
54using Impl::KernelLaunchArgs;
55
56namespace
57{
58 Int32 global_cupti_flush = 0;
59 CuptiInfo global_cupti_info;
60} // namespace
61
62/*---------------------------------------------------------------------------*/
63/*---------------------------------------------------------------------------*/
64
65// Starting from CUDA 13, there is a new cudaMemLocation type
66// for methods such as cudeMemAdvise or cudaMemPrefetch
67#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
68inline cudaMemLocation
69_getMemoryLocation(int device_id)
70{
71 cudaMemLocation mem_location;
72 mem_location.type = cudaMemLocationTypeDevice;
73 mem_location.id = device_id;
74 if (device_id == cudaCpuDeviceId)
75 mem_location.type = cudaMemLocationTypeHost;
76 else {
77 mem_location.type = cudaMemLocationTypeDevice;
78 mem_location.id = device_id;
79 }
80 return mem_location;
81}
82#else
83inline int
84_getMemoryLocation(int device_id)
85{
86 return device_id;
87}
88#endif
89
90/*---------------------------------------------------------------------------*/
91/*---------------------------------------------------------------------------*/
92
94{
95 public:
96
97 virtual ~ConcreteAllocator() = default;
98
99 public:
100
101 virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
102 virtual cudaError_t _deallocate(void* ptr) = 0;
103};
104
105/*---------------------------------------------------------------------------*/
106/*---------------------------------------------------------------------------*/
107
108template <typename ConcreteAllocatorType>
109class UnderlyingAllocator
111{
112 public:
113
114 UnderlyingAllocator() = default;
115
116 public:
117
118 void* allocateMemory(Int64 size) final
119 {
120 void* out = nullptr;
121 ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
122 return out;
123 }
124 void freeMemory(void* ptr, [[maybe_unused]] Int64 size) final
125 {
126 ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
127 }
128
129 void doMemoryCopy(void* destination, const void* source, Int64 size) final
130 {
131 ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
132 }
133
134 eMemoryResource memoryResource() const final
135 {
136 return m_concrete_allocator.memoryResource();
137 }
138
139 public:
140
141 ConcreteAllocatorType m_concrete_allocator;
142};
143
144/*---------------------------------------------------------------------------*/
145/*---------------------------------------------------------------------------*/
146
147class UnifiedMemoryConcreteAllocator
148: public ConcreteAllocator
149{
150 public:
151
152 UnifiedMemoryConcreteAllocator()
153 {
154 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
155 m_use_ats = v.value();
156 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MEMORY_HINT_ON_DEVICE", true))
157 m_use_hint_as_mainly_device = (v.value() != 0);
158 }
159
160 cudaError_t _deallocate(void* ptr) final
161 {
162 if (m_use_ats) {
163 ::free(ptr);
164 return cudaSuccess;
165 }
166 //std::cout << "CUDA_MANAGED_FREE ptr=" << ptr << "\n";
167 return ::cudaFree(ptr);
168 }
169
170 cudaError_t _allocate(void** ptr, size_t new_size) final
171 {
172 if (m_use_ats) {
173 *ptr = ::aligned_alloc(128, new_size);
174 }
175 else {
176 auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
177 //std::cout << "CUDA_MANAGED_MALLOC ptr=" << (*ptr) << " size=" << new_size << "\n";
178 //if (new_size < 4000)
179 //std::cout << "STACK=" << platform::getStackTrace() << "\n";
180
181 if (r != cudaSuccess)
182 return r;
183
184 // If requested, indicates that we prefer to allocate on the GPU.
185 // NOTE: In this case, we retrieve the current device to position the
186 // preferred location. If we use MemoryPool, this allocation will only
187 // be performed once. If the default device for a thread changes during
188 // computation, there will be an inconsistency. To avoid this, we could
189 // call cudaMemAdvise() for each allocation (via _applyHint()) but these
190 // operations are quite costly and if there are many allocations, a
191 // performance loss may result.
193 int device_id = 0;
194 void* p = *ptr;
195 cudaGetDevice(&device_id);
196 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
197 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
198 }
199 }
200
201 return cudaSuccess;
202 }
203
204 constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }
205
206 public:
207
208 bool m_use_ats = false;
212};
213
214/*---------------------------------------------------------------------------*/
215/*---------------------------------------------------------------------------*/
216
224class UnifiedMemoryCudaMemoryAllocator
225: public AcceleratorMemoryAllocatorBase
226{
227 public:
228 public:
229
230 UnifiedMemoryCudaMemoryAllocator()
231 : AcceleratorMemoryAllocatorBase("UnifiedMemoryCudaMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
232 {
233 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MALLOC_TRACE", true))
234 _setTraceLevel(v.value());
235 }
236
237 void initialize()
238 {
239 _doInitializeUVM(true);
240 }
241
242 public:
243
244 void notifyMemoryArgsChanged([[maybe_unused]] MemoryAllocationArgs old_args,
245 MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
246 {
247 void* p = ptr.baseAddress();
248 Int64 s = ptr.capacity();
249 if (p && s > 0)
250 _applyHint(ptr.baseAddress(), ptr.size(), new_args);
251 }
252
253 protected:
254
255 void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)
256 {
257 eMemoryLocationHint hint = args.memoryLocationHint();
258 // Uses the active device to position the GPU by default
259 // We only do this if the hint requires it to avoid calling
260 // cudaGetDevice() every time.
261 int device_id = 0;
263 cudaGetDevice(&device_id);
264 }
265 auto device_memory_location = _getMemoryLocation(device_id);
266 auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);
267
268 //std::cout << "SET_MEMORY_HINT name=" << args.arrayName() << " size=" << new_size << " hint=" << (int)hint << "\n";
270 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
271 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
272 }
274 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
275 //ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, 0));
276 }
278 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
279 }
280 }
281 void _removeHint(void* p, size_t size, MemoryAllocationArgs args)
282 {
283 eMemoryLocationHint hint = args.memoryLocationHint();
284 if (hint == eMemoryLocationHint::None)
285 return;
286 int device_id = 0;
287 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
288 }
289
290 private:
291
292 bool m_use_ats = false;
293};
294
295/*---------------------------------------------------------------------------*/
296/*---------------------------------------------------------------------------*/
297
299: public ConcreteAllocator
300{
301 public:
302
303 cudaError_t _allocate(void** ptr, size_t new_size) final
304 {
305 return ::cudaMallocHost(ptr, new_size);
306 }
307 cudaError_t _deallocate(void* ptr) final
308 {
309 return ::cudaFreeHost(ptr);
310 }
311 constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
312};
313
314/*---------------------------------------------------------------------------*/
315/*---------------------------------------------------------------------------*/
316
317class HostPinnedCudaMemoryAllocator
318: public AcceleratorMemoryAllocatorBase
319{
320 public:
321 public:
322
323 HostPinnedCudaMemoryAllocator()
324 : AcceleratorMemoryAllocatorBase("HostPinnedCudaMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
325 {
326 }
327
328 public:
329
330 void initialize()
331 {
333 }
334};
335
336/*---------------------------------------------------------------------------*/
337/*---------------------------------------------------------------------------*/
338
339class DeviceConcreteAllocator
340: public ConcreteAllocator
341{
342 public:
343
344 DeviceConcreteAllocator()
345 {
346 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
347 m_use_ats = v.value();
348 }
349
350 cudaError_t _allocate(void** ptr, size_t new_size) final
351 {
352 if (m_use_ats) {
353 // FIXME: it does not work on WIN32
354 *ptr = std::aligned_alloc(128, new_size);
355 if (*ptr)
356 return cudaSuccess;
357 return cudaErrorMemoryAllocation;
358 }
359 cudaError_t r = ::cudaMalloc(ptr, new_size);
360 //std::cout << "ALLOCATE_DEVICE ptr=" << (*ptr) << " size=" << new_size << " r=" << (int)r << "\n";
361 return r;
362 }
363 cudaError_t _deallocate(void* ptr) final
364 {
365 if (m_use_ats) {
366 std::free(ptr);
367 return cudaSuccess;
368 }
369 //std::cout << "FREE_DEVICE ptr=" << ptr << "\n";
370 return ::cudaFree(ptr);
371 }
372
373 constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }
374
375 private:
376
377 bool m_use_ats = false;
378};
379
380/*---------------------------------------------------------------------------*/
381/*---------------------------------------------------------------------------*/
382
383class DeviceCudaMemoryAllocator
384: public AcceleratorMemoryAllocatorBase
385{
386
387 public:
388
389 DeviceCudaMemoryAllocator()
390 : AcceleratorMemoryAllocatorBase("DeviceCudaMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
391 {
392 }
393
394 public:
395
396 void initialize()
397 {
399 }
400};
401
402/*---------------------------------------------------------------------------*/
403/*---------------------------------------------------------------------------*/
404
405namespace
406{
407 UnifiedMemoryCudaMemoryAllocator unified_memory_cuda_memory_allocator;
408 HostPinnedCudaMemoryAllocator host_pinned_cuda_memory_allocator;
409 DeviceCudaMemoryAllocator device_cuda_memory_allocator;
410} // namespace
411
412/*---------------------------------------------------------------------------*/
413/*---------------------------------------------------------------------------*/
414
415void initializeCudaMemoryAllocators()
416{
417 unified_memory_cuda_memory_allocator.initialize();
418 device_cuda_memory_allocator.initialize();
419 host_pinned_cuda_memory_allocator.initialize();
420}
421
422void finalizeCudaMemoryAllocators(ITraceMng* tm)
423{
424 unified_memory_cuda_memory_allocator.finalize(tm);
425 device_cuda_memory_allocator.finalize(tm);
426 host_pinned_cuda_memory_allocator.finalize(tm);
427}
428
429/*---------------------------------------------------------------------------*/
430/*---------------------------------------------------------------------------*/
431
432void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
433{
434 if (e == CUDA_SUCCESS)
435 return;
436 const char* error_name = nullptr;
437 CUresult e2 = cuGetErrorName(e, &error_name);
438 if (e2 != CUDA_SUCCESS)
439 error_name = "Unknown";
440
441 const char* error_message = nullptr;
442 CUresult e3 = cuGetErrorString(e, &error_message);
443 if (e3 != CUDA_SUCCESS)
444 error_message = "Unknown";
445
446 ARCCORE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
447 ti, e, error_name, error_message);
448}
449
450/*---------------------------------------------------------------------------*/
451/*---------------------------------------------------------------------------*/
452
462{
463 public:
464
465 Int32 getNbThreadPerBlock(const void* kernel_ptr)
466 {
467 std::scoped_lock lock(m_mutex);
468 auto x = m_nb_thread_per_block_map.find(kernel_ptr);
469 if (x != m_nb_thread_per_block_map.end())
470 return x->second;
471 int min_grid_size = 0;
472 int computed_block_size = 0;
473 int wanted_shared_memory = 0;
474 cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
475 if (r != cudaSuccess)
476 computed_block_size = 0;
477 int num_block_0 = 0;
478 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
479 int num_block_1 = 0;
480 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
481
482 cudaFuncAttributes func_attr;
483 cudaFuncGetAttributes(&func_attr, kernel_ptr);
484 m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
485 std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
486 << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs;
487
488#if CUDART_VERSION >= 12040
489 // cudaFuncGetName is only available in 12.4
490 const char* func_name = nullptr;
491 cudaFuncGetName(&func_name, kernel_ptr);
492 std::cout << " name=" << func_name << "\n";
493#endif
494
495 return computed_block_size;
496 }
497
498 private:
499
500 std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
501 std::mutex m_mutex;
502};
503
504/*---------------------------------------------------------------------------*/
505/*---------------------------------------------------------------------------*/
506
507class CudaRunQueueStream
509{
510 public:
511
512 CudaRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
513 : m_runtime(runtime)
514 {
515 if (bi.isDefault())
516 ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
517 else {
518 int priority = bi.priority();
519 ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
520 }
521 }
522 ~CudaRunQueueStream() override
523 {
524 ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
525 }
526
527 public:
528
529 void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override
530 {
531#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
532 auto kname = c.kernelName();
533 if (kname.empty())
534 nvtxRangePush(c.traceInfo().name());
535 else
536 nvtxRangePush(kname.localstr());
537#endif
538 return m_runtime->notifyBeginLaunchKernel();
539 }
541 {
542#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
543 nvtxRangePop();
544#endif
545 return m_runtime->notifyEndLaunchKernel();
546 }
547 void barrier() override
548 {
549 ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
550 if (global_cupti_flush > 0)
551 global_cupti_info.flush();
552 }
553 bool _barrierNoException() override
554 {
555 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
556 }
557 void copyMemory(const MemoryCopyArgs& args) override
558 {
559 auto source_bytes = args.source().bytes();
560 auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
561 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
562 ARCCORE_CHECK_CUDA(r);
563 if (!args.isAsync())
564 barrier();
565 }
566 void prefetchMemory(const MemoryPrefetchArgs& args) override
567 {
568 auto src = args.source().bytes();
569 if (src.size() == 0)
570 return;
571 DeviceId d = args.deviceId();
572 int device = cudaCpuDeviceId;
573 if (!d.isHost())
574 device = d.asInt32();
575 //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
576 // << " size=" << args.source().size() << " data=" << src.data() << "\n";
577 auto mem_location = _getMemoryLocation(device);
578#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
579 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
580#else
581 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
582#endif
583 ARCCORE_CHECK_CUDA(r);
584 if (!args.isAsync())
585 barrier();
586 }
588 {
589 return Impl::NativeStream(&m_cuda_stream);
590 }
591
592 public:
593
594 cudaStream_t trueStream() const
595 {
596 return m_cuda_stream;
597 }
598
599 private:
600
601 Impl::IRunnerRuntime* m_runtime = nullptr;
602 cudaStream_t m_cuda_stream = nullptr;
603};
604
605/*---------------------------------------------------------------------------*/
606/*---------------------------------------------------------------------------*/
607
608class CudaRunQueueEvent
610{
611 public:
612
613 explicit CudaRunQueueEvent(bool has_timer)
614 {
615 if (has_timer)
616 ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
617 else
618 ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
619 }
620 ~CudaRunQueueEvent() override
621 {
622 ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
623 }
624
625 public:
626
627 // Register the event within a RunQueue
628 void recordQueue(Impl::IRunQueueStream* stream) final
629 {
630 auto* rq = static_cast<CudaRunQueueStream*>(stream);
631 ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
632 }
633
634 void wait() final
635 {
636 ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
637 }
638
639 void waitForEvent(Impl::IRunQueueStream* stream) final
640 {
641 auto* rq = static_cast<CudaRunQueueStream*>(stream);
642 ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
643 }
644
645 Int64 elapsedTime(IRunQueueEventImpl* start_event) final
646 {
647 // NOTE: Events must have been created with the timer active
648 ARCCORE_CHECK_POINTER(start_event);
649 auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
650 float time_in_ms = 0.0;
651
652 // TODO: check if necessary
653 // ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
654
655 ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
656 double x = time_in_ms * 1.0e6;
657 Int64 nano_time = static_cast<Int64>(x);
658 return nano_time;
659 }
660
661 bool hasPendingWork() final
662 {
663 cudaError_t v = cudaEventQuery(m_cuda_event);
664 if (v == cudaErrorNotReady)
665 return true;
666 ARCCORE_CHECK_CUDA(v);
667 return false;
668 }
669
670 private:
671
672 cudaEvent_t m_cuda_event;
673};
674
675/*---------------------------------------------------------------------------*/
676/*---------------------------------------------------------------------------*/
677
680{
681 public:
682
683 ~CudaRunnerRuntime() override = default;
684
685 public:
686
687 void notifyBeginLaunchKernel() override
688 {
689 ++m_nb_kernel_launched;
690 if (m_is_verbose)
691 std::cout << "BEGIN CUDA KERNEL!\n";
692 }
693 void notifyEndLaunchKernel() override
694 {
695 ARCCORE_CHECK_CUDA(cudaGetLastError());
696 if (m_is_verbose)
697 std::cout << "END CUDA KERNEL!\n";
698 }
699 void barrier() override
700 {
701 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
702 }
703 eExecutionPolicy executionPolicy() const override
704 {
706 }
707 Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
708 {
709 return new CudaRunQueueStream(this, bi);
710 }
711 Impl::IRunQueueEventImpl* createEventImpl() override
712 {
713 return new CudaRunQueueEvent(false);
714 }
715 Impl::IRunQueueEventImpl* createEventImplWithTimer() override
716 {
717 return new CudaRunQueueEvent(true);
718 }
719 void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
720 {
721 auto v = buffer.bytes();
722 const void* ptr = v.data();
723 size_t count = v.size();
724 int device = device_id.asInt32();
725 cudaMemoryAdvise cuda_advise;
726
727 if (advice == eMemoryAdvice::MostlyRead)
728 cuda_advise = cudaMemAdviseSetReadMostly;
730 cuda_advise = cudaMemAdviseSetPreferredLocation;
731 else if (advice == eMemoryAdvice::AccessedByDevice)
732 cuda_advise = cudaMemAdviseSetAccessedBy;
733 else if (advice == eMemoryAdvice::PreferredLocationHost) {
734 cuda_advise = cudaMemAdviseSetPreferredLocation;
735 device = cudaCpuDeviceId;
736 }
737 else if (advice == eMemoryAdvice::AccessedByHost) {
738 cuda_advise = cudaMemAdviseSetAccessedBy;
739 device = cudaCpuDeviceId;
740 }
741 else
742 return;
743 //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
744 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
745 }
746 void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
747 {
748 auto v = buffer.bytes();
749 const void* ptr = v.data();
750 size_t count = v.size();
751 int device = device_id.asInt32();
752 cudaMemoryAdvise cuda_advise;
753
754 if (advice == eMemoryAdvice::MostlyRead)
755 cuda_advise = cudaMemAdviseUnsetReadMostly;
757 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
758 else if (advice == eMemoryAdvice::AccessedByDevice)
759 cuda_advise = cudaMemAdviseUnsetAccessedBy;
760 else if (advice == eMemoryAdvice::PreferredLocationHost) {
761 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
762 device = cudaCpuDeviceId;
763 }
764 else if (advice == eMemoryAdvice::AccessedByHost) {
765 cuda_advise = cudaMemAdviseUnsetAccessedBy;
766 device = cudaCpuDeviceId;
767 }
768 else
769 return;
770 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
771 }
772
773 void setCurrentDevice(DeviceId device_id) final
774 {
775 Int32 id = device_id.asInt32();
776 if (!device_id.isAccelerator())
777 ARCCORE_FATAL("Device {0} is not an accelerator device", id);
778 ARCCORE_CHECK_CUDA(cudaSetDevice(id));
779 }
780
781 const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }
782
783 void startProfiling() override
784 {
785 global_cupti_info.start();
786 }
787
788 void stopProfiling() override
789 {
790 global_cupti_info.stop();
791 }
792
793 bool isProfilingActive() override
794 {
795 return global_cupti_info.isActive();
796 }
797
798 void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
799 {
800 cudaPointerAttributes ca;
801 ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
802 // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
803 // as the corresponding CUDA type, so a simple cast can be done.
804 auto mem_type = static_cast<ePointerMemoryType>(ca.type);
805 _fillPointerAttribute(attribute, mem_type, ca.device,
806 ptr, ca.devicePointer, ca.hostPointer);
807 }
808
809 DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
810 {
811 int d = 0;
812 int wanted_d = device_id.asInt32();
813 ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
814 if (d != wanted_d)
815 ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
816 size_t free_mem = 0;
817 size_t total_mem = 0;
818 ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
819 if (d != wanted_d)
820 ARCCORE_CHECK_CUDA(cudaSetDevice(d));
822 dmi.setFreeMemory(free_mem);
823 dmi.setTotalMemory(total_mem);
824 return dmi;
825 }
826
827 void pushProfilerRange(const String& name, Int32 color_rgb) override
828 {
829#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
830 if (color_rgb >= 0) {
831 // NOTE: It would be necessary to do: nvtxEventAttributes_t eventAttrib = { 0 };
832 // but this causes many 'missing initializer for member' warnings
833 nvtxEventAttributes_t eventAttrib;
834 std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
835 eventAttrib.version = NVTX_VERSION;
836 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
837 eventAttrib.colorType = NVTX_COLOR_ARGB;
838 eventAttrib.color = color_rgb;
839 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
840 eventAttrib.message.ascii = name.localstr();
841 nvtxRangePushEx(&eventAttrib);
842 }
843 else
844 nvtxRangePush(name.localstr());
845#endif
846 }
847 void popProfilerRange() override
848 {
849#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
850 nvtxRangePop();
851#endif
852 }
853
854 void finalize(ITraceMng* tm) override
855 {
856 finalizeCudaMemoryAllocators(tm);
857 }
858
859 KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
860 const void* kernel_ptr,
861 Int64 total_loop_size) override
862 {
863 Int32 shared_memory = orig_args.sharedMemorySize();
864 if (orig_args.isCooperative()) {
865 // In cooperative mode, ensure that we do not launch more blocks
866 // than the maximum that can reside on the GPU.
867 Int32 nb_thread = orig_args.nbThreadPerBlock();
868 Int32 nb_block = orig_args.nbBlockPerGrid();
869 int nb_block_per_sm = 0;
870 ARCCORE_CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
871
872 int max_block = static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
873 max_block = std::max(max_block, 1);
874 if (nb_block > max_block) {
875 KernelLaunchArgs modified_args(orig_args);
876 modified_args.setNbBlockPerGrid(max_block);
877 return modified_args;
878 }
879 return orig_args;
880 }
881
882 if (!m_use_computed_occupancy)
883 return orig_args;
884 if (shared_memory < 0)
885 shared_memory = 0;
886 // For now, we do not perform calculation if shared memory is non-zero.
887 if (shared_memory != 0)
888 return orig_args;
889 Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
890 if (computed_block_size == 0)
891 return orig_args;
892
893 // Here, we use the number of threads per block to achieve a
894 // maximum occupancy.
895 KernelLaunchArgs modified_args(orig_args);
896 Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
897 int blocks_per_grid = CheckedConvert::toInt32(big_b);
898 modified_args.setNbBlockPerGrid(blocks_per_grid);
899 modified_args.setNbThreadPerBlock(computed_block_size);
900 return modified_args;
901 }
902
903 public:
904
905 void fillDevices(bool is_verbose);
906 void build()
907 {
908 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
909 m_use_computed_occupancy = v.value();
910 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_COOPERATIVE_RATIO", true)) {
911 Int32 x = v.value();
912 x = std::clamp(x, 10, 100);
913 m_cooperative_ratio = x / 100.0;
914 }
915 }
916
917 private:
918
919 Int64 m_nb_kernel_launched = 0;
920 bool m_is_verbose = false;
921 bool m_use_computed_occupancy = false;
922 Int32 m_multi_processor_count = 0;
923 double m_cooperative_ratio = 1.0;
924 Impl::DeviceInfoList m_device_info_list;
925 OccupancyMap m_occupancy_map;
926};
927
928/*---------------------------------------------------------------------------*/
929/*---------------------------------------------------------------------------*/
930
931void CudaRunnerRuntime::
932fillDevices(bool is_verbose)
933{
934 int nb_device = 0;
935 ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
936 std::ostream& omain = std::cout;
937 if (is_verbose)
938 omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
939 for (int i = 0; i < nb_device; ++i) {
940 cudaDeviceProp dp;
941 cudaGetDeviceProperties(&dp, i);
942 int runtime_version = 0;
943 cudaRuntimeGetVersion(&runtime_version);
944 int driver_version = 0;
945 cudaDriverGetVersion(&driver_version);
946 std::ostringstream ostr;
947 std::ostream& o = ostr;
948 o << "Device " << i << " name=" << dp.name << "\n";
949 o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
950 o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
951 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
952 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
953 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
954 o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
955 o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
956 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
957 o << " warpSize = " << dp.warpSize << "\n";
958 o << " memPitch = " << dp.memPitch << "\n";
959 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
960 o << " maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor << "\n";
961 o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
962 o << " totalConstMem = " << dp.totalConstMem << "\n";
963 o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
964 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
965 o << " integrated = " << dp.integrated << "\n";
966 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
967 o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
968 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
969 o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
970 o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
971 o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
972 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
973 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
974 << " " << dp.maxThreadsDim[2] << "\n";
975 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
976 << " " << dp.maxGridSize[2] << "\n";
977 o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
978 o << " memoryBusWitdh = " << dp.memoryBusWidth << " bits\n";
979
980 int clock_rate = 0;
981 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, i));
982 o << " clockRate = " << (clock_rate / 1000) << " MHz\n";
983
984 int memory_clock_rate = 0;
985 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&memory_clock_rate, cudaDevAttrMemoryClockRate, i));
986 o << " memoryClockRate = " << (memory_clock_rate / 1000) << " MHz\n";
987
988 Real memory_bandwith = ((dp.memoryBusWidth * memory_clock_rate * 2.0) / 8.0) / 1.0e6;
989 o << " MemoryBandwith = " << memory_bandwith << " GB/s\n";
990
991#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
992 o << " deviceOverlap = " << dp.deviceOverlap << "\n";
993 o << " computeMode = " << dp.computeMode << "\n";
994 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
995#endif
996
997 // TODO: We assume that all GPUs are the same and therefore
998 // that the number of SM per GPU is the same. This is used to
999 // calculate the number of blocks in cooperative mode.
1000 m_multi_processor_count = dp.multiProcessorCount;
1001
1002 {
1003 int least_val = 0;
1004 int greatest_val = 0;
1005 ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
1006 o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
1007 }
1008 std::ostringstream device_uuid_ostr;
1009 {
1010 CUdevice device;
1011 ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
1012 CUuuid device_uuid;
1013 ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
1014 o << " deviceUuid=";
1015 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
1016 o << device_uuid_ostr.str();
1017 o << "\n";
1018 }
1019 String description(ostr.str());
1020 if (is_verbose)
1021 omain << description;
1022
1023 DeviceInfo device_info;
1024 device_info.setDescription(description);
1025 device_info.setDeviceId(DeviceId(i));
1026 device_info.setName(dp.name);
1027 device_info.setWarpSize(dp.warpSize);
1028 device_info.setUUIDAsString(device_uuid_ostr.str());
1029 device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
1030 device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
1031 device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
1032 device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
1033 device_info.setPCIDomainID(dp.pciDomainID);
1034 device_info.setPCIBusID(dp.pciBusID);
1035 device_info.setPCIDeviceID(dp.pciDeviceID);
1036 m_device_info_list.addDevice(device_info);
1037 }
1038
1039 Int32 global_cupti_level = 0;
1040
1041 // Check if Cupti is active
1042 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
1043 global_cupti_level = v.value();
1044 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
1045 global_cupti_flush = v.value();
1046 bool do_print_cupti = true;
1047 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
1048 do_print_cupti = (v.value() != 0);
1049
1050 if (global_cupti_level > 0) {
1051#ifndef ARCCORE_HAS_CUDA_CUPTI
1052 ARCCORE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
1053#endif
1054 global_cupti_info.init(global_cupti_level, do_print_cupti);
1055 global_cupti_info.start();
1056 }
1057}
1058
1059/*---------------------------------------------------------------------------*/
1060/*---------------------------------------------------------------------------*/
1061
1063: public IMemoryCopier
1064{
1065 void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
1066 MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
1067 const RunQueue* queue) override
1068 {
1069 if (queue) {
1070 queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
1071 return;
1072 }
1073 // 'cudaMemcpyDefault' automatically knows what to do by only considering
1074 // the pointer values. We should see if using \a from_mem and \a to_mem
1075 // can improve performance.
1076 ARCCORE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
1077 }
1078};
1079
1080/*---------------------------------------------------------------------------*/
1081/*---------------------------------------------------------------------------*/
1082
1083} // End namespace Arcane::Accelerator::Cuda
1084
1085using namespace Arcane;
1086
1087namespace
1088{
1089Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
1090Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
1091
1092void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
1093{
1095 eMemoryResource mem = allocator->memoryResource();
1096 mrm->setAllocator(mem, allocator);
1097 mrm->setMemoryPool(mem, allocator->memoryPool());
1098}
1099
1100} // namespace
1101
1102/*---------------------------------------------------------------------------*/
1103/*---------------------------------------------------------------------------*/
1104
1105// This function is the entry point used when dynamically loading
1106// this library
1107extern "C" ARCCORE_EXPORT void
1108arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
1109{
1110 using namespace Arcane::Accelerator::Cuda;
1111 global_cuda_runtime.build();
1112 Accelerator::Impl::setUsingCUDARuntime(true);
1113 Accelerator::Impl::setCUDARunQueueRuntime(&global_cuda_runtime);
1114 initializeCudaMemoryAllocators();
1116 MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_cuda_memory_allocator);
1117 IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
1118 mrm->setIsAccelerator(true);
1119 _setAllocator(&unified_memory_cuda_memory_allocator);
1120 _setAllocator(&host_pinned_cuda_memory_allocator);
1121 _setAllocator(&device_cuda_memory_allocator);
1122 mrm->setCopier(&global_cuda_memory_copier);
1123 global_cuda_runtime.fillDevices(init_info.isVerbose());
1124}
1125
1126/*---------------------------------------------------------------------------*/
1127/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
Base class of a specific allocator for accelerator.
eMemoryResource memoryResource() const final
Memory resource provided by the allocator.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialization for Device memory.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialization for HostPinned memory.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialization for UVM memory.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copies the data from from to to with the queue queue.
void barrier() override
Blocks until all actions associated with this queue are finished.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification before command launch.
bool _barrierNoException() override
Barrier without exception. Returns true in case of error.
Impl::NativeStream nativeStream() override
Pointer to the internal structure dependent on the implementation.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Performs a prefetch of a memory region.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification of command launch completion.
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory regions.
Singleton class to manage CUPTI.
Definition Cupti.h:39
Map containing the ideal occupancy for a given kernel.
void * allocateMemory(Int64 size) final
Allocates a block for size bytes.
void freeMemory(void *ptr, Int64 size) final
Frees the block located at address address containing size bytes.
void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
Notifies of a change in instance-specific arguments.
bool isHost() const
Indicates if the instance is associated with the host.
bool isAccelerator() const
Indicates if the instance is associated with an accelerator.
Interface for event implementation.
Interface of an execution stream for a RunQueue.
Interface of the runtime associated with an accelerator.
bool isCooperative() const
Indicates if running in cooperative mode (i.e. cudaLaunchCooperativeKernel).
bool isDefault() const
Indicates if the instance only has default values.
bool isAsync() const
Indicates if the execution queue is asynchronous.
Definition RunQueue.cc:320
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory regions.
Definition RunQueue.cc:237
Information about an allocated memory region.
Constant view on a contiguous memory region containing fixed-size elements.
constexpr SpanType bytes() const
View in byte form.
constexpr const std::byte * data() const
Pointer to the memory region.
Template class for converting a type.
Interface for memory copies with accelerator support.
Internal part of Arcane's 'IMemoryResourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for resource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Sets the memory pool for resource r.
virtual void setIsAccelerator(bool v)=0
Indicates if an accelerator is available.
virtual void setCopier(IMemoryCopier *copier)=0
Sets the copying instance.
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.
Class containing information to specialize allocations.
Mutable view on a contiguous memory region containing fixed-size elements.
constexpr std::byte * data() const
Pointer to the memory region.
constexpr SpanType bytes() const
View in byte form.
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
Definition Span.h:539
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:327
const char * localstr() const
Returns the conversion of the instance into UTF-8 encoding.
Definition String.cc:229
@ AccessedByHost
Indicates that the memory region is accessed by the host.
@ PreferredLocationDevice
Prefers memory placement on the accelerator.
@ MostlyRead
Indicates that the memory region is primarily read-only.
@ AccessedByDevice
Indicates that the memory region is accessed by the device.
ePointerMemoryType
Memory type for a pointer.
eExecutionPolicy
Execution policy for a Runner.
@ CUDA
Execution policy using the CUDA environment.
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the specific allocator for accelerators.
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
eMemoryLocationHint
Indices for expected memory location.
@ MainlyHost
Indicates that the data will primarily be used on the CPU.
@ HostAndDeviceMostlyRead
Indicates that the data will be used both on the accelerator and on the CPU and will not be frequentl...
@ MainlyDevice
Indicates that the data will primarily be used on the accelerator.
double Real
Type representing a real number.
eMemoryResource
List of available memory resources.
@ HostPinned
Allocates on the host.
@ UnifiedMemory
Allocates using unified memory.
@ Device
Allocates on the device.
std::int32_t Int32
Signed integer type of 32 bits.