Arcane  v4.1.7.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
CudaAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaAcceleratorRuntime.cc (C) 2000-2026 */
9/* */
10/* Runtime pour 'Cuda'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/accelerator_native/CudaAccelerator.h"
15
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
18
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
21
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
35
36#include "arccore/accelerator_native/runtime/Cupti.h"
37
38#include <sstream>
39#include <unordered_map>
40#include <mutex>
41#include <algorithm>
42
43#include <cuda.h>
44
45// Pour std::memset
46#include <cstring>
47
48#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
49#include <nvtx3/nvToolsExt.h>
50#endif
51
52namespace Arcane::Accelerator::Cuda
53{
54using Impl::KernelLaunchArgs;
55
56namespace
57{
58 Int32 global_cupti_flush = 0;
59 CuptiInfo global_cupti_info;
60} // namespace
61
62/*---------------------------------------------------------------------------*/
63/*---------------------------------------------------------------------------*/
64
65// A partir de CUDA 13, il y a un nouveau type cudaMemLocation
66// pour les méthodes telles cudeMemAdvise ou cudaMemPrefetch
#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
//! Builds the cudaMemLocation corresponding to \a device_id.
// A host location is returned for cudaCpuDeviceId, a device location otherwise.
// (The original version assigned type/id twice; the redundant pre-assignment
// before the if/else has been removed — the resulting values are identical.)
inline cudaMemLocation
_getMemoryLocation(int device_id)
{
  cudaMemLocation mem_location;
  if (device_id == cudaCpuDeviceId) {
    mem_location.type = cudaMemLocationTypeHost;
    mem_location.id = device_id;
  }
  else {
    mem_location.type = cudaMemLocationTypeDevice;
    mem_location.id = device_id;
  }
  return mem_location;
}
#else
//! Before CUDA 13, cudaMemAdvise()/cudaMemPrefetchAsync() take the device id directly.
inline int
_getMemoryLocation(int device_id)
{
  return device_id;
}
#endif
89
90/*---------------------------------------------------------------------------*/
91/*---------------------------------------------------------------------------*/
92
94{
95 public:
96
97 virtual ~ConcreteAllocator() = default;
98
99 public:
100
101 virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
102 virtual cudaError_t _deallocate(void* ptr) = 0;
103};
104
105/*---------------------------------------------------------------------------*/
106/*---------------------------------------------------------------------------*/
107
108template <typename ConcreteAllocatorType>
109class UnderlyingAllocator
111{
112 public:
113
114 UnderlyingAllocator() = default;
115
116 public:
117
118 void* allocateMemory(size_t size) final
119 {
120 void* out = nullptr;
121 ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
122 return out;
123 }
124 void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
125 {
126 ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
127 }
128
129 void doMemoryCopy(void* destination, const void* source, Int64 size) final
130 {
131 ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
132 }
133
134 eMemoryResource memoryResource() const final
135 {
136 return m_concrete_allocator.memoryResource();
137 }
138
139 public:
140
141 ConcreteAllocatorType m_concrete_allocator;
142};
143
144/*---------------------------------------------------------------------------*/
145/*---------------------------------------------------------------------------*/
146
147class UnifiedMemoryConcreteAllocator
148: public ConcreteAllocator
149{
150 public:
151
152 UnifiedMemoryConcreteAllocator()
153 {
154 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
155 m_use_ats = v.value();
156 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MEMORY_HINT_ON_DEVICE", true))
157 m_use_hint_as_mainly_device = (v.value() != 0);
158 }
159
160 cudaError_t _deallocate(void* ptr) final
161 {
162 if (m_use_ats) {
163 ::free(ptr);
164 return cudaSuccess;
165 }
166 //std::cout << "CUDA_MANAGED_FREE ptr=" << ptr << "\n";
167 return ::cudaFree(ptr);
168 }
169
170 cudaError_t _allocate(void** ptr, size_t new_size) final
171 {
172 if (m_use_ats) {
173 *ptr = ::aligned_alloc(128, new_size);
174 }
175 else {
176 auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
177 //std::cout << "CUDA_MANAGED_MALLOC ptr=" << (*ptr) << " size=" << new_size << "\n";
178 //if (new_size < 4000)
179 //std::cout << "STACK=" << platform::getStackTrace() << "\n";
180
181 if (r != cudaSuccess)
182 return r;
183
184 // Si demandé, indique qu'on préfère allouer sur le GPU.
185 // NOTE: Dans ce cas, on récupère le device actuel pour positionner la localisation
186 // préférée. Dans le cas où on utilise MemoryPool, cette allocation ne sera effectuée
187 // qu'une seule fois. Si le device par défaut pour un thread change au cours du calcul
188 // il y aura une incohérence. Pour éviter cela, on pourrait faire un cudaMemAdvise()
189 // pour chaque allocation (via _applyHint()) mais ces opérations sont assez couteuses
190 // et s'il y a beaucoup d'allocation il peut en résulter une perte de performance.
192 int device_id = 0;
193 void* p = *ptr;
194 cudaGetDevice(&device_id);
195 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
196 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
197 }
198 }
199
200 return cudaSuccess;
201 }
202
203 constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }
204
205 public:
206
207 bool m_use_ats = false;
210};
211
212/*---------------------------------------------------------------------------*/
213/*---------------------------------------------------------------------------*/
221class UnifiedMemoryCudaMemoryAllocator
222: public AcceleratorMemoryAllocatorBase
223{
224 public:
225 public:
226
227 UnifiedMemoryCudaMemoryAllocator()
228 : AcceleratorMemoryAllocatorBase("UnifiedMemoryCudaMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
229 {
230 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MALLOC_TRACE", true))
231 _setTraceLevel(v.value());
232 }
233
234 void initialize()
235 {
236 _doInitializeUVM(true);
237 }
238
239 public:
240
241 void notifyMemoryArgsChanged([[maybe_unused]] MemoryAllocationArgs old_args,
242 MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
243 {
244 void* p = ptr.baseAddress();
245 Int64 s = ptr.capacity();
246 if (p && s > 0)
247 _applyHint(ptr.baseAddress(), ptr.size(), new_args);
248 }
249
250 protected:
251
252 void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)
253 {
254 eMemoryLocationHint hint = args.memoryLocationHint();
255 // Utilise le device actif pour positionner le GPU par défaut
256 // On ne le fait que si le \a hint le nécessite pour éviter d'appeler
257 // cudaGetDevice() à chaque fois.
258 int device_id = 0;
260 cudaGetDevice(&device_id);
261 }
262 auto device_memory_location = _getMemoryLocation(device_id);
263 auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);
264
265 //std::cout << "SET_MEMORY_HINT name=" << args.arrayName() << " size=" << new_size << " hint=" << (int)hint << "\n";
267 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
268 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
269 }
271 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
272 //ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, 0));
273 }
275 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
276 }
277 }
278 void _removeHint(void* p, size_t size, MemoryAllocationArgs args)
279 {
280 eMemoryLocationHint hint = args.memoryLocationHint();
281 if (hint == eMemoryLocationHint::None)
282 return;
283 int device_id = 0;
284 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
285 }
286
287 private:
288
289 bool m_use_ats = false;
290};
291
292/*---------------------------------------------------------------------------*/
293/*---------------------------------------------------------------------------*/
294
296: public ConcreteAllocator
297{
298 public:
299
300 cudaError_t _allocate(void** ptr, size_t new_size) final
301 {
302 return ::cudaMallocHost(ptr, new_size);
303 }
304 cudaError_t _deallocate(void* ptr) final
305 {
306 return ::cudaFreeHost(ptr);
307 }
308 constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
309};
310
311/*---------------------------------------------------------------------------*/
312/*---------------------------------------------------------------------------*/
313
314class HostPinnedCudaMemoryAllocator
315: public AcceleratorMemoryAllocatorBase
316{
317 public:
318 public:
319
320 HostPinnedCudaMemoryAllocator()
321 : AcceleratorMemoryAllocatorBase("HostPinnedCudaMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
322 {
323 }
324
325 public:
326
327 void initialize()
328 {
330 }
331};
332
333/*---------------------------------------------------------------------------*/
334/*---------------------------------------------------------------------------*/
335
336class DeviceConcreteAllocator
337: public ConcreteAllocator
338{
339 public:
340
341 DeviceConcreteAllocator()
342 {
343 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
344 m_use_ats = v.value();
345 }
346
347 cudaError_t _allocate(void** ptr, size_t new_size) final
348 {
349 if (m_use_ats) {
350 // FIXME: it does not work on WIN32
351 *ptr = std::aligned_alloc(128, new_size);
352 if (*ptr)
353 return cudaSuccess;
354 return cudaErrorMemoryAllocation;
355 }
356 cudaError_t r = ::cudaMalloc(ptr, new_size);
357 //std::cout << "ALLOCATE_DEVICE ptr=" << (*ptr) << " size=" << new_size << " r=" << (int)r << "\n";
358 return r;
359 }
360 cudaError_t _deallocate(void* ptr) final
361 {
362 if (m_use_ats) {
363 std::free(ptr);
364 return cudaSuccess;
365 }
366 //std::cout << "FREE_DEVICE ptr=" << ptr << "\n";
367 return ::cudaFree(ptr);
368 }
369
370 constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }
371
372 private:
373
374 bool m_use_ats = false;
375};
376
377/*---------------------------------------------------------------------------*/
378/*---------------------------------------------------------------------------*/
379
380class DeviceCudaMemoryAllocator
381: public AcceleratorMemoryAllocatorBase
382{
383
384 public:
385
386 DeviceCudaMemoryAllocator()
387 : AcceleratorMemoryAllocatorBase("DeviceCudaMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
388 {
389 }
390
391 public:
392
393 void initialize()
394 {
396 }
397};
398
399/*---------------------------------------------------------------------------*/
400/*---------------------------------------------------------------------------*/
401
402namespace
403{
404 UnifiedMemoryCudaMemoryAllocator unified_memory_cuda_memory_allocator;
405 HostPinnedCudaMemoryAllocator host_pinned_cuda_memory_allocator;
406 DeviceCudaMemoryAllocator device_cuda_memory_allocator;
407} // namespace
408
409/*---------------------------------------------------------------------------*/
410/*---------------------------------------------------------------------------*/
411
412void initializeCudaMemoryAllocators()
413{
414 unified_memory_cuda_memory_allocator.initialize();
415 device_cuda_memory_allocator.initialize();
416 host_pinned_cuda_memory_allocator.initialize();
417}
418
419void finalizeCudaMemoryAllocators(ITraceMng* tm)
420{
421 unified_memory_cuda_memory_allocator.finalize(tm);
422 device_cuda_memory_allocator.finalize(tm);
423 host_pinned_cuda_memory_allocator.finalize(tm);
424}
425
426/*---------------------------------------------------------------------------*/
427/*---------------------------------------------------------------------------*/
428
429void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
430{
431 if (e == CUDA_SUCCESS)
432 return;
433 const char* error_name = nullptr;
434 CUresult e2 = cuGetErrorName(e, &error_name);
435 if (e2 != CUDA_SUCCESS)
436 error_name = "Unknown";
437
438 const char* error_message = nullptr;
439 CUresult e3 = cuGetErrorString(e, &error_message);
440 if (e3 != CUDA_SUCCESS)
441 error_message = "Unknown";
442
443 ARCCORE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
444 ti, e, error_name, error_message);
445}
446
447/*---------------------------------------------------------------------------*/
448/*---------------------------------------------------------------------------*/
458{
459 public:
460
461 Int32 getNbThreadPerBlock(const void* kernel_ptr)
462 {
463 std::scoped_lock lock(m_mutex);
464 auto x = m_nb_thread_per_block_map.find(kernel_ptr);
465 if (x != m_nb_thread_per_block_map.end())
466 return x->second;
467 int min_grid_size = 0;
468 int computed_block_size = 0;
469 int wanted_shared_memory = 0;
470 cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
471 if (r != cudaSuccess)
472 computed_block_size = 0;
473 int num_block_0 = 0;
474 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
475 int num_block_1 = 0;
476 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
477
478 cudaFuncAttributes func_attr;
479 cudaFuncGetAttributes(&func_attr, kernel_ptr);
480 m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
481 std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
482 << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs;
483
484#if CUDART_VERSION >= 12040
485 // cudaFuncGetName is only available in 12.4
486 const char* func_name = nullptr;
487 cudaFuncGetName(&func_name, kernel_ptr);
488 std::cout << " name=" << func_name << "\n";
489#endif
490
491 return computed_block_size;
492 }
493
494 private:
495
496 std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
497 std::mutex m_mutex;
498};
499
500/*---------------------------------------------------------------------------*/
501/*---------------------------------------------------------------------------*/
502
503class CudaRunQueueStream
505{
506 public:
507
508 CudaRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
509 : m_runtime(runtime)
510 {
511 if (bi.isDefault())
512 ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
513 else {
514 int priority = bi.priority();
515 ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
516 }
517 }
518 ~CudaRunQueueStream() override
519 {
520 ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
521 }
522
523 public:
524
525 void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override
526 {
527#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
528 auto kname = c.kernelName();
529 if (kname.empty())
530 nvtxRangePush(c.traceInfo().name());
531 else
532 nvtxRangePush(kname.localstr());
533#endif
534 return m_runtime->notifyBeginLaunchKernel();
535 }
537 {
538#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
539 nvtxRangePop();
540#endif
541 return m_runtime->notifyEndLaunchKernel();
542 }
543 void barrier() override
544 {
545 ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
546 if (global_cupti_flush > 0)
547 global_cupti_info.flush();
548 }
549 bool _barrierNoException() override
550 {
551 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
552 }
553 void copyMemory(const MemoryCopyArgs& args) override
554 {
555 auto source_bytes = args.source().bytes();
556 auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
557 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
558 ARCCORE_CHECK_CUDA(r);
559 if (!args.isAsync())
560 barrier();
561 }
562 void prefetchMemory(const MemoryPrefetchArgs& args) override
563 {
564 auto src = args.source().bytes();
565 if (src.size() == 0)
566 return;
567 DeviceId d = args.deviceId();
568 int device = cudaCpuDeviceId;
569 if (!d.isHost())
570 device = d.asInt32();
571 //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
572 // << " size=" << args.source().size() << " data=" << src.data() << "\n";
573 auto mem_location = _getMemoryLocation(device);
574#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
575 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
576#else
577 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
578#endif
579 ARCCORE_CHECK_CUDA(r);
580 if (!args.isAsync())
581 barrier();
582 }
584 {
585 return Impl::NativeStream(&m_cuda_stream);
586 }
587
588 public:
589
590 cudaStream_t trueStream() const
591 {
592 return m_cuda_stream;
593 }
594
595 private:
596
597 Impl::IRunnerRuntime* m_runtime = nullptr;
598 cudaStream_t m_cuda_stream = nullptr;
599};
600
601/*---------------------------------------------------------------------------*/
602/*---------------------------------------------------------------------------*/
603
604class CudaRunQueueEvent
606{
607 public:
608
609 explicit CudaRunQueueEvent(bool has_timer)
610 {
611 if (has_timer)
612 ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
613 else
614 ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
615 }
616 ~CudaRunQueueEvent() override
617 {
618 ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
619 }
620
621 public:
622
623 // Enregistre l'événement au sein d'une RunQueue
624 void recordQueue(Impl::IRunQueueStream* stream) final
625 {
626 auto* rq = static_cast<CudaRunQueueStream*>(stream);
627 ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
628 }
629
630 void wait() final
631 {
632 ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
633 }
634
635 void waitForEvent(Impl::IRunQueueStream* stream) final
636 {
637 auto* rq = static_cast<CudaRunQueueStream*>(stream);
638 ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
639 }
640
641 Int64 elapsedTime(IRunQueueEventImpl* start_event) final
642 {
643 // NOTE: Les évènements doivent avoir été créé avec le timer actif
644 ARCCORE_CHECK_POINTER(start_event);
645 auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
646 float time_in_ms = 0.0;
647
648 // TODO: regarder si nécessaire
649 // ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
650
651 ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
652 double x = time_in_ms * 1.0e6;
653 Int64 nano_time = static_cast<Int64>(x);
654 return nano_time;
655 }
656
657 bool hasPendingWork() final
658 {
659 cudaError_t v = cudaEventQuery(m_cuda_event);
660 if (v == cudaErrorNotReady)
661 return true;
662 ARCCORE_CHECK_CUDA(v);
663 return false;
664 }
665
666 private:
667
668 cudaEvent_t m_cuda_event;
669};
670
671/*---------------------------------------------------------------------------*/
672/*---------------------------------------------------------------------------*/
673
676{
677 public:
678
679 ~CudaRunnerRuntime() override = default;
680
681 public:
682
683 void notifyBeginLaunchKernel() override
684 {
685 ++m_nb_kernel_launched;
686 if (m_is_verbose)
687 std::cout << "BEGIN CUDA KERNEL!\n";
688 }
689 void notifyEndLaunchKernel() override
690 {
691 ARCCORE_CHECK_CUDA(cudaGetLastError());
692 if (m_is_verbose)
693 std::cout << "END CUDA KERNEL!\n";
694 }
695 void barrier() override
696 {
697 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
698 }
699 eExecutionPolicy executionPolicy() const override
700 {
702 }
703 Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
704 {
705 return new CudaRunQueueStream(this, bi);
706 }
707 Impl::IRunQueueEventImpl* createEventImpl() override
708 {
709 return new CudaRunQueueEvent(false);
710 }
711 Impl::IRunQueueEventImpl* createEventImplWithTimer() override
712 {
713 return new CudaRunQueueEvent(true);
714 }
715 void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
716 {
717 auto v = buffer.bytes();
718 const void* ptr = v.data();
719 size_t count = v.size();
720 int device = device_id.asInt32();
721 cudaMemoryAdvise cuda_advise;
722
723 if (advice == eMemoryAdvice::MostlyRead)
724 cuda_advise = cudaMemAdviseSetReadMostly;
726 cuda_advise = cudaMemAdviseSetPreferredLocation;
727 else if (advice == eMemoryAdvice::AccessedByDevice)
728 cuda_advise = cudaMemAdviseSetAccessedBy;
729 else if (advice == eMemoryAdvice::PreferredLocationHost) {
730 cuda_advise = cudaMemAdviseSetPreferredLocation;
731 device = cudaCpuDeviceId;
732 }
733 else if (advice == eMemoryAdvice::AccessedByHost) {
734 cuda_advise = cudaMemAdviseSetAccessedBy;
735 device = cudaCpuDeviceId;
736 }
737 else
738 return;
739 //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
740 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
741 }
742 void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
743 {
744 auto v = buffer.bytes();
745 const void* ptr = v.data();
746 size_t count = v.size();
747 int device = device_id.asInt32();
748 cudaMemoryAdvise cuda_advise;
749
750 if (advice == eMemoryAdvice::MostlyRead)
751 cuda_advise = cudaMemAdviseUnsetReadMostly;
753 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
754 else if (advice == eMemoryAdvice::AccessedByDevice)
755 cuda_advise = cudaMemAdviseUnsetAccessedBy;
756 else if (advice == eMemoryAdvice::PreferredLocationHost) {
757 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
758 device = cudaCpuDeviceId;
759 }
760 else if (advice == eMemoryAdvice::AccessedByHost) {
761 cuda_advise = cudaMemAdviseUnsetAccessedBy;
762 device = cudaCpuDeviceId;
763 }
764 else
765 return;
766 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
767 }
768
769 void setCurrentDevice(DeviceId device_id) final
770 {
771 Int32 id = device_id.asInt32();
772 if (!device_id.isAccelerator())
773 ARCCORE_FATAL("Device {0} is not an accelerator device", id);
774 ARCCORE_CHECK_CUDA(cudaSetDevice(id));
775 }
776
777 const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }
778
779 void startProfiling() override
780 {
781 global_cupti_info.start();
782 }
783
784 void stopProfiling() override
785 {
786 global_cupti_info.stop();
787 }
788
789 bool isProfilingActive() override
790 {
791 return global_cupti_info.isActive();
792 }
793
794 void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
795 {
796 cudaPointerAttributes ca;
797 ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
798 // NOTE: le type Arcane 'ePointerMemoryType' a normalememt les mêmes valeurs
799 // que le type CUDA correspondant donc on peut faire un cast simple.
800 auto mem_type = static_cast<ePointerMemoryType>(ca.type);
801 _fillPointerAttribute(attribute, mem_type, ca.device,
802 ptr, ca.devicePointer, ca.hostPointer);
803 }
804
805 DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
806 {
807 int d = 0;
808 int wanted_d = device_id.asInt32();
809 ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
810 if (d != wanted_d)
811 ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
812 size_t free_mem = 0;
813 size_t total_mem = 0;
814 ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
815 if (d != wanted_d)
816 ARCCORE_CHECK_CUDA(cudaSetDevice(d));
818 dmi.setFreeMemory(free_mem);
819 dmi.setTotalMemory(total_mem);
820 return dmi;
821 }
822
823 void pushProfilerRange(const String& name, Int32 color_rgb) override
824 {
825#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
826 if (color_rgb >= 0) {
827 // NOTE: Il faudrait faire: nvtxEventAttributes_t eventAttrib = { 0 };
828 // mais cela provoque pleins d'avertissement de type 'missing initializer for member'
829 nvtxEventAttributes_t eventAttrib;
830 std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
831 eventAttrib.version = NVTX_VERSION;
832 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
833 eventAttrib.colorType = NVTX_COLOR_ARGB;
834 eventAttrib.color = color_rgb;
835 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
836 eventAttrib.message.ascii = name.localstr();
837 nvtxRangePushEx(&eventAttrib);
838 }
839 else
840 nvtxRangePush(name.localstr());
841#endif
842 }
843 void popProfilerRange() override
844 {
845#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
846 nvtxRangePop();
847#endif
848 }
849
850 void finalize(ITraceMng* tm) override
851 {
852 finalizeCudaMemoryAllocators(tm);
853 }
854
855 KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
856 const void* kernel_ptr,
857 Int64 total_loop_size) override
858 {
859 Int32 shared_memory = orig_args.sharedMemorySize();
860 if (orig_args.isCooperative()) {
861 // En mode coopératif, s'assure qu'on ne lance pas plus de blocs
862 // que le maximum qui peut résider sur le GPU.
863 Int32 nb_thread = orig_args.nbThreadPerBlock();
864 Int32 nb_block = orig_args.nbBlockPerGrid();
865 int nb_block_per_sm = 0;
866 ARCCORE_CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
867
868 int max_block = static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
869 max_block = std::max(max_block, 1);
870 if (nb_block > max_block) {
871 KernelLaunchArgs modified_args(orig_args);
872 modified_args.setNbBlockPerGrid(max_block);
873 return modified_args;
874 }
875 return orig_args;
876 }
877
878 if (!m_use_computed_occupancy)
879 return orig_args;
880 if (shared_memory < 0)
881 shared_memory = 0;
882 // Pour l'instant, on ne fait pas de calcul si la mémoire partagée est non nulle.
883 if (shared_memory != 0)
884 return orig_args;
885 Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
886 if (computed_block_size == 0)
887 return orig_args;
888
889 // Ici, on utilise le nombre de threads par bloc pour avoir une
890 // occupation maximale.
891 KernelLaunchArgs modified_args(orig_args);
892 Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
893 int blocks_per_grid = CheckedConvert::toInt32(big_b);
894 modified_args.setNbBlockPerGrid(blocks_per_grid);
895 modified_args.setNbThreadPerBlock(computed_block_size);
896 return modified_args;
897 }
898
899 public:
900
901 void fillDevices(bool is_verbose);
902 void build()
903 {
904 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
905 m_use_computed_occupancy = v.value();
906 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_COOPERATIVE_RATIO", true)) {
907 Int32 x = v.value();
908 x = std::clamp(x, 10, 100);
909 m_cooperative_ratio = x / 100.0;
910 }
911 }
912
913 private:
914
915 Int64 m_nb_kernel_launched = 0;
916 bool m_is_verbose = false;
917 bool m_use_computed_occupancy = false;
918 Int32 m_multi_processor_count = 0;
919 double m_cooperative_ratio = 1.0;
920 Impl::DeviceInfoList m_device_info_list;
921 OccupancyMap m_occupancy_map;
922};
923
924/*---------------------------------------------------------------------------*/
925/*---------------------------------------------------------------------------*/
926
927void CudaRunnerRuntime::
928fillDevices(bool is_verbose)
929{
930 int nb_device = 0;
931 ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
932 std::ostream& omain = std::cout;
933 if (is_verbose)
934 omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
935 for (int i = 0; i < nb_device; ++i) {
936 cudaDeviceProp dp;
937 cudaGetDeviceProperties(&dp, i);
938 int runtime_version = 0;
939 cudaRuntimeGetVersion(&runtime_version);
940 int driver_version = 0;
941 cudaDriverGetVersion(&driver_version);
942 std::ostringstream ostr;
943 std::ostream& o = ostr;
944 o << "Device " << i << " name=" << dp.name << "\n";
945 o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
946 o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
947 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
948 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
949 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
950 o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
951 o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
952 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
953 o << " warpSize = " << dp.warpSize << "\n";
954 o << " memPitch = " << dp.memPitch << "\n";
955 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
956 o << " maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor << "\n";
957 o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
958 o << " totalConstMem = " << dp.totalConstMem << "\n";
959 o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
960 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
961 o << " integrated = " << dp.integrated << "\n";
962 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
963 o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
964 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
965 o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
966 o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
967 o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
968 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
969 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
970 << " " << dp.maxThreadsDim[2] << "\n";
971 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
972 << " " << dp.maxGridSize[2] << "\n";
973 o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
974 o << " memoryBusWitdh = " << dp.memoryBusWidth << " bits\n";
975
976 int clock_rate = 0;
977 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, i));
978 o << " clockRate = " << (clock_rate / 1000) << " MHz\n";
979
980 int memory_clock_rate = 0;
981 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&memory_clock_rate, cudaDevAttrMemoryClockRate, i));
982 o << " memoryClockRate = " << (memory_clock_rate / 1000) << " MHz\n";
983
984 Real memory_bandwith = ((dp.memoryBusWidth * memory_clock_rate * 2.0) / 8.0) / 1.0e6;
985 o << " MemoryBandwith = " << memory_bandwith << " GB/s\n";
986
987#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
988 o << " deviceOverlap = " << dp.deviceOverlap << "\n";
989 o << " computeMode = " << dp.computeMode << "\n";
990 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
991#endif
992
993 // TODO: On suppose que tous les GPUs sont les mêmes et donc
994 // que le nombre de SM par GPU est le même. Cela est utilisé pour
995 // calculer le nombre de blocs en mode coopératif.
996 m_multi_processor_count = dp.multiProcessorCount;
997
998 {
999 int least_val = 0;
1000 int greatest_val = 0;
1001 ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
1002 o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
1003 }
1004 std::ostringstream device_uuid_ostr;
1005 {
1006 CUdevice device;
1007 ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
1008 CUuuid device_uuid;
1009 ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
1010 o << " deviceUuid=";
1011 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
1012 o << device_uuid_ostr.str();
1013 o << "\n";
1014 }
1015 String description(ostr.str());
1016 if (is_verbose)
1017 omain << description;
1018
1019 DeviceInfo device_info;
1020 device_info.setDescription(description);
1021 device_info.setDeviceId(DeviceId(i));
1022 device_info.setName(dp.name);
1023 device_info.setWarpSize(dp.warpSize);
1024 device_info.setUUIDAsString(device_uuid_ostr.str());
1025 device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
1026 device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
1027 device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
1028 device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
1029 device_info.setPCIDomainID(dp.pciDomainID);
1030 device_info.setPCIBusID(dp.pciBusID);
1031 device_info.setPCIDeviceID(dp.pciDeviceID);
1032 m_device_info_list.addDevice(device_info);
1033 }
1034
1035 Int32 global_cupti_level = 0;
1036
1037 // Regarde si on active Cupti
1038 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
1039 global_cupti_level = v.value();
1040 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
1041 global_cupti_flush = v.value();
1042 bool do_print_cupti = true;
1043 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
1044 do_print_cupti = (v.value() != 0);
1045
1046 if (global_cupti_level > 0) {
1047#ifndef ARCCORE_HAS_CUDA_CUPTI
1048 ARCCORE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
1049#endif
1050 global_cupti_info.init(global_cupti_level, do_print_cupti);
1051 global_cupti_info.start();
1052 }
1053}
1054
1055/*---------------------------------------------------------------------------*/
1056/*---------------------------------------------------------------------------*/
1057
1059: public IMemoryCopier
1060{
// Copies 'from' into 'to'. When a RunQueue is provided the copy is
// delegated to it (asynchronously when the queue itself is async);
// otherwise a blocking cudaMemcpy with cudaMemcpyDefault is issued.
// NOTE(review): this listing is a documentation extract; the enclosing
// class declaration line (original line 1058) is not visible here.
 1061 void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
 1062 MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
 1063 const RunQueue* queue) override
 1064 {
 1065 if (queue) {
 1066 queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
 1067 return;
 1068 }
 1069 // 'cudaMemcpyDefault' automatically determines the copy direction based
 1070 // solely on the pointer values. It may be worth checking whether using
 1071 // \a from_mem and \a to_mem could improve performance.
 1072 ARCCORE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
 1073 }
1074};
1075
1076/*---------------------------------------------------------------------------*/
1077/*---------------------------------------------------------------------------*/
1078
1079} // End namespace Arcane::Accelerator::Cuda
1080
1081using namespace Arcane;
1082
1083namespace
1084{
1085Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
1086Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
1087
// Registers 'allocator' (and its associated memory pool) with the data
// memory resource manager, for the memory resource this allocator serves.
// NOTE(review): original line 1090 is missing from this extract; it
// presumably declares 'mrm' (cf. the identical pattern at original line
// 1113: MemoryUtils::getDataMemoryResourceMng()->_internal()) -- TODO
// confirm against the full source file.
1088 void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
1089 {
1091 eMemoryResource mem = allocator->memoryResource();
1092 mrm->setAllocator(mem, allocator);
1093 mrm->setMemoryPool(mem, allocator->memoryPool());
1094 }
1095
1096} // namespace
1097
1098/*---------------------------------------------------------------------------*/
1099/*---------------------------------------------------------------------------*/
1100
1101 // This function is the entry point used when this library is
1102 // loaded dynamically.
// It wires the CUDA runtime into Arcane: builds the global runtime object,
// registers it as the CUDA RunQueue runtime, installs the CUDA memory
// allocators (unified, host-pinned, device) plus the memory copier into the
// data memory resource manager, then enumerates the available devices.
// NOTE(review): original line 1111 is missing from this extract -- TODO
// check the full source for the statement elided there.
1103 extern "C" ARCCORE_EXPORT void
1104 arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
1105 {
1106 using namespace Arcane::Accelerator::Cuda;
1107 global_cuda_runtime.build();
1108 Accelerator::Impl::setUsingCUDARuntime(true);
1109 Accelerator::Impl::setCUDARunQueueRuntime(&global_cuda_runtime);
1110 initializeCudaMemoryAllocators();
1112 MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_cuda_memory_allocator);
1113 IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
1114 mrm->setIsAccelerator(true);
1115 _setAllocator(&unified_memory_cuda_memory_allocator);
1116 _setAllocator(&host_pinned_cuda_memory_allocator);
1117 _setAllocator(&device_cuda_memory_allocator);
1118 mrm->setCopier(&global_cuda_memory_copier);
1119 global_cuda_runtime.fillDevices(init_info.isVerbose());
1120 }
1121
1122/*---------------------------------------------------------------------------*/
1123/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro envoyant une exception FatalErrorException.
#define ARCCORE_CHECK_POINTER(ptr)
Macro retournant le pointeur ptr s'il est non nul ou lançant une exception s'il est nul.
Classe de base d'un allocateur spécifique pour accélérateur.
eMemoryResource memoryResource() const final
Ressource mémoire fournie par l'allocateur.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialisation pour la mémoire Device.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialisation pour la mémoire HostPinned.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialisation pour la mémoire UVM.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copie les données de from vers to avec la queue queue.
void barrier() override
Bloque jusqu'à ce que toutes les actions associées à cette file soient terminées.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification avant le lancement de la commande.
bool _barrierNoException() override
Barrière sans exception. Retourne true en cas d'erreur.
Impl::NativeStream nativeStream() override
Pointeur sur la structure interne dépendante de l'implémentation.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Effectue un pré-chargement d'une zone mémoire.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification de fin de lancement de la commande.
void copyMemory(const MemoryCopyArgs &args) override
Effectue une copie entre deux zones mémoire.
Classe singleton pour gérer CUPTI.
Definition Cupti.h:38
Map contenant l'occupation idéale pour un kernel donné.
void freeMemory(void *ptr, size_t size) final
Libère le bloc situé à l'adresse address contenant size octets.
void * allocateMemory(size_t size) final
Alloue un bloc pour size octets.
bool m_use_hint_as_mainly_device
Si vrai, par défaut on considère toutes les allocations comme eMemoryLocationHint::MainlyDevice.
void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
Notifie du changement des arguments spécifiques à l'instance.
bool isHost() const
Indique si l'instance est associée à l'hôte.
bool isAccelerator() const
Indique si l'instance est associée à un accélérateur.
Interface de l'implémentation d'un évènement.
Interface d'un flux d'exécution pour une RunQueue.
Interface du runtime associé à un accélérateur.
bool isCooperative() const
Indique si on lance en mode coopératif (i.e. cudaLaunchCooperativeKernel)
bool isDefault() const
Indique si l'instance a uniquement les valeurs par défaut.
bool isAsync() const
Indique si la file d'exécution est asynchrone.
Definition RunQueue.cc:320
void copyMemory(const MemoryCopyArgs &args) const
Copie des informations entre deux zones mémoires.
Definition RunQueue.cc:237
Informations sur une zone mémoire allouée.
Vue constante sur une zone mémoire contigue contenant des éléments de taille fixe.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr const std::byte * data() const
Pointeur sur la zone mémoire.
Classe template pour convertir un type.
Interface pour les copies mémoire avec support des accélérateurs.
Partie interne à Arcane de 'IMemoryRessourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Positionne l'allocateur pour la ressource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Positionne le pool mémoire pour la ressource r.
virtual void setIsAccelerator(bool v)=0
Indique si un accélérateur est disponible.
virtual void setCopier(IMemoryCopier *copier)=0
Positionne l'instance gérant les copies.
virtual IMemoryResourceMngInternal * _internal()=0
Interface interne.
Interface du gestionnaire de traces.
Classe contenant des informations pour spécialiser les allocations.
Vue modifiable sur une zone mémoire contigue contenant des éléments de taille fixe.
constexpr std::byte * data() const
Pointeur sur la zone mémoire.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr __host__ __device__ pointer data() const noexcept
Pointeur sur le début de la vue.
Definition Span.h:537
constexpr __host__ __device__ SizeType size() const noexcept
Retourne la taille du tableau.
Definition Span.h:325
Chaîne de caractères unicode.
const char * localstr() const
Retourne la conversion de l'instance dans l'encodage UTF-8.
Definition String.cc:228
@ AccessedByHost
Indique que la zone mémoire est accédée par l'hôte.
@ PreferredLocationDevice
Privilégie le positionnement de la mémoire sur l'accélérateur.
@ MostlyRead
Indique que la zone mémoire est principalement en lecture seule.
@ PreferredLocationHost
Privilégie le positionnement de la mémoire sur l'hôte.
@ AccessedByDevice
Indique que la zone mémoire est accédée par l'accélérateur.
ePointerMemoryType
Type de mémoire pour un pointeur.
eExecutionPolicy
Politique d'exécution pour un Runner.
@ CUDA
Politique d'exécution utilisant l'environnement CUDA.
ARCCORE_COMMON_EXPORT IMemoryRessourceMng * getDataMemoryResourceMng()
Gestionnaire de ressource mémoire pour les données.
ARCCORE_COMMON_EXPORT IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Positionne l'allocateur spécifique pour les accélérateurs.
ARCCORE_COMMON_EXPORT void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Positionne la ressource mémoire utilisée pour l'allocateur mémoire des données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
std::int64_t Int64
Type entier signé sur 64 bits.
eMemoryLocationHint
Indices sur la localisation mémoire attendue.
@ MainlyHost
Indique que la donnée sera plutôt utilisée sur CPU.
@ HostAndDeviceMostlyRead
Indique que la donnée sera utilisée à la fois sur accélérateur et sur CPU et qu'elle ne sera pas souvent modifiée.
@ MainlyDevice
Indique que la donnée sera plutôt utilisée sur accélérateur.
double Real
Type représentant un réel.
eMemoryResource
Liste des ressources mémoire disponibles.
@ HostPinned
Alloue sur l'hôte.
@ UnifiedMemory
Alloue en utilisant la mémoire unifiée.
@ Device
Alloue sur le device.
std::int32_t Int32
Type entier signé sur 32 bits.