Arcane v4.1.2.0
Developer documentation
CudaAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaAcceleratorRuntime.cc (C) 2000-2025 */
9/* */
10/* Runtime for 'Cuda'.                                                      */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/accelerator_native/CudaAccelerator.h"
15
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
18
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
21
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
35
36#include "arccore/accelerator_native/runtime/Cupti.h"
37
38#include <sstream>
39#include <unordered_map>
40#include <mutex>
41
42#include <cuda.h>
43
44// For std::memset
45#include <cstring>
46
47#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
48#include <nvtx3/nvToolsExt.h>
49#endif
50
51namespace Arcane::Accelerator::Cuda
52{
53using Impl::KernelLaunchArgs;
54
55namespace
56{
57 Int32 global_cupti_flush = 0;
58 CuptiInfo global_cupti_info;
59} // namespace
60
61
62/*---------------------------------------------------------------------------*/
63/*---------------------------------------------------------------------------*/
64
65// Starting with CUDA 13, there is a new type, cudaMemLocation,
66// used by methods such as cudaMemAdvise or cudaMemPrefetchAsync.
67#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
68inline cudaMemLocation
69_getMemoryLocation(int device_id)
70{
71 cudaMemLocation mem_location;
72 mem_location.type = cudaMemLocationTypeDevice;
73 mem_location.id = device_id;
74 if (device_id == cudaCpuDeviceId)
75 mem_location.type = cudaMemLocationTypeHost;
76 else {
77 mem_location.type = cudaMemLocationTypeDevice;
78 mem_location.id = device_id;
79 }
80 return mem_location;
81}
82#else
83inline int
84_getMemoryLocation(int device_id)
85{
86 return device_id;
87}
88#endif
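// Usage sketch (illustrative, with hypothetical 'ptr'/'size' values): callers below pass
// the result of _getMemoryLocation() directly as the last argument of cudaMemAdvise(),
// so the same call site compiles whether that argument is a plain device id (CUDA < 13)
// or a cudaMemLocation structure (CUDA >= 13):
//   int device_id = 0;
//   cudaGetDevice(&device_id);
//   cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id));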
89
90/*---------------------------------------------------------------------------*/
91/*---------------------------------------------------------------------------*/
92
94{
95 public:
96
97 virtual ~ConcreteAllocator() = default;
98
99 public:
100
101 virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
102 virtual cudaError_t _deallocate(void* ptr) = 0;
103};
104
105/*---------------------------------------------------------------------------*/
106/*---------------------------------------------------------------------------*/
107
108template <typename ConcreteAllocatorType>
109class UnderlyingAllocator
111{
112 public:
113
114 UnderlyingAllocator() = default;
115
116 public:
117
118 void* allocateMemory(size_t size) final
119 {
120 void* out = nullptr;
121 ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
122 return out;
123 }
124 void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
125 {
126 ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
127 }
128
129 void doMemoryCopy(void* destination, const void* source, Int64 size) final
130 {
131 ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
132 }
133
134 eMemoryResource memoryResource() const final
135 {
136 return m_concrete_allocator.memoryResource();
137 }
138
139 public:
140
141 ConcreteAllocatorType m_concrete_allocator;
142};
143
144/*---------------------------------------------------------------------------*/
145/*---------------------------------------------------------------------------*/
146
147class UnifiedMemoryConcreteAllocator
148: public ConcreteAllocator
149{
150 public:
151
152 UnifiedMemoryConcreteAllocator()
153 {
154 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
155 m_use_ats = v.value();
156 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MEMORY_HINT_ON_DEVICE", true))
157 m_use_hint_as_mainly_device = (v.value() != 0);
158 }
159
160 cudaError_t _deallocate(void* ptr) final
161 {
162 if (m_use_ats) {
163 ::free(ptr);
164 return cudaSuccess;
165 }
166 //std::cout << "CUDA_MANAGED_FREE ptr=" << ptr << "\n";
167 return ::cudaFree(ptr);
168 }
169
170 cudaError_t _allocate(void** ptr, size_t new_size) final
171 {
172 if (m_use_ats) {
173 *ptr = ::aligned_alloc(128, new_size);
174 }
175 else {
176 auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
177 //std::cout << "CUDA_MANAGED_MALLOC ptr=" << (*ptr) << " size=" << new_size << "\n";
178 //if (new_size < 4000)
179 //std::cout << "STACK=" << platform::getStackTrace() << "\n";
180
181 if (r != cudaSuccess)
182 return r;
183
184 // If requested, indicate that we prefer to allocate on the GPU.
185 // NOTE: In that case the current device is used to set the preferred
186 // location. When a MemoryPool is used, this allocation is only performed
187 // once. If the default device of a thread changes during the computation,
188 // there will be an inconsistency. To avoid that we could call cudaMemAdvise()
189 // for every allocation (via _applyHint()), but these operations are rather
190 // costly and with many allocations this can lead to a loss of performance.
191 if (m_use_hint_as_mainly_device) {
192 int device_id = 0;
193 void* p = *ptr;
194 cudaGetDevice(&device_id);
195 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
196 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
197 }
198 }
199
200 return cudaSuccess;
201 }
202
203 constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }
204
205 public:
206
207 bool m_use_ats = false;
208 //! If true, treat all allocations as eMemoryLocationHint::MainlyDevice by default
209 bool m_use_hint_as_mainly_device = false;
210};
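// Example (illustrative): running with ARCANE_CUDA_USE_ALLOC_ATS=1 makes this allocator
// fall back to plain aligned_alloc()/free() and rely on the system (ATS) to expose the
// host allocation to the GPU, while ARCANE_CUDA_MEMORY_HINT_ON_DEVICE=1 additionally
// requests cudaMemAdviseSetPreferredLocation on the current device right after
// cudaMallocManaged().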
211
212/*---------------------------------------------------------------------------*/
213/*---------------------------------------------------------------------------*/
221class UnifiedMemoryCudaMemoryAllocator
222: public AcceleratorMemoryAllocatorBase
223{
224 public:
225 public:
226
227 UnifiedMemoryCudaMemoryAllocator()
228 : AcceleratorMemoryAllocatorBase("UnifiedMemoryCudaMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
229 {
230 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MALLOC_TRACE", true))
231 _setTraceLevel(v.value());
232 }
233
234 void initialize()
235 {
236 _doInitializeUVM(true);
237 }
238
239 public:
240
241 void notifyMemoryArgsChanged([[maybe_unused]] MemoryAllocationArgs old_args,
242 MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
243 {
244 void* p = ptr.baseAddress();
245 Int64 s = ptr.capacity();
246 if (p && s > 0)
247 _applyHint(ptr.baseAddress(), ptr.size(), new_args);
248 }
249
250 protected:
251
252 void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)
253 {
254 eMemoryLocationHint hint = args.memoryLocationHint();
255 // Use the active device to set the default GPU.
256 // This is only done when the \a hint requires it, to avoid calling
257 // cudaGetDevice() every time.
258 int device_id = 0;
259 if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
260 cudaGetDevice(&device_id);
261 }
262 auto device_memory_location = _getMemoryLocation(device_id);
263 auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);
264
265 //std::cout << "SET_MEMORY_HINT name=" << args.arrayName() << " size=" << new_size << " hint=" << (int)hint << "\n";
266 if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
267 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
268 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
269 }
270 if (hint == eMemoryLocationHint::MainlyHost) {
271 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
272 //ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, 0));
273 }
274 if (hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
275 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
276 }
277 }
278 void _removeHint(void* p, size_t size, MemoryAllocationArgs args)
279 {
280 eMemoryLocationHint hint = args.memoryLocationHint();
281 if (hint == eMemoryLocationHint::None)
282 return;
283 int device_id = 0;
284 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
285 }
286
287 private:
288
289 bool m_use_ats = false;
290};
291
292/*---------------------------------------------------------------------------*/
293/*---------------------------------------------------------------------------*/
294
295class HostPinnedConcreteAllocator
296: public ConcreteAllocator
297{
298 public:
299
300 cudaError_t _allocate(void** ptr, size_t new_size) final
301 {
302 return ::cudaMallocHost(ptr, new_size);
303 }
304 cudaError_t _deallocate(void* ptr) final
305 {
306 return ::cudaFreeHost(ptr);
307 }
308 constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
309};
310
311/*---------------------------------------------------------------------------*/
312/*---------------------------------------------------------------------------*/
313
314class HostPinnedCudaMemoryAllocator
315: public AcceleratorMemoryAllocatorBase
316{
317 public:
318 public:
319
320 HostPinnedCudaMemoryAllocator()
321 : AcceleratorMemoryAllocatorBase("HostPinnedCudaMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
322 {
323 }
324
325 public:
326
327 void initialize()
328 {
329 _doInitializeHostPinned(true);
330 }
331};
332
333/*---------------------------------------------------------------------------*/
334/*---------------------------------------------------------------------------*/
335
336class DeviceConcreteAllocator
337: public ConcreteAllocator
338{
339 public:
340
341 DeviceConcreteAllocator()
342 {
343 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
344 m_use_ats = v.value();
345 }
346
347 cudaError_t _allocate(void** ptr, size_t new_size) final
348 {
349 if (m_use_ats) {
350 // FIXME: it does not work on WIN32
351 *ptr = std::aligned_alloc(128, new_size);
352 if (*ptr)
353 return cudaSuccess;
354 return cudaErrorMemoryAllocation;
355 }
356 cudaError_t r = ::cudaMalloc(ptr, new_size);
357 //std::cout << "ALLOCATE_DEVICE ptr=" << (*ptr) << " size=" << new_size << " r=" << (int)r << "\n";
358 return r;
359 }
360 cudaError_t _deallocate(void* ptr) final
361 {
362 if (m_use_ats) {
363 std::free(ptr);
364 return cudaSuccess;
365 }
366 //std::cout << "FREE_DEVICE ptr=" << ptr << "\n";
367 return ::cudaFree(ptr);
368 }
369
370 constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }
371
372 private:
373
374 bool m_use_ats = false;
375};
376
377/*---------------------------------------------------------------------------*/
378/*---------------------------------------------------------------------------*/
379
380class DeviceCudaMemoryAllocator
381: public AcceleratorMemoryAllocatorBase
382{
383
384 public:
385
386 DeviceCudaMemoryAllocator()
387 : AcceleratorMemoryAllocatorBase("DeviceCudaMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
388 {
389 }
390
391 public:
392
393 void initialize()
394 {
395 _doInitializeDevice(true);
396 }
397};
398
399/*---------------------------------------------------------------------------*/
400/*---------------------------------------------------------------------------*/
401
402namespace
403{
404 UnifiedMemoryCudaMemoryAllocator unified_memory_cuda_memory_allocator;
405 HostPinnedCudaMemoryAllocator host_pinned_cuda_memory_allocator;
406 DeviceCudaMemoryAllocator device_cuda_memory_allocator;
407} // namespace
408
409/*---------------------------------------------------------------------------*/
410/*---------------------------------------------------------------------------*/
411
412void
413initializeCudaMemoryAllocators()
414{
415 unified_memory_cuda_memory_allocator.initialize();
416 device_cuda_memory_allocator.initialize();
417 host_pinned_cuda_memory_allocator.initialize();
418}
419
420void
421finalizeCudaMemoryAllocators(ITraceMng* tm)
422{
423 unified_memory_cuda_memory_allocator.finalize(tm);
424 device_cuda_memory_allocator.finalize(tm);
425 host_pinned_cuda_memory_allocator.finalize(tm);
426}
427
428/*---------------------------------------------------------------------------*/
429/*---------------------------------------------------------------------------*/
430
431void
432arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
433{
434 if (e == CUDA_SUCCESS)
435 return;
436 const char* error_name = nullptr;
437 CUresult e2 = cuGetErrorName(e, &error_name);
438 if (e2 != CUDA_SUCCESS)
439 error_name = "Unknown";
440
441 const char* error_message = nullptr;
442 CUresult e3 = cuGetErrorString(e, &error_message);
443 if (e3 != CUDA_SUCCESS)
444 error_message = "Unknown";
445
446 ARCCORE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
447 ti, e, error_name, error_message);
448}
449
450/*---------------------------------------------------------------------------*/
451/*---------------------------------------------------------------------------*/
460class OccupancyMap
461{
462 public:
463
464 Int32 getNbThreadPerBlock(const void* kernel_ptr)
465 {
466 std::scoped_lock lock(m_mutex);
467 auto x = m_nb_thread_per_block_map.find(kernel_ptr);
468 if (x != m_nb_thread_per_block_map.end())
469 return x->second;
470 int min_grid_size = 0;
471 int computed_block_size = 0;
472 int wanted_shared_memory = 0;
473 cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
474 if (r != cudaSuccess)
475 computed_block_size = 0;
476 int num_block_0 = 0;
477 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
478 int num_block_1 = 0;
479 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
480
481 cudaFuncAttributes func_attr;
482 cudaFuncGetAttributes(&func_attr, kernel_ptr);
483 m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
484 std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
485 << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs;
486
487#if CUDART_VERSION >= 12040
488 // cudaFuncGetName is only available in 12.4
489 const char* func_name = nullptr;
490 cudaFuncGetName(&func_name, kernel_ptr);
491 std::cout << " name=" << func_name << "\n";
492#endif
493
494 return computed_block_size;
495 }
496
497 private:
498
499 std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
500 std::mutex m_mutex;
501};
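// Usage sketch: CudaRunnerRuntime owns a single OccupancyMap and queries it from
// computeKernalLaunchArgs() when ARCANE_USE_COMPUTED_OCCUPANCY is set; the cudaOccupancy*
// calls above are only made on the first launch of a given kernel pointer, later launches
// reuse the cached block size.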
502
503/*---------------------------------------------------------------------------*/
504/*---------------------------------------------------------------------------*/
505
506class CudaRunQueueStream
507: public Impl::IRunQueueStream
508{
509 public:
510
511 CudaRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
512 : m_runtime(runtime)
513 {
514 if (bi.isDefault())
515 ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
516 else {
517 int priority = bi.priority();
518 ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
519 }
520 }
521 ~CudaRunQueueStream() override
522 {
523 ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
524 }
525
526 public:
527
528 void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override
529 {
530#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
531 auto kname = c.kernelName();
532 if (kname.empty())
533 nvtxRangePush(c.traceInfo().name());
534 else
535 nvtxRangePush(kname.localstr());
536#endif
537 return m_runtime->notifyBeginLaunchKernel();
538 }
539 void notifyEndLaunchKernel([[maybe_unused]] Impl::RunCommandImpl&) override
540 {
541#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
542 nvtxRangePop();
543#endif
544 return m_runtime->notifyEndLaunchKernel();
545 }
546 void barrier() override
547 {
548 ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
549 if (global_cupti_flush > 0)
550 global_cupti_info.flush();
551 }
552 bool _barrierNoException() override
553 {
554 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
555 }
556 void copyMemory(const MemoryCopyArgs& args) override
557 {
558 auto source_bytes = args.source().bytes();
559 auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
560 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
561 ARCCORE_CHECK_CUDA(r);
562 if (!args.isAsync())
563 barrier();
564 }
565 void prefetchMemory(const MemoryPrefetchArgs& args) override
566 {
567 auto src = args.source().bytes();
568 if (src.size() == 0)
569 return;
570 DeviceId d = args.deviceId();
571 int device = cudaCpuDeviceId;
572 if (!d.isHost())
573 device = d.asInt32();
574 //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
575 // << " size=" << args.source().size() << " data=" << src.data() << "\n";
576 auto mem_location = _getMemoryLocation(device);
577#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
578 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
579#else
580 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
581#endif
582 ARCCORE_CHECK_CUDA(r);
583 if (!args.isAsync())
584 barrier();
585 }
586 Impl::NativeStream nativeStream() override
587 {
588 return Impl::NativeStream(&m_cuda_stream);
589 }
590
591 public:
592
593 cudaStream_t trueStream() const
594 {
595 return m_cuda_stream;
596 }
597
598 private:
599
600 Impl::IRunnerRuntime* m_runtime = nullptr;
601 cudaStream_t m_cuda_stream = nullptr;
602};
603
604/*---------------------------------------------------------------------------*/
605/*---------------------------------------------------------------------------*/
606
607class CudaRunQueueEvent
608: public Impl::IRunQueueEventImpl
609{
610 public:
611
612 explicit CudaRunQueueEvent(bool has_timer)
613 {
614 if (has_timer)
615 ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
616 else
617 ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
618 }
619 ~CudaRunQueueEvent() override
620 {
621 ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
622 }
623
624 public:
625
626 // Record the event within a RunQueue
627 void recordQueue(Impl::IRunQueueStream* stream) final
628 {
629 auto* rq = static_cast<CudaRunQueueStream*>(stream);
630 ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
631 }
632
633 void wait() final
634 {
635 ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
636 }
637
638 void waitForEvent(Impl::IRunQueueStream* stream) final
639 {
640 auto* rq = static_cast<CudaRunQueueStream*>(stream);
641 ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
642 }
643
644 Int64 elapsedTime(IRunQueueEventImpl* start_event) final
645 {
646 // NOTE: the events must have been created with the timer enabled
647 ARCCORE_CHECK_POINTER(start_event);
648 auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
649 float time_in_ms = 0.0;
650
651 // TODO: check whether this is necessary
652 // ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
653
654 ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
655 double x = time_in_ms * 1.0e6;
656 Int64 nano_time = static_cast<Int64>(x);
657 return nano_time;
658 }
659
660 bool hasPendingWork() final
661 {
662 cudaError_t v = cudaEventQuery(m_cuda_event);
663 if (v == cudaErrorNotReady)
664 return true;
665 ARCCORE_CHECK_CUDA(v);
666 return false;
667 }
668
669 private:
670
671 cudaEvent_t m_cuda_event;
672};
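// Note: elapsedTime() converts the float millisecond value returned by
// cudaEventElapsedTime() to nanoseconds, e.g. a reported 1.5 ms becomes 1'500'000 ns.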
673
674/*---------------------------------------------------------------------------*/
675/*---------------------------------------------------------------------------*/
676
677class CudaRunnerRuntime
678: public Impl::IRunnerRuntime
679{
680 public:
681
682 ~CudaRunnerRuntime() override = default;
683
684 public:
685
686 void notifyBeginLaunchKernel() override
687 {
688 ++m_nb_kernel_launched;
689 if (m_is_verbose)
690 std::cout << "BEGIN CUDA KERNEL!\n";
691 }
692 void notifyEndLaunchKernel() override
693 {
694 ARCCORE_CHECK_CUDA(cudaGetLastError());
695 if (m_is_verbose)
696 std::cout << "END CUDA KERNEL!\n";
697 }
698 void barrier() override
699 {
700 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
701 }
702 eExecutionPolicy executionPolicy() const override
703 {
704 return eExecutionPolicy::CUDA;
705 }
706 Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
707 {
708 return new CudaRunQueueStream(this, bi);
709 }
710 Impl::IRunQueueEventImpl* createEventImpl() override
711 {
712 return new CudaRunQueueEvent(false);
713 }
714 Impl::IRunQueueEventImpl* createEventImplWithTimer() override
715 {
716 return new CudaRunQueueEvent(true);
717 }
718 void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
719 {
720 auto v = buffer.bytes();
721 const void* ptr = v.data();
722 size_t count = v.size();
723 int device = device_id.asInt32();
724 cudaMemoryAdvise cuda_advise;
725
726 if (advice == eMemoryAdvice::MostlyRead)
727 cuda_advise = cudaMemAdviseSetReadMostly;
728 else if (advice == eMemoryAdvice::PreferredLocationDevice)
729 cuda_advise = cudaMemAdviseSetPreferredLocation;
730 else if (advice == eMemoryAdvice::AccessedByDevice)
731 cuda_advise = cudaMemAdviseSetAccessedBy;
732 else if (advice == eMemoryAdvice::PreferredLocationHost) {
733 cuda_advise = cudaMemAdviseSetPreferredLocation;
734 device = cudaCpuDeviceId;
735 }
736 else if (advice == eMemoryAdvice::AccessedByHost) {
737 cuda_advise = cudaMemAdviseSetAccessedBy;
738 device = cudaCpuDeviceId;
739 }
740 else
741 return;
742 //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
743 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
744 }
745 void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
746 {
747 auto v = buffer.bytes();
748 const void* ptr = v.data();
749 size_t count = v.size();
750 int device = device_id.asInt32();
751 cudaMemoryAdvise cuda_advise;
752
753 if (advice == eMemoryAdvice::MostlyRead)
754 cuda_advise = cudaMemAdviseUnsetReadMostly;
755 else if (advice == eMemoryAdvice::PreferredLocationDevice)
756 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
757 else if (advice == eMemoryAdvice::AccessedByDevice)
758 cuda_advise = cudaMemAdviseUnsetAccessedBy;
759 else if (advice == eMemoryAdvice::PreferredLocationHost) {
760 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
761 device = cudaCpuDeviceId;
762 }
763 else if (advice == eMemoryAdvice::AccessedByHost) {
764 cuda_advise = cudaMemAdviseUnsetAccessedBy;
765 device = cudaCpuDeviceId;
766 }
767 else
768 return;
769 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
770 }
771
772 void setCurrentDevice(DeviceId device_id) final
773 {
774 Int32 id = device_id.asInt32();
775 if (!device_id.isAccelerator())
776 ARCCORE_FATAL("Device {0} is not an accelerator device", id);
777 ARCCORE_CHECK_CUDA(cudaSetDevice(id));
778 }
779
780 const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }
781
782 void startProfiling() override
783 {
784 global_cupti_info.start();
785 }
786
787 void stopProfiling() override
788 {
789 global_cupti_info.stop();
790 }
791
792 bool isProfilingActive() override
793 {
794 return global_cupti_info.isActive();
795 }
796
797 void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
798 {
799 cudaPointerAttributes ca;
800 ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
801 // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
802 // as the corresponding CUDA type, so a simple cast is enough.
803 auto mem_type = static_cast<ePointerMemoryType>(ca.type);
804 _fillPointerAttribute(attribute, mem_type, ca.device,
805 ptr, ca.devicePointer, ca.hostPointer);
806 }
807
808 DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
809 {
810 int d = 0;
811 int wanted_d = device_id.asInt32();
812 ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
813 if (d != wanted_d)
814 ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
815 size_t free_mem = 0;
816 size_t total_mem = 0;
817 ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
818 if (d != wanted_d)
819 ARCCORE_CHECK_CUDA(cudaSetDevice(d));
820 DeviceMemoryInfo dmi;
821 dmi.setFreeMemory(free_mem);
822 dmi.setTotalMemory(total_mem);
823 return dmi;
824 }
825
826 void pushProfilerRange(const String& name, Int32 color_rgb) override
827 {
828#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
829 if (color_rgb >= 0) {
830 // NOTE: Ideally this would be: nvtxEventAttributes_t eventAttrib = { 0 };
831 // but that triggers many 'missing initializer for member' warnings.
832 nvtxEventAttributes_t eventAttrib;
833 std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
834 eventAttrib.version = NVTX_VERSION;
835 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
836 eventAttrib.colorType = NVTX_COLOR_ARGB;
837 eventAttrib.color = color_rgb;
838 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
839 eventAttrib.message.ascii = name.localstr();
840 nvtxRangePushEx(&eventAttrib);
841 }
842 else
843 nvtxRangePush(name.localstr());
844#endif
845 }
846 void popProfilerRange() override
847 {
848#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
849 nvtxRangePop();
850#endif
851 }
852
853 void finalize(ITraceMng* tm) override
854 {
855 finalizeCudaMemoryAllocators(tm);
856 }
857
858 KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
859 const void* kernel_ptr,
860 Int64 total_loop_size) override
861 {
862 if (!m_use_computed_occupancy)
863 return orig_args;
864 Int32 wanted_shared_memory = orig_args.sharedMemorySize();
865 if (wanted_shared_memory < 0)
866 wanted_shared_memory = 0;
867 // For now, no computation is done when the requested shared memory is non-zero.
868 if (wanted_shared_memory != 0)
869 return orig_args;
870 Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
871 if (computed_block_size == 0)
872 return orig_args;
873 Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
874 int blocks_per_grid = CheckedConvert::toInt32(big_b);
875 return { blocks_per_grid, computed_block_size, wanted_shared_memory };
876 }
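// Worked example (illustrative numbers): with total_loop_size = 1'000'000 and a
// computed_block_size of 256, blocks_per_grid = (1'000'000 + 255) / 256 = 3907,
// i.e. the kernel is launched with 3907 blocks of 256 threads and no dynamic
// shared memory.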
877
878 public:
879
880 void fillDevices(bool is_verbose);
881 void build()
882 {
883 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
884 m_use_computed_occupancy = v.value();
885 }
886
887 private:
888
889 Int64 m_nb_kernel_launched = 0;
890 bool m_is_verbose = false;
891 bool m_use_computed_occupancy = false;
892 Impl::DeviceInfoList m_device_info_list;
893 OccupancyMap m_occupancy_map;
894};
895
896/*---------------------------------------------------------------------------*/
897/*---------------------------------------------------------------------------*/
898
899void CudaRunnerRuntime::
900fillDevices(bool is_verbose)
901{
902 int nb_device = 0;
903 ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
904 std::ostream& omain = std::cout;
905 if (is_verbose)
906 omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
907 for (int i = 0; i < nb_device; ++i) {
908 cudaDeviceProp dp;
909 cudaGetDeviceProperties(&dp, i);
910 int runtime_version = 0;
911 cudaRuntimeGetVersion(&runtime_version);
912 int driver_version = 0;
913 cudaDriverGetVersion(&driver_version);
914 std::ostringstream ostr;
915 std::ostream& o = ostr;
916 o << "Device " << i << " name=" << dp.name << "\n";
917 o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
918 o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
919 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
920 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
921 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
922 o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
923 o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
924 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
925 o << " warpSize = " << dp.warpSize << "\n";
926 o << " memPitch = " << dp.memPitch << "\n";
927 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
928 o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
929 o << " totalConstMem = " << dp.totalConstMem << "\n";
930 o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
931 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
932 o << " integrated = " << dp.integrated << "\n";
933 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
934 o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
935 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
936 o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
937 o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
938 o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
939 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
940 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
941 << " " << dp.maxThreadsDim[2] << "\n";
942 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
943 << " " << dp.maxGridSize[2] << "\n";
944 o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
945#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
946 o << " clockRate = " << dp.clockRate << "\n";
947 o << " deviceOverlap = " << dp.deviceOverlap << "\n";
948 o << " computeMode = " << dp.computeMode << "\n";
949 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
950#endif
951
952 {
953 int least_val = 0;
954 int greatest_val = 0;
955 ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
956 o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
957 }
958 std::ostringstream device_uuid_ostr;
959 {
960 CUdevice device;
961 ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
962 CUuuid device_uuid;
963 ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
964 o << " deviceUuid=";
965 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
966 o << device_uuid_ostr.str();
967 o << "\n";
968 }
969 String description(ostr.str());
970 if (is_verbose)
971 omain << description;
972
973 DeviceInfo device_info;
974 device_info.setDescription(description);
975 device_info.setDeviceId(DeviceId(i));
976 device_info.setName(dp.name);
977 device_info.setWarpSize(dp.warpSize);
978 device_info.setUUIDAsString(device_uuid_ostr.str());
979 device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
980 device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
981 device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
982 device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
983 device_info.setPCIDomainID(dp.pciDomainID);
984 device_info.setPCIBusID(dp.pciBusID);
985 device_info.setPCIDeviceID(dp.pciDeviceID);
986 m_device_info_list.addDevice(device_info);
987 }
988
989 Int32 global_cupti_level = 0;
990
991 // Check whether CUPTI should be enabled
992 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
993 global_cupti_level = v.value();
994 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
995 global_cupti_flush = v.value();
996 bool do_print_cupti = true;
997 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
998 do_print_cupti = (v.value() != 0);
999
1000 if (global_cupti_level > 0) {
1001#ifndef ARCCORE_HAS_CUDA_CUPTI
1002 ARCCORE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
1003#endif
1004 global_cupti_info.init(global_cupti_level, do_print_cupti);
1005 global_cupti_info.start();
1006 }
1007}
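// Example (illustrative): launching an application with ARCANE_CUPTI_LEVEL=1 enables
// CUPTI tracing at startup, and ARCANE_CUPTI_FLUSH=1 additionally flushes the collected
// CUPTI records at every CudaRunQueueStream::barrier().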
1008
1009/*---------------------------------------------------------------------------*/
1010/*---------------------------------------------------------------------------*/
1011
1012class CudaMemoryCopier
1013: public IMemoryCopier
1014{
1015 void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
1016 MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
1017 const RunQueue* queue) override
1018 {
1019 if (queue) {
1020 queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
1021 return;
1022 }
1023 // 'cudaMemcpyDefault' automatically determines what to do based solely
1024 // on the pointer values. It would be worth checking whether using
1025 // \a from_mem and \a to_mem could improve performance.
1026 ARCCORE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
1027 }
1028};
1029
1030/*---------------------------------------------------------------------------*/
1031/*---------------------------------------------------------------------------*/
1032
1033} // End namespace Arcane::Accelerator::Cuda
1034
1035using namespace Arcane;
1036
1037namespace
1038{
1039Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
1040Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
1041
1042void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
1043{
1044 IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
1045 eMemoryResource mem = allocator->memoryResource();
1046 mrm->setAllocator(mem, allocator);
1047 mrm->setMemoryPool(mem, allocator->memoryPool());
1048}
1049
1050} // namespace
1051
1052/*---------------------------------------------------------------------------*/
1053/*---------------------------------------------------------------------------*/
1054
1055// This function is the entry point used when this library is
1056// loaded dynamically.
1057extern "C" ARCCORE_EXPORT void
1058arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
1059{
1060 using namespace Arcane::Accelerator::Cuda;
1061 global_cuda_runtime.build();
1062 Accelerator::Impl::setUsingCUDARuntime(true);
1063 Accelerator::Impl::setCUDARunQueueRuntime(&global_cuda_runtime);
1064 initializeCudaMemoryAllocators();
1065 MemoryUtils::setDefaultDataMemoryResource(eMemoryResource::UnifiedMemory);
1066 MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_cuda_memory_allocator);
1067 IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
1068 mrm->setIsAccelerator(true);
1069 _setAllocator(&unified_memory_cuda_memory_allocator);
1070 _setAllocator(&host_pinned_cuda_memory_allocator);
1071 _setAllocator(&device_cuda_memory_allocator);
1072 mrm->setCopier(&global_cuda_memory_copier);
1073 global_cuda_runtime.fillDevices(init_info.isVerbose());
1074}
1075
1076/*---------------------------------------------------------------------------*/
1077/*---------------------------------------------------------------------------*/