CudaAcceleratorRuntime.cc
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CudaAcceleratorRuntime.cc                                   (C) 2000-2025 */
/*                                                                           */
/* Runtime for 'Cuda'.                                                       */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arccore/accelerator_native/CudaAccelerator.h"

#include "arccore/base/CheckedConvert.h"
#include "arccore/base/FatalErrorException.h"

#include "arccore/common/internal/MemoryUtilsInternal.h"
#include "arccore/common/internal/IMemoryResourceMngInternal.h"

#include "arccore/common/accelerator/RunQueueBuildInfo.h"
#include "arccore/common/accelerator/Memory.h"
#include "arccore/common/accelerator/DeviceInfoList.h"
#include "arccore/common/accelerator/KernelLaunchArgs.h"
#include "arccore/common/accelerator/RunQueue.h"
#include "arccore/common/accelerator/DeviceMemoryInfo.h"
#include "arccore/common/accelerator/NativeStream.h"
#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
#include "arccore/common/accelerator/internal/RunCommandImpl.h"
#include "arccore/common/accelerator/internal/IRunQueueStream.h"
#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"

#include "arccore/accelerator_native/runtime/Cupti.h"

#include <sstream>
#include <unordered_map>
#include <mutex>

#include <cuda.h>

// For std::memset
#include <cstring>

#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif

namespace Arcane::Accelerator::Cuda
{
using Impl::KernelLaunchArgs;

namespace
{
  Int32 global_cupti_flush = 0;
  CuptiInfo global_cupti_info;
} // namespace


/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Starting with CUDA 13 there is a new type, cudaMemLocation, used by
// methods such as cudaMemAdvise or cudaMemPrefetchAsync.
#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
inline cudaMemLocation
_getMemoryLocation(int device_id)
{
  cudaMemLocation mem_location;
  mem_location.id = device_id;
  if (device_id == cudaCpuDeviceId)
    mem_location.type = cudaMemLocationTypeHost;
  else
    mem_location.type = cudaMemLocationTypeDevice;
  return mem_location;
}
#else
inline int
_getMemoryLocation(int device_id)
{
  return device_id;
}
#endif
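
// Usage note (illustrative, not additional API): the helper above lets the same
// call sites compile against both CUDA generations, since cudaMemAdvise() and
// cudaMemPrefetchAsync() expect a cudaMemLocation starting with CUDA 13 and a
// plain device id before that. A typical call, matching the ones further down
// in this file, looks like:
//   ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, size, cudaMemAdviseSetPreferredLocation,
//                                    _getMemoryLocation(device_id)));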

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class ConcreteAllocator
{
 public:

  virtual ~ConcreteAllocator() = default;

 public:

  virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
  virtual cudaError_t _deallocate(void* ptr) = 0;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename ConcreteAllocatorType>
class UnderlyingAllocator
{
 public:

  UnderlyingAllocator() = default;

 public:

  void* allocateMemory(size_t size) final
  {
    void* out = nullptr;
    ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
    return out;
  }
  void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
  {
    ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
  }

  void doMemoryCopy(void* destination, const void* source, Int64 size) final
  {
    ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
  }

  eMemoryResource memoryResource() const final
  {
    return m_concrete_allocator.memoryResource();
  }

 public:

  ConcreteAllocatorType m_concrete_allocator;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class UnifiedMemoryConcreteAllocator
: public ConcreteAllocator
{
 public:

  UnifiedMemoryConcreteAllocator()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
      m_use_ats = v.value();
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MEMORY_HINT_ON_DEVICE", true))
      m_use_hint_as_mainly_device = (v.value() != 0);
  }

  cudaError_t _deallocate(void* ptr) final
  {
    if (m_use_ats) {
      ::free(ptr);
      return cudaSuccess;
    }
    //std::cout << "CUDA_MANAGED_FREE ptr=" << ptr << "\n";
    return ::cudaFree(ptr);
  }

  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    if (m_use_ats) {
      *ptr = ::aligned_alloc(128, new_size);
    }
    else {
      auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
      //std::cout << "CUDA_MANAGED_MALLOC ptr=" << (*ptr) << " size=" << new_size << "\n";
      //if (new_size < 4000)
      //std::cout << "STACK=" << platform::getStackTrace() << "\n";

      if (r != cudaSuccess)
        return r;

      // If requested, indicate that we prefer to allocate on the GPU.
      // NOTE: in that case the current device is used to set the preferred
      // location. When a MemoryPool is used, this allocation is only done once.
      // If the default device of a thread changes during the computation this
      // becomes inconsistent. To avoid that, one could call cudaMemAdvise()
      // for every allocation (via _applyHint()), but these operations are rather
      // expensive and, with many allocations, may cause a loss of performance.
      if (m_use_hint_as_mainly_device) {
        int device_id = 0;
        void* p = *ptr;
        cudaGetDevice(&device_id);
        ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
        ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
      }
    }

    return cudaSuccess;
  }

  constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }

 public:

  bool m_use_ats = false;
  //! If true, treat all allocations as eMemoryLocationHint::MainlyDevice by default.
  bool m_use_hint_as_mainly_device = false;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class UnifiedMemoryCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  UnifiedMemoryCudaMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("UnifiedMemoryCudaMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MALLOC_TRACE", true))
      _setTraceLevel(v.value());
  }

  void initialize()
  {
    _doInitializeUVM(true);
  }

 public:

  void notifyMemoryArgsChanged([[maybe_unused]] MemoryAllocationArgs old_args,
                               MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
  {
    void* p = ptr.baseAddress();
    Int64 s = ptr.capacity();
    if (p && s > 0)
      _applyHint(ptr.baseAddress(), ptr.size(), new_args);
  }

 protected:

  void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)
  {
    eMemoryLocationHint hint = args.memoryLocationHint();
    // Use the active device to choose the default GPU.
    // Do it only when the \a hint requires it, to avoid calling
    // cudaGetDevice() every time.
    int device_id = 0;
    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      cudaGetDevice(&device_id);
    }
    auto device_memory_location = _getMemoryLocation(device_id);
    auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);

    //std::cout << "SET_MEMORY_HINT name=" << args.arrayName() << " size=" << new_size << " hint=" << (int)hint << "\n";
    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
    }
    if (hint == eMemoryLocationHint::MainlyHost) {
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
      //ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, 0));
    }
    if (hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
    }
  }
  void _removeHint(void* p, size_t size, MemoryAllocationArgs args)
  {
    eMemoryLocationHint hint = args.memoryLocationHint();
    if (hint == eMemoryLocationHint::None)
      return;
    int device_id = 0;
    ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
  }

 private:

  bool m_use_ats = false;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class HostPinnedConcreteAllocator
: public ConcreteAllocator
{
 public:

  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    return ::cudaMallocHost(ptr, new_size);
  }
  cudaError_t _deallocate(void* ptr) final
  {
    return ::cudaFreeHost(ptr);
  }
  constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class HostPinnedCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  HostPinnedCudaMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("HostPinnedCudaMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
  {
  }

 public:

  void initialize()
  {
    _doInitializeHostPinned();
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class DeviceConcreteAllocator
: public ConcreteAllocator
{
 public:

  DeviceConcreteAllocator()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
      m_use_ats = v.value();
  }

  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    if (m_use_ats) {
      // FIXME: it does not work on WIN32
      *ptr = std::aligned_alloc(128, new_size);
      if (*ptr)
        return cudaSuccess;
      return cudaErrorMemoryAllocation;
    }
    cudaError_t r = ::cudaMalloc(ptr, new_size);
    //std::cout << "ALLOCATE_DEVICE ptr=" << (*ptr) << " size=" << new_size << " r=" << (int)r << "\n";
    return r;
  }
  cudaError_t _deallocate(void* ptr) final
  {
    if (m_use_ats) {
      std::free(ptr);
      return cudaSuccess;
    }
    //std::cout << "FREE_DEVICE ptr=" << ptr << "\n";
    return ::cudaFree(ptr);
  }

  constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }

 private:

  bool m_use_ats = false;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class DeviceCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{

 public:

  DeviceCudaMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("DeviceCudaMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
  {
  }

 public:

  void initialize()
  {
    _doInitializeDevice();
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace
{
  UnifiedMemoryCudaMemoryAllocator unified_memory_cuda_memory_allocator;
  HostPinnedCudaMemoryAllocator host_pinned_cuda_memory_allocator;
  DeviceCudaMemoryAllocator device_cuda_memory_allocator;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void
initializeCudaMemoryAllocators()
{
  unified_memory_cuda_memory_allocator.initialize();
  device_cuda_memory_allocator.initialize();
  host_pinned_cuda_memory_allocator.initialize();
}

void
finalizeCudaMemoryAllocators(ITraceMng* tm)
{
  unified_memory_cuda_memory_allocator.finalize(tm);
  device_cuda_memory_allocator.finalize(tm);
  host_pinned_cuda_memory_allocator.finalize(tm);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void
arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;
  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";

  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";

  ARCCORE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
                ti, e, error_name, error_message);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Map containing the ideal occupancy for a given kernel.
 */
class OccupancyMap
{
 public:

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;
    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);

    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs;

#if CUDART_VERSION >= 12040
    // cudaFuncGetName is only available starting with CUDA 12.4
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    std::cout << " name=" << func_name << "\n";
#else
    std::cout << "\n";
#endif

    return computed_block_size;
  }

 private:

  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
  std::mutex m_mutex;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

 public:

  void notifyBeginLaunchKernel([[maybe_unused]] impl::RunCommandImpl& c) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.empty())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCCORE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    DeviceId d = args.deviceId();
    int device = cudaCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
    //          << " size=" << args.source().size() << " data=" << src.data() << "\n";
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCCORE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  Impl::NativeStream nativeStream() override
  {
    return Impl::NativeStream(&m_cuda_stream);
  }

 public:

  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:

  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

 public:

  // Records the event within a RunQueue
  void recordQueue(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  }

  void wait() final
  {
    ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  }

  void waitForEvent(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  }

  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    // NOTE: the events must have been created with the timer enabled
    ARCCORE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;

    // TODO: check whether this is necessary
    // ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));

    ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }

  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCCORE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};
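
// Illustration (not part of the original file): how a timer-enabled event pair
// can be used through the members defined above. The 'runtime' and 'stream'
// objects are assumptions standing in for the runner runtime and queue stream
// defined further down in this file.
//
//   auto* start = runtime->createEventImplWithTimer();
//   auto* stop = runtime->createEventImplWithTimer();
//   start->recordQueue(stream);
//   // ... enqueue kernels on 'stream' ...
//   stop->recordQueue(stream);
//   stop->wait();                                // blocks until 'stop' is reached
//   Int64 elapsed_ns = stop->elapsedTime(start); // nanoseconds between the two records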

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  ~CudaRunnerRuntime() override = default;

 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCCORE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  void barrier() override
  {
    ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  eExecutionPolicy executionPolicy() const override
  {
    return eExecutionPolicy::CUDA;
  }
  impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
  {
    return new CudaRunQueueStream(this, bi);
  }
  impl::IRunQueueEventImpl* createEventImpl() override
  {
    return new CudaRunQueueEvent(false);
  }
  impl::IRunQueueEventImpl* createEventImplWithTimer() override
  {
    return new CudaRunQueueEvent(true);
  }
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
    ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCCORE_FATAL("Device {0} is not an accelerator device", id);
    ARCCORE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }

  void stopProfiling() override
  {
    global_cupti_info.stop();
  }

  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }

  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
    // as the corresponding CUDA type, so a simple cast is enough.
    auto mem_type = static_cast<ePointerMemoryType>(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCCORE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }

  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // NOTE: 'nvtxEventAttributes_t eventAttrib = { 0 };' would be enough here,
      // but it triggers many 'missing initializer for member' warnings.
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }

  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
                                           const void* kernel_ptr,
                                           Int64 total_loop_size) override
  {
    if (!m_use_computed_occupancy)
      return orig_args;
    Int32 wanted_shared_memory = orig_args.sharedMemorySize();
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // For now, do not compute anything when the shared memory size is non-zero.
    if (wanted_shared_memory != 0)
      return orig_args;
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return orig_args;
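    // Worked example (illustrative values): with total_loop_size = 1'000'000 and a
    // computed_block_size of 256, big_b = (1'000'000 + 255) / 256 = 3907, so the
    // kernel would be launched with 3907 blocks of 256 threads.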
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size, wanted_shared_memory };
  }

 public:

  void fillDevices(bool is_verbose);
  void build()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
      m_use_computed_occupancy = v.value();
  }

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  impl::DeviceInfoList m_device_info_list;
  OccupancyMap m_occupancy_map;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    std::ostringstream ostr;
    std::ostream& o = ostr;
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif

    {
      int least_val = 0;
      int greatest_val = 0;
      ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
      o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
    }
    {
      CUdevice device;
      ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
      CUuuid device_uuid;
      ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
      o << " deviceUuid=";
      impl::printUUID(o, device_uuid.bytes);
      o << "\n";
    }
    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    m_device_info_list.addDevice(device_info);
  }

  Int32 global_cupti_level = 0;

  // Check whether CUPTI should be enabled
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
    global_cupti_level = v.value();
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCCORE_HAS_CUDA_CUPTI
    ARCCORE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaMemoryCopier
: public IMemoryCopier
{
  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
            MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
            const RunQueue* queue) override
  {
    if (queue) {
      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
      return;
    }
    // 'cudaMemcpyDefault' automatically figures out what to do based solely on
    // the pointer values. It would be worth checking whether using \a from_mem
    // and \a to_mem could improve performance.
    ARCCORE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::Cuda

namespace
{
Arcane::Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
Arcane::Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This function is the entry point used when this library is loaded
// dynamically.
extern "C" ARCCORE_EXPORT void
arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane;
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  MemoryUtils::setDefaultDataMemoryResource(eMemoryResource::UnifiedMemory);
  MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_cuda_memory_allocator);
  auto* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
  mrm->setIsAccelerator(true);
  mrm->setAllocator(eMemoryResource::UnifiedMemory, &unified_memory_cuda_memory_allocator);
  mrm->setAllocator(eMemoryResource::HostPinned, &host_pinned_cuda_memory_allocator);
  mrm->setAllocator(eMemoryResource::Device, &device_cuda_memory_allocator);
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/