#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arccore/base/MemoryView.h"
#include "arccore/base/PlatformUtils.h"
#include "arccore/base/TraceInfo.h"
#include "arccore/base/NotSupportedException.h"
#include "arccore/base/FatalErrorException.h"
#include "arccore/base/NotImplementedException.h"

#include "arccore/common/IMemoryResourceMng.h"
#include "arccore/common/internal/IMemoryResourceMngInternal.h"

#include "arcane/utils/Array.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/internal/MemoryUtilsInternal.h"

#include "arcane/accelerator/core/RunQueueBuildInfo.h"
#include "arcane/accelerator/core/Memory.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
#include "arcane/accelerator/core/internal/RunCommandImpl.h"
#include "arcane/accelerator/core/internal/IRunQueueStream.h"
#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/accelerator/core/DeviceMemoryInfo.h"
#include "arcane/accelerator/core/NativeStream.h"

#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"

#include <unordered_map>

#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif
namespace Arcane::Accelerator::Cuda
{

using impl::KernelLaunchArgs;

// ...

Int32 global_cupti_flush = 0;
void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;
  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";
  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";
  ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}", ti, e, error_name, error_message);
}
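
// Note: the ARCANE_CHECK_CUDA(...) macro used throughout this file is assumed to
// forward the returned error code, together with the call site (A_FUNCINFO), to a
// checker like the function above (an overload taking cudaError_t is assumed for
// runtime-API calls), e.g.:
//   ARCANE_CHECK_CUDA(cudaStreamCreate(&stream));
//   // roughly: arcaneCheckCudaErrors(A_FUNCINFO, cudaStreamCreate(&stream));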
// ... (helper class caching the computed block size for each kernel)

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;
    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
              << " name=" << func_name << "\n";
    return computed_block_size;
  }

 private:

  std::mutex m_mutex;
  //! Map containing the ideal occupancy for a given kernel.
  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
};
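
// Illustrative reading of the values logged above (numbers are hypothetical):
// cudaOccupancyMaxPotentialBlockSize() might report computed_block_size = 768 as
// the block size maximizing theoretical occupancy, while the two
// cudaOccupancyMaxActiveBlocksPerMultiprocessor() probes give the number of
// resident blocks per multiprocessor for fixed block sizes of 256 (n0) and
// 1024 (n1) threads.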
class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  // ... (constructor: `bi` is the RunQueueBuildInfo)
  {
    if (bi.isDefault())
      ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }

  ~CudaRunQueueStream() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

  void notifyBeginLaunchKernel(impl::RunCommandImpl& c) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.null())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }

  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }

  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }

  //! Barrier without exception; returns true on error.
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }

  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCANE_CHECK_CUDA(r);
  }

  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    // ...
    int device = cudaCpuDeviceId;
    // ...
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
    // With CUDA 13, cudaMemPrefetchAsync() takes a memory location plus a flags argument.
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCANE_CHECK_CUDA(r);
  }

  // ...

  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:

  cudaStream_t m_cuda_stream = nullptr;
};

class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }

  ~CudaRunQueueEvent() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

  // ... (record the event on a queue's stream)
    ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  // ... (host-side wait for the event)
    ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  // ... (make a queue's stream wait for the event)
    ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  // ...

  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    // ...
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;
    // ...
    ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    // Convert the elapsed time from milliseconds to nanoseconds.
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }

  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCANE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};
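
// Typical use of such an event pair for timing (sketch only; the method that wraps
// cudaEventRecord() is elided above, so `recordQueue` and `wait` are hypothetical names):
//   start_event->recordQueue(stream);
//   ... launch kernels on the stream ...
//   stop_event->recordQueue(stream);
//   stop_event->wait();                              // cudaEventSynchronize()
//   Int64 t = stop_event->elapsedTime(start_event);  // nanoseconds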
class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  // ...

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }

  void notifyEndLaunchKernel() override
  {
    ARCANE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }

  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;
    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;
    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCANE_FATAL("Device {0} is not an accelerator device", id);
    ARCANE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }

  void stopProfiling() override
  {
    global_cupti_info.stop();
  }

  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }
  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // ... (mem_type is derived from ca.type)
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCANE_CHECK_CUDA(cudaGetDevice(&d));
    ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    ARCANE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }
  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // Push a named range with the requested ARGB color.
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }

  void popProfilerRange() override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }
  //! Compute the kernel launch arguments (grid/block sizes) from the cached occupancy.
  KernelLaunchArgs /*name elided in this excerpt*/ (const void* kernel_ptr, Int64 total_loop_size,
                                                    Int32 wanted_shared_memory) override
  {
    if (!m_use_computed_occupancy)
      return {}; // fall back to the default launch parameters
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // Dynamic shared memory is not handled by the computed occupancy.
    if (wanted_shared_memory != 0)
      return {};
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return {};
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size };
  }
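
  // Worked example (hypothetical values): with total_loop_size = 10000 and a
  // computed_block_size of 256, big_b = (10000 + 255) / 256 = 40, so the kernel
  // would be launched with 40 blocks of 256 threads (10240 threads covering the
  // 10000 iterations).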
 public:

  void fillDevices(bool is_verbose);

  // ... (presumably read from an environment variable via ValueConvert)
      m_use_computed_occupancy = v.value();
  // ...

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  // ...
};
void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    OStringStream ostr;
    std::ostream& o = ostr.stream();
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
    // These fields are no longer available in cudaDeviceProp with CUDA 13.
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif
    int least_val = 0;
    int greatest_val = 0;
    ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
    o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
    CUdevice device;
    ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
    CUuuid device_uuid;
    ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
    // ...
    impl::printUUID(o, device_uuid.bytes);
    // ...
    String description(ostr.str());
    if (is_verbose)
      omain << description;
    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    m_device_info_list.addDevice(device_info);
  }

  // The CUPTI settings below are presumably read from environment variables
  // (the lookups, likely via ValueConvert::tryParseFromEnvironment(), are elided here).
  Int32 global_cupti_level = 0;
  // ...
    global_cupti_level = v.value();
  // ...
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  // ...
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCANE_HAS_CUDA_CUPTI
    ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}
  void copy(ConstMemoryView from, eMemoryRessource from_mem, MutableMemoryView to,
            eMemoryRessource to_mem, const RunQueue* queue) override
  {
    // cudaMemcpyDefault lets the driver infer the copy direction from the addresses.
    ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }

// ...

} // namespace Arcane::Accelerator::Cuda
extern "C" ARCANE_EXPORT void
// ... (registration entry point taking a RegisterRuntimeInfo& init_info; its name is elided here)
{
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  // ... (mrm presumably comes from the data IMemoryRessourceMng's internal interface)
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}