#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arcane/utils/PlatformUtils.h"
#include "arcane/utils/Array.h"
#include "arcane/utils/TraceInfo.h"
#include "arcane/utils/NotSupportedException.h"
#include "arcane/utils/FatalErrorException.h"
#include "arcane/utils/NotImplementedException.h"
#include "arcane/utils/IMemoryRessourceMng.h"
#include "arcane/utils/MemoryView.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/internal/MemoryUtilsInternal.h"
#include "arcane/utils/internal/IMemoryRessourceMngInternal.h"

#include "arcane/accelerator/core/RunQueueBuildInfo.h"
#include "arcane/accelerator/core/Memory.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
#include "arcane/accelerator/core/internal/RunCommandImpl.h"
#include "arcane/accelerator/core/internal/IRunQueueStream.h"
#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/accelerator/core/DeviceMemoryInfo.h"
#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"

#include <iostream>
#include <cstring>
#include <mutex>
#include <unordered_map>
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif
namespace Arcane::Accelerator::Cuda
{

using impl::KernelLaunchArgs;

namespace
{
  //! Singleton managing CUPTI activity records.
  CuptiInfo global_cupti_info;
  Int32 global_cupti_flush = 0;
}
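
/*
 * Error checking. The function below handles 'CUresult' codes from the CUDA
 * driver API (cuGetErrorName/cuGetErrorString); the ARCANE_CHECK_CUDA and
 * ARCANE_CHECK_CUDA_NOTHROW macros used throughout this file are assumed to
 * route runtime-API 'cudaError_t' codes through an equivalent check (their
 * definitions live in a header that is not part of this excerpt).
 */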
void
arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;
  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";
  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";
  ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
               ti, e, error_name, error_message);
}
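
/*
 * Helper used by the prefetch/advise methods below. Starting with CUDA 13
 * those APIs take a 'cudaMemLocation' instead of a plain device id. This is
 * a minimal sketch, assuming the host is designated by 'cudaCpuDeviceId';
 * the original helper is not part of this excerpt.
 */
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
inline cudaMemLocation _getMemoryLocation(int device_id)
{
  cudaMemLocation loc;
  loc.type = (device_id == cudaCpuDeviceId) ? cudaMemLocationTypeHost : cudaMemLocationTypeDevice;
  loc.id = (device_id == cudaCpuDeviceId) ? 0 : device_id;
  return loc;
}
#else
inline int _getMemoryLocation(int device_id)
{
  return device_id;
}
#endif

/*
 * Occupancy-based block-size selection. The class below queries the CUDA
 * occupancy API once per kernel and caches the result. A standalone sketch
 * of the underlying call (kernel name illustrative):
 *
 *   int min_grid_size = 0;
 *   int block_size = 0;
 *   // Suggested block size maximizing occupancy, with 0 bytes of
 *   // dynamic shared memory:
 *   cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, my_kernel, 0);
 */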
//! Map containing the ideal occupancy for a given kernel.
// (The class name is a reconstruction; the original declaration is elided
//  from this excerpt.)
class KernelOccupancyMap
{
 public:

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;
    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size
              << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
              << " name=" << func_name << "\n";
    return computed_block_size;
  }

 private:

  std::mutex m_mutex;
  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
};
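
/*
 * RunQueue streams. Each Arcane RunQueue maps to one 'cudaStream_t'. When
 * the RunQueueBuildInfo carries a non-default priority, the stream is
 * created with cudaStreamCreateWithPriority(); lower numerical values mean
 * higher scheduling priority, and the valid range can be queried first:
 *
 *   int least = 0, greatest = 0; // illustrative sketch
 *   cudaDeviceGetStreamPriorityRange(&least, &greatest);
 *   cudaStream_t s = nullptr;
 *   cudaStreamCreateWithPriority(&s, cudaStreamDefault, greatest);
 */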
class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

 public:

  //! Notification before the command is launched.
  void notifyBeginLaunchKernel(impl::RunCommandImpl& c) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.null())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  //! Notification after the command has been launched.
  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  //! Blocks until every action associated with this queue has completed.
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  //! Barrier without exception; returns true on error.
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  //! Performs a copy between two memory areas.
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  //! Prefetches a memory area.
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    DeviceId d = args.deviceId();
    int device = cudaCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  //! Opaque handle on the underlying native stream.
  impl::NativeStream nativeStream() override
  {
    return impl::NativeStream(&m_cuda_stream);
  }

 public:

  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:

  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};
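
/*
 * RunQueue events. Events created with timing enabled ('has_timer') support
 * cudaEventElapsedTime(), which reports milliseconds as a float; the
 * implementation below converts that to nanoseconds. Illustrative sketch:
 *
 *   cudaEvent_t start, stop;
 *   cudaEventCreate(&start);
 *   cudaEventCreate(&stop);
 *   cudaEventRecord(start, stream);
 *   // ... kernel launches on 'stream' ...
 *   cudaEventRecord(stop, stream);
 *   cudaEventSynchronize(stop);
 *   float ms = 0.0f;
 *   cudaEventElapsedTime(&ms, start, stop); // ms * 1.0e6 -> nanoseconds
 */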
class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

 public:

  // (The names of the record/wait overrides below follow the
  //  IRunQueueEventImpl interface; their declarations are elided from this
  //  excerpt.)

  //! Records the event on the stream of the queue.
  void recordQueue(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  }
  //! Blocks until the event has completed.
  void wait() final
  {
    ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  }
  //! Makes the stream of the queue wait for this event.
  void waitForEvent(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  }
  //! Elapsed time (in nanoseconds) between \a start_event and this event.
  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    ARCANE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;
    ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }
  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCANE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};
class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    // cudaGetLastError() surfaces launch failures (bad configuration, ...)
    // as soon as the kernel has been enqueued.
    ARCANE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  //! Blocks until every pending action on the device has completed.
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  eExecutionPolicy executionPolicy() const override { return eExecutionPolicy::CUDA; }
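
  /*
   * Memory advice. The two methods below translate Arcane's eMemoryAdvice
   * values into cudaMemAdvise() calls on managed (unified) memory:
   *
   *   MostlyRead              -> cudaMemAdviseSetReadMostly
   *   PreferredLocationDevice -> cudaMemAdviseSetPreferredLocation (device)
   *   AccessedByDevice        -> cudaMemAdviseSetAccessedBy        (device)
   *   PreferredLocationHost   -> cudaMemAdviseSetPreferredLocation (cudaCpuDeviceId)
   *   AccessedByHost          -> cudaMemAdviseSetAccessedBy        (cudaCpuDeviceId)
   */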
  //! Applies a memory advice to \a buffer.
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  //! Removes a memory advice from \a buffer.
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCANE_FATAL("Device {0} is not an accelerator device", id);
    ARCANE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }
  void stopProfiling() override
  {
    global_cupti_info.stop();
  }
  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }
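
  /*
   * Pointer introspection. cudaPointerGetAttributes() classifies any address
   * as unregistered host, registered host, device or managed memory, and
   * returns the matching host/device aliases. Illustrative sketch:
   *
   *   cudaPointerAttributes pa;
   *   if (cudaPointerGetAttributes(&pa, ptr) == cudaSuccess
   *       && pa.type == cudaMemoryTypeManaged) {
   *     // 'ptr' was allocated with cudaMallocManaged()
   *   }
   */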
  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // Converts 'ca.type' to Arcane's ePointerMemoryType (the name of the
    // conversion helper is a reconstruction; the original line is elided).
    auto mem_type = toPointerMemoryType(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }
  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCANE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }
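
  /*
   * NVTX profiler ranges. Ranges nest as a stack: every nvtxRangePush()/
   * nvtxRangePushEx() must be matched by one nvtxRangePop(). Tools such as
   * Nsight Systems display the named range over the enclosed GPU activity.
   * Illustrative sketch:
   *
   *   nvtxRangePush("assembly-phase");
   *   // ... launches to be grouped under this label ...
   *   nvtxRangePop();
   */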
  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // A color was specified: use the extended form of the NVTX API.
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }
  //! Computes kernel launch arguments from the cached occupancy information.
  // (The method name and the exact values returned on the early exits are
  //  reconstructions; the corresponding lines are elided from this excerpt.)
  KernelLaunchArgs computeKernelLaunchArgs(const void* kernel_ptr,
                                           Int64 total_loop_size,
                                           Int32 wanted_shared_memory) override
  {
    // Default-constructed arguments mean: keep the caller's configuration.
    if (!m_use_computed_occupancy)
      return {};
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // Dynamic shared memory is not taken into account by the cached value.
    if (wanted_shared_memory != 0)
      return {};
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return {};
    // Ceil division so that the grid covers the whole iteration space.
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    Int32 blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size };
  }
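
  /*
   * Worked example of the ceil division above: with total_loop_size =
   * 1'000'000 and computed_block_size = 256, big_b = (1'000'000 + 255) / 256
   * = 3907 blocks, covering 3907 * 256 = 1'000'192 threads; the kernel is
   * expected to guard against indices >= total_loop_size.
   */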
 public:

  void fillDevices(bool is_verbose);

  void build()
  {
    // Optionally enables the occupancy-based launch configuration from the
    // environment (the variable name below is a reconstruction; the
    // original is elided from this excerpt).
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_COMPUTED_OCCUPANCY", true))
      m_use_computed_occupancy = v.value();
  }

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  impl::DeviceInfoList m_device_info_list;
  KernelOccupancyMap m_occupancy_map;
};
void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    OStringStream ostr;
    std::ostream& o = ostr.stream();
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
    // These fields are no longer available in 'cudaDeviceProp' starting
    // with CUDA 13.
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif
    int least_val = 0;
    int greatest_val = 0;
    ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
    o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";

    CUdevice device;
    ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
    CUuuid device_uuid;
    ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
    o << " deviceUuid = ";
    impl::printUUID(o, device_uuid.bytes);
    o << "\n";

    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    m_device_info_list.addDevice(device_info);
  }
  Int32 global_cupti_level = 0;
  // The environment variable names below are reconstructions; the original
  // names are elided from this excerpt.
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
    global_cupti_level = v.value();
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_DO_PRINT", true))
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCANE_HAS_CUDA_CUPTI
    ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}
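
/*
 * Generic memory copies. Because the buffers managed here are visible
 * through CUDA's unified virtual addressing, a single cudaMemcpyDefault copy
 * works for every host/device source and destination combination, letting
 * the driver infer the transfer direction from the pointers themselves.
 */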
//! Copier between memory areas using CUDA.
// (The class and instance names, and the queue branch, are reconstructions;
//  the corresponding declarations are elided from this excerpt.)
class CudaMemoryCopier
: public IMemoryCopier
{
 public:

  void copy(ConstMemoryView from, eMemoryRessource from_mem,
            MutableMemoryView to, eMemoryRessource to_mem,
            const RunQueue* queue) override
  {
    if (queue) {
      queue->copyMemory(MemoryCopyArgs(to, from).addAsync(queue->isAsync()));
      return;
    }
    // 'cudaMemcpyDefault' handles every source/destination combination
    // because the memory managed here is accessible through UVA.
    ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }
};

namespace
{
  CudaRunnerRuntime global_cuda_runtime;
  CudaMemoryCopier global_cuda_memory_copier;
}

} // namespace Arcane::Accelerator::Cuda
// Registration function called when Arcane loads the CUDA runtime.
// (The exported function name is a reconstruction; the original line is
//  elided from this excerpt.)
extern "C" ARCANE_EXPORT void
arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  // (Registration of the CUDA allocators for the Device, HostPinned and
  //  UnifiedMemory resources is elided from this excerpt.)
  Arcane::IMemoryRessourceMng* mrm = Arcane::MemoryUtils::getDataMemoryResourceMng();
  mrm->_internal()->setIsAccelerator(true);
  mrm->_internal()->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}
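
/*
 * Once this runtime is registered, application code reaches it through the
 * standard Arcane accelerator API. A minimal hedged sketch (how 'runner' is
 * initialized in practice depends on the application):
 *
 *   using namespace Arcane::Accelerator;
 *   Runner runner(eExecutionPolicy::CUDA);
 *   RunQueue queue(makeQueue(runner));
 *   auto command = makeCommand(queue);
 */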