#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arcane/utils/PlatformUtils.h"
#include "arcane/utils/Array.h"
#include "arcane/utils/TraceInfo.h"
#include "arcane/utils/NotSupportedException.h"
#include "arcane/utils/FatalErrorException.h"
#include "arcane/utils/NotImplementedException.h"
#include "arcane/utils/IMemoryRessourceMng.h"
#include "arcane/utils/MemoryView.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/internal/IMemoryRessourceMngInternal.h"

#include "arcane/accelerator/core/RunQueueBuildInfo.h"
#include "arcane/accelerator/core/Memory.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
#include "arcane/accelerator/core/internal/RunCommandImpl.h"
#include "arcane/accelerator/core/internal/IRunQueueStream.h"
#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/accelerator/core/DeviceMemoryInfo.h"
#include "arcane/accelerator/core/NativeStream.h"

#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"

#include <iostream>
#include <cstring>
#include <mutex>
#include <unordered_map>

#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif
namespace Arcane::Accelerator::Cuda
{

using impl::KernelLaunchArgs;

// 'global_cupti_info' (the singleton managing CUPTI) is defined in lines
// elided from this excerpt. When 'global_cupti_flush' is strictly positive,
// CUPTI buffers are flushed after each stream synchronization.
Int32 global_cupti_flush = 0;
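/*!
 * \brief Checks \a e, the return code of a CUDA driver API call, and throws a
 * FatalErrorException (via ARCANE_FATAL) with the call site \a ti on failure.
 */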
void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;

  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";

  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";

  ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}", ti, e, error_name, error_message);
}
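// Cache of the number of threads per block to use for each kernel, computed
// once from the CUDA occupancy API. The real class name is not visible in
// this excerpt; it is shown below as 'KernelOccupancyMap' so later references
// to 'm_occupancy_map' read consistently.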
class KernelOccupancyMap
{
 public:

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;

    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);

    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);

    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
              << " name=" << func_name << "\n";
    return computed_block_size;
  }

 private:

  std::mutex m_mutex;
  //! Computed number of threads per block, keyed by kernel pointer.
  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
};
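//! impl::IRunQueueStream implementation wrapping a cudaStream_t.
//! It also pushes NVTX ranges around kernel launches and flushes CUPTI
//! buffers on barrier when requested.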
class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  // NOTE: the exact constructor signature is not visible in this excerpt; it
  // receives the owning runtime and the RunQueueBuildInfo 'bi'.
  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }
  void notifyBeginLaunchKernel(impl::RunCommandImpl& c) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.null())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCANE_CHECK_CUDA(r);
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    int device = cudaCpuDeviceId;
    if (!args.deviceId().isHost())
      device = args.deviceId().asInt32();
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCANE_CHECK_CUDA(r);
  }

  cudaStream_t trueStream() const { return m_cuda_stream; }

 private:

  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};
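//! impl::IRunQueueEventImpl implementation wrapping a cudaEvent_t.
//! Timing is only enabled when the event is created with \a has_timer set.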
class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

  // NOTE: the record/wait method signatures are not visible in this excerpt;
  // 'rq' is the CudaRunQueueStream associated with the target queue.
  void recordQueue(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  }
  void wait() final
  {
    ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  }
  void waitForEvent(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  }
  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    ARCANE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;
    ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }
  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCANE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};
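//! impl::IRunnerRuntime implementation for CUDA: kernel-launch notifications,
//! memory advice, profiling (CUPTI/NVTX), kernel launch arguments and device
//! discovery.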
class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCANE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
  }
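  // The two methods below translate Arcane's eMemoryAdvice values into
  // cudaMemAdvise() advice. Advice targeting the host is applied with
  // cudaCpuDeviceId instead of the device index. Their exact signatures are
  // not visible in this excerpt; they follow impl::IRunnerRuntime.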
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;
    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;
    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCANE_FATAL("Device {0} is not an accelerator device", id);
    ARCANE_CHECK_CUDA(cudaSetDevice(id));
  }
  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }
  void startProfiling() override
  {
    global_cupti_info.start();
  }
  void stopProfiling() override
  {
    global_cupti_info.stop();
  }
  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }
  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // The elided lines derive 'mem_type' from 'ca.type'; Arcane's
    // ePointerMemoryType is assumed to match cudaMemoryType, so a cast is used.
    auto mem_type = static_cast<ePointerMemoryType>(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }
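  // Returns the free/total memory of \a device_id, temporarily switching the
  // current device when it is not already the active one (the exact method
  // signature is not visible in this excerpt).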
  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCANE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }
  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // Push an NVTX range with an explicit ARGB color.
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  // NOTE: the name of this method is not visible in this excerpt; it releases
  // the CUDA memory allocators at shutdown.
  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }
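  // Computes kernel launch arguments from the cached occupancy information.
  // The grid size is the ceiling of total_loop_size / computed_block_size:
  // e.g. total_loop_size = 1'000'000 with computed_block_size = 256 gives
  // (1'000'000 + 255) / 256 = 3907 blocks.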
  // NOTE: the real name of this override is not visible in this excerpt; it is
  // shown here as 'computeKernelLaunchArgs' for readability.
  KernelLaunchArgs computeKernelLaunchArgs(const void* kernel_ptr,
                                           Int64 total_loop_size,
                                           Int32 wanted_shared_memory) override
  {
    if (!m_use_computed_occupancy)
      return {};
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // Computed occupancy is only used when no dynamic shared memory is requested.
    if (wanted_shared_memory != 0)
      return {};
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return {};
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    Int32 blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size };
  }
  void fillDevices(bool is_verbose);
  void build()
  {
    // The environment variable enabling the computed-occupancy mode is parsed
    // here; its name is elided in this excerpt.
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment({}, false))
      m_use_computed_occupancy = v.value();
  }

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  impl::DeviceInfoList m_device_info_list;
  KernelOccupancyMap m_occupancy_map;
};
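// Queries every available CUDA device, builds a textual description of its
// properties (printed in verbose mode) and registers it in m_device_info_list.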
void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;

  omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    OStringStream ostr;
    std::ostream& o = ostr.stream();
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
    // These fields are no longer available in cudaDeviceProp with CUDA 13 and later.
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif
    int least_val = 0;
    int greatest_val = 0;
    ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
    o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";

    CUdevice device;
    ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
    CUuuid device_uuid;
    ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
    impl::printUUID(o, device_uuid.bytes);
    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    m_device_info_list.addDevice(device_info);
  }
  // CUPTI configuration is read from environment variables whose names are
  // elided in this excerpt: profiling level, flush mode and record printing.
  Int32 global_cupti_level = 0;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment({}, false))
    global_cupti_level = v.value();
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment({}, false))
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment({}, false))
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCANE_HAS_CUDA_CUPTI
    ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}

// Memory copier registered with IMemoryRessourceMng. Its class declaration and
// the path that delegates the copy to 'queue' when one is provided are elided
// in this excerpt.
  void copy(ConstMemoryView from, eMemoryRessource from_mem,
            MutableMemoryView to, eMemoryRessource to_mem,
            const RunQueue* queue) override
  {
    ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }

} // namespace Arcane::Accelerator::Cuda
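// Entry point called by Arcane to register the CUDA runtime: it installs the
// runtime and the memory copier, sets up the CUDA allocators and fills the
// device list. The function name below follows the usual Arcane naming
// convention; it is not visible in this excerpt.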
extern "C" ARCANE_EXPORT void
arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  // Elided here: retrieval of the memory resource manager 'mrm'
  // (IMemoryRessourceMngInternal) and registration of the CUDA allocators
  // for the Device, HostPinned and UnifiedMemory resources.
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}