14#include "arcane/accelerator/cuda/CudaAccelerator.h"
16#include "arcane/utils/PlatformUtils.h"
17#include "arcane/utils/Array.h"
18#include "arcane/utils/TraceInfo.h"
19#include "arcane/utils/NotSupportedException.h"
20#include "arcane/utils/FatalErrorException.h"
21#include "arcane/utils/NotImplementedException.h"
22#include "arcane/utils/IMemoryRessourceMng.h"
23#include "arcane/utils/MemoryView.h"
24#include "arcane/utils/OStringStream.h"
25#include "arcane/utils/ValueConvert.h"
26#include "arcane/utils/internal/IMemoryRessourceMngInternal.h"
28#include "arcane/accelerator/core/RunQueueBuildInfo.h"
29#include "arcane/accelerator/core/Memory.h"
30#include "arcane/accelerator/core/DeviceInfoList.h"
32#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
33#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
34#include "arcane/accelerator/core/internal/RunCommandImpl.h"
35#include "arcane/accelerator/core/internal/IRunQueueStream.h"
36#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
37#include "arcane/accelerator/core/PointerAttribute.h"
38#include "arcane/accelerator/core/RunQueue.h"
39#include "arcane/accelerator/core/DeviceMemoryInfo.h"
40#include "arcane/accelerator/core/NativeStream.h"
42#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"
48#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
49#include <nvtx3/nvToolsExt.h>
54namespace Arcane::Accelerator::Cuda
// When > 0, CUPTI activity buffers are flushed after each stream barrier
// (value read from the environment in fillDevices(), see line '519' below).
58 Int32 global_cupti_flush = 0;
// Checks the result of a CUDA *driver* API call (CUresult) and raises a
// FatalErrorException with the symbolic error name and message when the
// call failed.
// NOTE(review): this listing is an extraction with dropped lines — the
// braces and the early 'return' after the CUDA_SUCCESS check are not
// visible here; verify against the original file before editing.
65void arcaneCheckCudaErrors(
const TraceInfo& ti, CUresult e)
// Fast path: nothing to report when the driver call succeeded.
67 if (e == CUDA_SUCCESS)
69 const char* error_name =
nullptr;
// cuGetErrorName() yields the symbolic name (e.g. "CUDA_ERROR_INVALID_VALUE");
// fall back to "Unknown" if the code itself is not recognized.
70 CUresult e2 = cuGetErrorName(e, &error_name);
71 if (e2 != CUDA_SUCCESS)
72 error_name =
"Unknown";
74 const char* error_message =
nullptr;
// cuGetErrorString() yields the human-readable description, same fallback.
75 CUresult e3 = cuGetErrorString(e, &error_message);
76 if (e3 != CUDA_SUCCESS)
77 error_message =
"Unknown";
// Raise a fatal error carrying the call site (ti) plus code, name, message.
79 ARCANE_FATAL(
"CUDA Error trace={0} e={1} name={2} message={3}",
80 ti, e, error_name, error_message);
// Stream implementation for a RunQueue, backed by a native cudaStream_t.
// NOTE(review): extraction dropped method signatures, braces and several
// lines (original line numbers jump, e.g. 86 -> 95); hedged comments below.
86class CudaRunQueueStream
// Constructor: default-created stream in the default case, otherwise a
// stream created with the priority requested in the RunQueueBuildInfo.
95 ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
97 int priority = bi.priority();
98 ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
// Destructor: release the stream. Uses the NOTHROW variant because
// destructors must not throw.
101 ~CudaRunQueueStream()
override
103 ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
// notifyBeginLaunchKernel: when NVTX is available, push a range named after
// the kernel (or its trace info) so launches show up in profilers, then
// forward to the runtime for bookkeeping.
110#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
111 auto kname = c.kernelName();
113 nvtxRangePush(c.traceInfo().name());
115 nvtxRangePush(kname.localstr());
117 return m_runtime->notifyBeginLaunchKernel();
// notifyEndLaunchKernel: pop the NVTX range (when enabled) and forward.
121#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
124 return m_runtime->notifyEndLaunchKernel();
// barrier(): block the host until all work queued on this stream is done;
// optionally flush CUPTI activity buffers (see global_cupti_flush).
128 ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
129 if (global_cupti_flush > 0)
130 global_cupti_info.flush();
// _barrierNoException(): same synchronization, but reports failure via the
// return value (true on error) instead of throwing.
134 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
// copyMemory(): asynchronous copy on this stream. cudaMemcpyDefault lets
// the driver infer the direction from the pointer attributes.
138 auto source_bytes = args.source().bytes();
139 auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
140 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
141 ARCANE_CHECK_CUDA(r);
// prefetchMemory(): asynchronous prefetch of a (presumably managed) memory
// range, either to the host (cudaCpuDeviceId) or to a device — the branch
// selecting the device id is not visible in this extraction.
147 auto src = args.source().bytes();
151 int device = cudaCpuDeviceId;
156 auto r = cudaMemPrefetchAsync(src.data(), src.size(), device, m_cuda_stream);
157 ARCANE_CHECK_CUDA(r);
// Accessor for the underlying native stream handle (used by events below).
168 cudaStream_t trueStream()
const
170 return m_cuda_stream;
176 cudaStream_t m_cuda_stream;
// Event implementation for a RunQueue, backed by a native cudaEvent_t.
// NOTE(review): extraction dropped braces and some lines; hedged comments.
182class CudaRunQueueEvent
// Constructor: a timing-capable event when has_timer is true, otherwise an
// event created with cudaEventDisableTiming (cheaper to record).
187 explicit CudaRunQueueEvent(
bool has_timer)
190 ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
192 ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
194 ~CudaRunQueueEvent()
override
196 ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
// Record this event on the given queue's native stream.
205 ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
// Host-side wait until the event has completed.
210 ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
// Device-side dependency: make the queue's stream wait on this event.
216 ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
// elapsedTime(): time between start_event and this event. CUDA reports
// milliseconds as float; the result is converted to nanoseconds (Int64).
219 Int64 elapsedTime(IRunQueueEventImpl* start_event)
final
223 auto* true_start_event =
static_cast<CudaRunQueueEvent*
>(start_event);
224 float time_in_ms = 0.0;
229 ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
230 double x = time_in_ms * 1.0e6;
231 Int64 nano_time =
static_cast<Int64
>(x);
// hasPendingWork(): cudaErrorNotReady means work captured by the event is
// still in flight; any other non-success code is a genuine error.
235 bool hasPendingWork()
final
237 cudaError_t v = cudaEventQuery(m_cuda_event);
238 if (v == cudaErrorNotReady)
240 ARCANE_CHECK_CUDA(v);
246 cudaEvent_t m_cuda_event;
// Kernel-launch bookkeeping for the CUDA runner runtime.
// notifyBeginLaunchKernel: count the launch; the std::cout trace is
// presumably guarded by a verbosity flag not visible in this extraction.
261 void notifyBeginLaunchKernel()
override
263 ++m_nb_kernel_launched;
265 std::cout <<
"BEGIN CUDA KERNEL!\n";
// notifyEndLaunchKernel: cudaGetLastError() surfaces launch-configuration
// errors that a kernel launch itself does not report.
267 void notifyEndLaunchKernel()
override
269 ARCANE_CHECK_CUDA(cudaGetLastError());
271 std::cout <<
"END CUDA KERNEL!\n";
// barrier(): device-wide synchronization (all streams, all queues).
273 void barrier()
override
275 ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
// setMemoryAdvice(): map the Arcane eMemoryAdvice value onto a CUDA
// cudaMemAdviseSet* advice for the given buffer/device.
// NOTE(review): the switch/if selecting among the branches below is not
// visible in this extraction; ordering follows the original line numbers.
295 auto v = buffer.bytes();
296 const void* ptr = v.data();
297 size_t count = v.size();
298 int device = device_id.
asInt32();
299 cudaMemoryAdvise cuda_advise;
// MostlyRead / PreferredLocationDevice / AccessedByDevice map to the
// device-targeted advices; the *Host variants retarget cudaCpuDeviceId.
302 cuda_advise = cudaMemAdviseSetReadMostly;
304 cuda_advise = cudaMemAdviseSetPreferredLocation;
306 cuda_advise = cudaMemAdviseSetAccessedBy;
308 cuda_advise = cudaMemAdviseSetPreferredLocation;
309 device = cudaCpuDeviceId;
312 cuda_advise = cudaMemAdviseSetAccessedBy;
313 device = cudaCpuDeviceId;
318 ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, device));
// unsetMemoryAdvice(): symmetric removal of a previously set advice,
// using the cudaMemAdviseUnset* counterparts of the mapping above.
322 auto v = buffer.bytes();
323 const void* ptr = v.data();
324 size_t count = v.size();
325 int device = device_id.
asInt32();
326 cudaMemoryAdvise cuda_advise;
329 cuda_advise = cudaMemAdviseUnsetReadMostly;
331 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
333 cuda_advise = cudaMemAdviseUnsetAccessedBy;
335 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
336 device = cudaCpuDeviceId;
339 cuda_advise = cudaMemAdviseUnsetAccessedBy;
340 device = cudaCpuDeviceId;
344 ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, device));
// setCurrentDevice(): select the active CUDA device; rejects ids that do
// not designate an accelerator (the guarding condition is not visible in
// this extraction — presumably device_id.isAccelerator()).
347 void setCurrentDevice(
DeviceId device_id)
final
349 Int32
id = device_id.
asInt32();
351 ARCANE_FATAL(
"Device {0} is not an accelerator device",
id);
352 ARCANE_CHECK_CUDA(cudaSetDevice(
id));
// Accessor for the list of devices filled in fillDevices().
355 const IDeviceInfoList* deviceInfoList()
final {
return &m_device_info_list; }
// Profiling control is delegated to the global CUPTI singleton.
357 void startProfiling()
override
359 global_cupti_info.start();
362 void stopProfiling()
override
364 global_cupti_info.stop();
367 bool isProfilingActive()
override
369 return global_cupti_info.isActive();
// getPointerAttribute(): query CUDA for the attributes of ptr and fill the
// Arcane PointerAttribute (the mem_type mapping from ca.type is not
// visible in this extraction).
372 void getPointerAttribute(
PointerAttribute& attribute,
const void* ptr)
override
374 cudaPointerAttributes ca;
375 ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
379 _fillPointerAttribute(attribute, mem_type, ca.device,
380 ptr, ca.devicePointer, ca.hostPointer);
// getDeviceMemoryInfo() (name inferred — signature not visible): query
// free/total memory of the wanted device by temporarily switching the
// current device, then restore the previous one.
386 int wanted_d = device_id.
asInt32();
387 ARCANE_CHECK_CUDA(cudaGetDevice(&d));
389 ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
391 size_t total_mem = 0;
392 ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
394 ARCANE_CHECK_CUDA(cudaSetDevice(d));
396 dmi.setFreeMemory(free_mem);
397 dmi.setTotalMemory(total_mem);
// pushProfilerRange(): open an NVTX range named name. A non-negative
// color_rgb selects an explicit ARGB color via nvtxRangePushEx; the plain
// nvtxRangePush fallback for color_rgb < 0 is not visible here.
401 void pushProfilerRange(
const String& name, Int32 color_rgb)
override
403#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
404 if (color_rgb >= 0) {
// NVTX requires the attribute struct to be zero-initialized and stamped
// with version/size before use.
407 nvtxEventAttributes_t eventAttrib;
408 std::memset(&eventAttrib, 0,
sizeof(nvtxEventAttributes_t));
409 eventAttrib.version = NVTX_VERSION;
410 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
411 eventAttrib.colorType = NVTX_COLOR_ARGB;
412 eventAttrib.color = color_rgb;
413 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
414 eventAttrib.message.ascii = name.
localstr();
415 nvtxRangePushEx(&eventAttrib);
// popProfilerRange(): close the current NVTX range (body elided in this
// extraction; presumably nvtxRangePop()).
421 void popProfilerRange()
override
423#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
// Finalization hook: release the CUDA memory allocators (tm is presumably
// a trace manager passed by the caller — signature not visible).
430 finalizeCudaMemoryAllocators(tm);
435 void fillDevices(
bool is_verbose);
// Statistics and flags for this runtime instance.
439 Int64 m_nb_kernel_launched = 0;
440 bool m_is_verbose =
false;
// fillDevices(): enumerate the CUDA devices, print a detailed description
// of each one, populate m_device_info_list, and initialize CUPTI
// profiling from environment variables.
// NOTE(review): extraction dropped lines (variable declarations such as
// 'nb_device', 'dp', 'ostr', 'least_val', 'd', and several environment
// lookups); comments below hedge where the code is not visible.
447void CudaRunnerRuntime::
448fillDevices(
bool is_verbose)
451 ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
452 std::ostream& omain = std::cout;
454 omain <<
"ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device <<
"\n";
// For each device: collect its properties into a per-device description
// string, printed to stdout when verbose.
455 for (
int i = 0; i < nb_device; ++i) {
457 cudaGetDeviceProperties(&dp, i);
459 std::ostream& o = ostr.stream();
460 o <<
"Device " << i <<
" name=" << dp.name <<
"\n";
461 o <<
" computeCapability = " << dp.major <<
"." << dp.minor <<
"\n";
462 o <<
" totalGlobalMem = " << dp.totalGlobalMem <<
"\n";
463 o <<
" sharedMemPerBlock = " << dp.sharedMemPerBlock <<
"\n";
464 o <<
" regsPerBlock = " << dp.regsPerBlock <<
"\n";
465 o <<
" warpSize = " << dp.warpSize <<
"\n";
466 o <<
" memPitch = " << dp.memPitch <<
"\n";
467 o <<
" maxThreadsPerBlock = " << dp.maxThreadsPerBlock <<
"\n";
468 o <<
" totalConstMem = " << dp.totalConstMem <<
"\n";
469 o <<
" clockRate = " << dp.clockRate <<
"\n";
470 o <<
" deviceOverlap = " << dp.deviceOverlap <<
"\n";
471 o <<
" cooperativeLaunch = " << dp.cooperativeLaunch <<
"\n";
472 o <<
" multiProcessorCount = " << dp.multiProcessorCount <<
"\n";
473 o <<
" kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled <<
"\n";
474 o <<
" integrated = " << dp.integrated <<
"\n";
475 o <<
" canMapHostMemory = " << dp.canMapHostMemory <<
"\n";
476 o <<
" computeMode = " << dp.computeMode <<
"\n";
477 o <<
" directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost <<
"\n";
478 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
479 o <<
" pageableMemoryAccess = " << dp.pageableMemoryAccess <<
"\n";
480 o <<
" concurrentManagedAccess = " << dp.concurrentManagedAccess <<
"\n";
481 o <<
" pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables <<
"\n";
// NOTE(review): hostNativeAtomicSupported is printed twice (also at
// original line 478) — likely an oversight in the original source.
482 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
483 o <<
" maxThreadsDim = " << dp.maxThreadsDim[0] <<
" " << dp.maxThreadsDim[1]
484 <<
" " << dp.maxThreadsDim[2] <<
"\n";
485 o <<
" maxGridSize = " << dp.maxGridSize[0] <<
" " << dp.maxGridSize[1]
486 <<
" " << dp.maxGridSize[2] <<
"\n";
// Stream priority range for this device (lower numbers = higher priority
// in CUDA; both bounds are reported as-is).
489 int greatest_val = 0;
490 ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
491 o <<
" leastPriority = " << least_val <<
" greatestPriority = " << greatest_val <<
"\n";
// Device UUID via the driver API (cuDeviceGet / cuDeviceGetUuid).
495 ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
497 ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
499 impl::printUUID(o, device_uuid.bytes);
502 String description(ostr.str());
504 omain << description;
// Register the device in the runtime's device list.
507 device_info.setDescription(description);
508 device_info.setDeviceId(
DeviceId(i));
509 device_info.setName(dp.name);
510 m_device_info_list.addDevice(device_info);
// CUPTI configuration from environment variables (the lookups themselves
// — presumably tryParseFromEnvironment — are not visible here).
513 Int32 global_cupti_level = 0;
517 global_cupti_level = v.value();
519 global_cupti_flush = v.value();
520 bool do_print_cupti =
true;
522 do_print_cupti = (v.value() != 0);
// Enabling CUPTI requires Arcane to be built with CUPTI support.
524 if (global_cupti_level > 0) {
525#ifndef ARCANE_HAS_CUDA_CUPTI
526 ARCANE_FATAL(
"Trying to enable CUPTI but Arcane is not compiled with cupti support");
528 global_cupti_info.init(global_cupti_level, do_print_cupti);
529 global_cupti_info.start();
// Memory-copier implementation: synchronous copy between two memory views;
// cudaMemcpyDefault lets the driver infer the direction from the pointers.
550 ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
// Runtime registration entry point (name elided in this extraction):
// declares CUDA as the active accelerator runtime, installs the CUDA
// memory allocators and copier, and enumerates the available devices.
570extern "C" ARCANE_EXPORT
void
574 using namespace Arcane::Accelerator::Cuda;
575 Arcane::Accelerator::impl::setUsingCUDARuntime(
true);
576 Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
577 initializeCudaMemoryAllocators();
// Install the CUDA-aware copier on the memory resource manager (mrm is
// presumably IMemoryRessourceMng::_internal() — declaration not visible).
584 mrm->
setCopier(&global_cuda_memory_copier);
585 global_cuda_runtime.fillDevices(init_info.isVerbose());
#define ARCANE_CHECK_POINTER(ptr)
Macro retournant le pointeur ptr s'il est non nul ou lançant une exception s'il est nul.
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
void copy(ConstMemoryView from, eMemoryRessource from_mem, MutableMemoryView to, eMemoryRessource to_mem, const RunQueue *queue) override
Copie les données de from vers to avec la queue queue.
void notifyBeginLaunchKernel(impl::RunCommandImpl &c) override
Notification avant le lancement de la commande.
void barrier() override
Bloque jusqu'à ce que toutes les actions associées à cette file soient terminées.
bool _barrierNoException() override
Barrière sans exception. Retourne true en cas d'erreur.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Effectue un pré-chargement d'une zone mémoire.
void notifyEndLaunchKernel(impl::RunCommandImpl &) override
Notification de fin de lancement de la commande.
impl::NativeStream nativeStream() override
Pointeur sur la structure interne dépendante de l'implémentation.
void copyMemory(const MemoryCopyArgs &args) override
Effectue une copie entre deux zones mémoire.
Classe singleton pour gérer CUPTI.
Identifiant d'un composant du système.
bool isHost() const
Indique si l'instance est associée à l'hôte.
Int32 asInt32() const
Valeur numérique du device.
bool isAccelerator() const
Indique si l'instance est associée à un accélérateur.
Information sur un device.
Information mémoire d'un accélérateur.
Interface d'une liste de devices.
Arguments pour la copie mémoire.
Arguments pour le préfetching mémoire.
Informations sur une adresse mémoire.
Informations pour initialiser le runtime accélérateur.
Informations pour créer une RunQueue.
bool isDefault() const
Indique si l'instance a uniquement les valeurs par défaut.
File d'exécution pour un accélérateur.
bool isAsync() const
Indique si la file d'exécution est asynchrone.
void copyMemory(const MemoryCopyArgs &args) const
Copie des informations entre deux zones mémoires.
Interface d'une liste de devices.
Interface de l'implémentation d'un évènement.
Interface d'un flux d'exécution pour une RunQueue.
Interface du runtime associé à un accélérateur.
Type opaque pour encapsuler une 'stream' native.
Implémentation d'une commande pour accélérateur.
static std::optional< Int32 > tryParseFromEnvironment(StringView s, bool throw_if_invalid)
Interface pour les copies mémoire avec support des accélérateurs.
virtual IMemoryRessourceMngInternal * _internal()=0
Interface interne.
Partie interne à Arcane de 'IMemoryRessourceMng'.
virtual void setAllocator(eMemoryRessource r, IMemoryAllocator *allocator)=0
Positionne l'allocateur pour la ressource r.
virtual void setCopier(IMemoryCopier *copier)=0
Positionne l'instance gérant les copies.
virtual void setIsAccelerator(bool v)=0
Indique si un accélérateur est disponible.
Interface du gestionnaire de traces.
Flot de sortie lié à une String.
Chaîne de caractères unicode.
const char * localstr() const
Retourne la conversion de l'instance dans l'encodage UTF-8.
eMemoryAdvice
Conseils pour la gestion mémoire.
@ AccessedByHost
Indique que la zone mémoire est accédée par l'hôte.
@ PreferredLocationDevice
Privilégie le positionnement de la mémoire sur l'accélérateur.
@ MostlyRead
Indique que la zone mémoire est principalement en lecture seule.
@ PreferredLocationHost
Privilégie le positionnement de la mémoire sur l'hôte.
@ AccessedByDevice
Indique que la zone mémoire est accédée par l'accélérateur.
ePointerMemoryType
Type de mémoire pour un pointeur.
eExecutionPolicy
Politique d'exécution pour un Runner.
@ CUDA
Politique d'exécution utilisant l'environnement CUDA.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
@ HostPinned
Alloue sur l'hôte.
@ UnifiedMemory
Alloue en utilisant la mémoire unifiée.
@ Device
Alloue sur le device.
Espace de nom de Arccore.
Arcane::eMemoryResource eMemoryRessource
Typedef pour la version Arcane historique (avec 2's')