Arcane  v3.14.10.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
CudaAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaAcceleratorRuntime.cc (C) 2000-2024 */
9/* */
10/* Runtime pour 'Cuda'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arcane/accelerator/cuda/CudaAccelerator.h"
15
16#include "arcane/utils/PlatformUtils.h"
17#include "arcane/utils/Array.h"
18#include "arcane/utils/TraceInfo.h"
19#include "arcane/utils/NotSupportedException.h"
20#include "arcane/utils/FatalErrorException.h"
21#include "arcane/utils/NotImplementedException.h"
22#include "arcane/utils/IMemoryRessourceMng.h"
23#include "arcane/utils/MemoryView.h"
24#include "arcane/utils/OStringStream.h"
25#include "arcane/utils/ValueConvert.h"
26#include "arcane/utils/internal/IMemoryRessourceMngInternal.h"
27
28#include "arcane/accelerator/core/RunQueueBuildInfo.h"
29#include "arcane/accelerator/core/Memory.h"
30#include "arcane/accelerator/core/DeviceInfoList.h"
31
32#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
33#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
34#include "arcane/accelerator/core/internal/RunCommandImpl.h"
35#include "arcane/accelerator/core/internal/IRunQueueStream.h"
36#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
37#include "arcane/accelerator/core/PointerAttribute.h"
38#include "arcane/accelerator/core/RunQueue.h"
39#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"
40
41#include <iostream>
42
43#include <cuda.h>
44
45#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
46#include <nvtx3/nvToolsExt.h>
47#endif
48
49using namespace Arccore;
50
51namespace Arcane::Accelerator::Cuda
52{
53namespace
54{
55 Int32 global_cupti_flush = 0;
56 CuptiInfo global_cupti_info;
57} // namespace
58
59/*---------------------------------------------------------------------------*/
60/*---------------------------------------------------------------------------*/
61
62void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
63{
64 if (e == CUDA_SUCCESS)
65 return;
66 const char* error_name = nullptr;
67 CUresult e2 = cuGetErrorName(e, &error_name);
68 if (e2 != CUDA_SUCCESS)
69 error_name = "Unknown";
70
71 const char* error_message = nullptr;
72 CUresult e3 = cuGetErrorString(e, &error_message);
73 if (e3 != CUDA_SUCCESS)
74 error_message = "Unknown";
75
76 ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
77 ti, e, error_name, error_message);
78}
79
80/*---------------------------------------------------------------------------*/
81/*---------------------------------------------------------------------------*/
82
/*!
 * \brief Writes the 16-byte UUID \a bytes to \a o as lowercase hexadecimal
 * in the canonical RFC 4122 grouping 8-4-4-4-12,
 * e.g. "12345678-9abc-def0-0123-456789abcdef".
 */
void _printUUID(std::ostream& o, char bytes[16])
{
  static const char hexa_chars[16] = { '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'a', 'b', 'c', 'd', 'e', 'f' };

  for (int i = 0; i < 16; ++i) {
    // Work on an unsigned value: right-shifting a negative 'char' is
    // implementation-defined.
    unsigned char b = static_cast<unsigned char>(bytes[i]);
    o << hexa_chars[(b >> 4) & 0xf];
    o << hexa_chars[b & 0xf];
    // Canonical UUID byte grouping is 4-2-2-2-6, i.e. a dash after bytes
    // 3, 5, 7 and 9. (The previous test on 4/6/8/10 produced the
    // non-standard grouping 5-2-2-2-5.)
    if (i == 3 || i == 5 || i == 7 || i == 9)
      o << '-';
  }
}
94
95/*---------------------------------------------------------------------------*/
96/*---------------------------------------------------------------------------*/
97
// NOTE(review): this chunk is an HTML/doc extraction. Several declaration
// lines are missing (original lines 98-99, 103, 120 and 131: the
// 'class CudaRunQueueStream' header, the constructor signature and the
// notifyBegin/EndLaunchKernel signatures). The numeric prefixes on each
// line are extraction artifacts. Code is left byte-identical; only
// comments were added/translated.
100{
 101 public:
 102
// Constructor (signature line missing from the extraction; per the Doxygen
// tooltips it takes the owning runtime and a RunQueueBuildInfo 'bi').
// Creates the underlying CUDA stream, with an explicit priority when the
// build info is not the default.
 104 : m_runtime(runtime)
 105 {
 106 if (bi.isDefault())
 107 ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
 108 else {
 109 int priority = bi.priority();
 110 ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
 111 }
 112 }
// Destroys the CUDA stream. Uses the non-throwing check: destructors must
// not propagate exceptions.
 113 ~CudaRunQueueStream() override
 114 {
 115 ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
 116 }
 117
 118 public:
 119
// notifyBeginLaunchKernel(impl::RunCommandImpl& c) — signature line missing.
// Pushes an NVTX range named after the kernel (or the trace-info name when
// the kernel name is empty), then forwards the notification to the runtime.
 121 {
 122#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
 123 auto kname = c.kernelName();
 124 if (kname.empty())
 125 nvtxRangePush(c.traceInfo().name());
 126 else
 127 nvtxRangePush(kname.localstr());
 128#endif
 129 return m_runtime->notifyBeginLaunchKernel();
 130 }
// notifyEndLaunchKernel(impl::RunCommandImpl&) — signature line missing.
// Pops the NVTX range and forwards to the runtime.
 132 {
 133#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
 134 nvtxRangePop();
 135#endif
 136 return m_runtime->notifyEndLaunchKernel();
 137 }
// Blocks until all work queued on this stream has completed; also flushes
// CUPTI activity records when the ARCANE_CUPTI_FLUSH option is enabled.
 138 void barrier() override
 139 {
 140 ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
 141 if (global_cupti_flush > 0)
 142 global_cupti_info.flush();
 143 }
// Synchronizes without throwing; returns true on error.
 144 bool _barrierNoException() override
 145 {
 146 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
 147 }
// Asynchronous copy on this stream. 'cudaMemcpyDefault' infers the copy
// direction from the pointer values. Blocks when the request is synchronous.
 148 void copyMemory(const MemoryCopyArgs& args) override
 149 {
 150 auto source_bytes = args.source().bytes();
 151 auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
 152 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
 153 ARCANE_CHECK_CUDA(r);
 154 if (!args.isAsync())
 155 barrier();
 156 }
// Prefetches a memory range to the requested device, or to the host
// (cudaCpuDeviceId) when the target id is the host. No-op on empty ranges.
 157 void prefetchMemory(const MemoryPrefetchArgs& args) override
 158 {
 159 auto src = args.source().bytes();
 160 if (src.size() == 0)
 161 return;
 162 DeviceId d = args.deviceId();
 163 int device = cudaCpuDeviceId;
 164 if (!d.isHost())
 165 device = d.asInt32();
 166 //std::cout << "PREFETCH device=" << device << " host=" << cudaCpuDeviceId << " size=" << args.source().length() << "\n";
 167 auto r = cudaMemPrefetchAsync(src.data(), src.size(), device, m_cuda_stream);
 168 ARCANE_CHECK_CUDA(r);
 169 if (!args.isAsync())
 170 barrier();
 171 }
// Implementation-specific escape hatch: pointer to the cudaStream_t member.
 172 void* _internalImpl() override
 173 {
 174 return &m_cuda_stream;
 175 }
 176
 177 public:
 178
// Direct access to the underlying CUDA stream handle.
 179 cudaStream_t trueStream() const
 180 {
 181 return m_cuda_stream;
 182 }
 183
 184 private:
 185
 186 impl::IRunnerRuntime* m_runtime;
 187 cudaStream_t m_cuda_stream;
 188};
189
190/*---------------------------------------------------------------------------*/
191/*---------------------------------------------------------------------------*/
192
// NOTE(review): extraction artifact — the 'class CudaRunQueueEvent' header
// (original lines 193-194) and original line 233 (presumably a null-pointer
// check on 'start_event' — TODO confirm) are missing. Numeric prefixes are
// extraction artifacts; code left byte-identical, comments added/translated.
195{
 196 public:
 197
// Creates a CUDA event; timing support is only enabled when 'has_timer' is
// true (cudaEventDisableTiming makes record/wait cheaper otherwise).
 198 explicit CudaRunQueueEvent(bool has_timer)
 199 {
 200 if (has_timer)
 201 ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
 202 else
 203 ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
 204 }
 205 ~CudaRunQueueEvent() override
 206 {
 207 ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
 208 }
 209
 210 public:
 211
 212 // Records the event on a RunQueue's underlying CUDA stream.
 213 void recordQueue(impl::IRunQueueStream* stream) final
 214 {
 215 auto* rq = static_cast<CudaRunQueueStream*>(stream);
 216 ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
 217 }
 218
// Host-side blocking wait until the event has completed.
 219 void wait() final
 220 {
 221 ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
 222 }
 223
// Makes the given stream wait (device-side) for this event.
 224 void waitForEvent(impl::IRunQueueStream* stream) final
 225 {
 226 auto* rq = static_cast<CudaRunQueueStream*>(stream);
 227 ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
 228 }
 229
// Returns the elapsed time between 'start_event' and this event, in
// nanoseconds (cudaEventElapsedTime reports milliseconds).
 230 Int64 elapsedTime(IRunQueueEventImpl* start_event) final
 231 {
 232 // NOTE: the events must have been created with the timer enabled
 234 auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
 235 float time_in_ms = 0.0;
 236
 237 // TODO: check whether this is necessary
 238 // ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
 239
 240 ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
 241 double x = time_in_ms * 1.0e6;
 242 Int64 nano_time = static_cast<Int64>(x);
 243 return nano_time;
 244 }
 245
 246 private:
 247
 248 cudaEvent_t m_cuda_event;
 249};
250
251/*---------------------------------------------------------------------------*/
252/*---------------------------------------------------------------------------*/
253
// NOTE(review): extraction artifact — the 'class CudaRunnerRuntime' header
// (original lines 254-255) and several interior lines are missing; each gap
// is flagged below. Numeric prefixes are extraction artifacts; code left
// byte-identical, comments added/translated.
256{
 257 public:
 258
 259 ~CudaRunnerRuntime() override = default;
 260
 261 public:
 262
// Counts launched kernels; optional verbose trace.
 263 void notifyBeginLaunchKernel() override
 264 {
 265 ++m_nb_kernel_launched;
 266 if (m_is_verbose)
 267 std::cout << "BEGIN CUDA KERNEL!\n";
 268 }
// Checks for launch-configuration errors right after a kernel launch.
 269 void notifyEndLaunchKernel() override
 270 {
 271 ARCANE_CHECK_CUDA(cudaGetLastError());
 272 if (m_is_verbose)
 273 std::cout << "END CUDA KERNEL!\n";
 274 }
// Device-wide synchronization (all streams).
 275 void barrier() override
 276 {
 277 ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
 278 }
// NOTE(review): original line 281 (the return statement, presumably
// 'return eExecutionPolicy::CUDA;' — TODO confirm) is missing.
 279 eExecutionPolicy executionPolicy() const override
 280 {
 282 }
 283 impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
 284 {
 285 return new CudaRunQueueStream(this, bi);
 286 }
// Event without timing support (cheaper).
 287 impl::IRunQueueEventImpl* createEventImpl() override
 288 {
 289 return new CudaRunQueueEvent(false);
 290 }
// Event with timing support, for elapsedTime().
 291 impl::IRunQueueEventImpl* createEventImplWithTimer() override
 292 {
 293 return new CudaRunQueueEvent(true);
 294 }
// Applies a memory advice via cudaMemAdvise.
// NOTE(review): original lines 301 and 303-310/313-314 are missing — the
// declaration of 'cuda_advise' and the if/else chain mapping eMemoryAdvice
// values to CUDA advice values; only its tail is visible below.
 295 void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
 296 {
 297 auto v = buffer.bytes();
 298 const void* ptr = v.data();
 299 size_t count = v.size();
 300 int device = device_id.asInt32();
 302
 311 device = cudaCpuDeviceId;
 312 }
 315 device = cudaCpuDeviceId;
 316 }
 317 else
 318 return;
 319 //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
 320 ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, device));
 321 }
// Removes a previously applied memory advice.
// NOTE(review): original lines 328, 330-337 and 340-341 are missing — the
// mapping from eMemoryAdvice to the corresponding 'unset' CUDA advice.
 322 void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
 323 {
 324 auto v = buffer.bytes();
 325 const void* ptr = v.data();
 326 size_t count = v.size();
 327 int device = device_id.asInt32();
 329
 338 device = cudaCpuDeviceId;
 339 }
 342 device = cudaCpuDeviceId;
 343 }
 344 else
 345 return;
 346 ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, device));
 347 }
 348
// Selects the active CUDA device; rejects non-accelerator ids.
 349 void setCurrentDevice(DeviceId device_id) final
 350 {
 351 Int32 id = device_id.asInt32();
 352 if (!device_id.isAccelerator())
 353 ARCANE_FATAL("Device {0} is not an accelerator device", id);
 354 ARCANE_CHECK_CUDA(cudaSetDevice(id));
 355 }
 356
 357 const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }
 358
// CUPTI-based profiling control, delegated to the file-scope CuptiInfo.
 359 void startProfiling() override
 360 {
 361 global_cupti_info.start();
 362 }
 363
 364 void stopProfiling() override
 365 {
 366 global_cupti_info.stop();
 367 }
 368
 369 bool isProfilingActive() override
 370 {
 371 return global_cupti_info.isActive();
 372 }
 373
// Fills 'attribute' from cudaPointerGetAttributes.
// NOTE(review): original line 376 (the declaration of 'ca', presumably
// 'cudaPointerAttributes ca;' — TODO confirm) is missing.
 374 void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
 375 {
 377 ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
 378 // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
 379 // as the corresponding CUDA type, so a plain cast is enough.
 380 auto mem_type = static_cast<ePointerMemoryType>(ca.type);
 381 _fillPointerAttribute(attribute, mem_type, ca.device,
 382 ptr, ca.devicePointer, ca.hostPointer);
 383 }
 384
// Pushes an NVTX range, colored when color_rgb >= 0.
// NOTE(review): original lines 391, 394, 397 and 399 are missing — the
// 'eventAttrib' declaration, its 'size' and 'messageType' fields, and the
// nvtxRangePushEx call.
 385 void pushProfilerRange(const String& name, Int32 color_rgb) override
 386 {
 387#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
 388 if (color_rgb >= 0) {
 389 // NOTE: Ideally we would write: nvtxEventAttributes_t eventAttrib = { 0 };
 390 // but that triggers lots of 'missing initializer for member' warnings
 392 std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
 393 eventAttrib.version = NVTX_VERSION;
 395 eventAttrib.colorType = NVTX_COLOR_ARGB;
 396 eventAttrib.color = color_rgb;
 398 eventAttrib.message.ascii = name.localstr();
 400 }
 401 else
 402 nvtxRangePush(name.localstr());
 403#endif
 404 }
 405 void popProfilerRange() override
 406 {
 407#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
 408 nvtxRangePop();
 409#endif
 410 }
 411
// Releases the CUDA memory allocators at shutdown.
 412 void finalize(ITraceMng* tm) override
 413 {
 414 finalizeCudaMemoryAllocators(tm);
 415 }
 416
 417 public:
 418
 419 void fillDevices(bool is_verbose);
 420
 421 private:
 422
 423 Int64 m_nb_kernel_launched = 0;
 424 bool m_is_verbose = false;
 425 impl::DeviceInfoList m_device_info_list;
 426};
427
428/*---------------------------------------------------------------------------*/
429/*---------------------------------------------------------------------------*/
430
// Enumerates the available CUDA devices, builds a textual description of
// each one, registers them in m_device_info_list, then reads the
// ARCANE_CUPTI_* environment variables to optionally enable CUPTI profiling.
// NOTE(review): extraction artifact — original lines 440-442, 467, 473 and
// 483 are missing (the 'cudaDeviceProp dp' / cudaGetDeviceProperties /
// OStringStream declarations, the cudaDeviceGetStreamPriorityRange call,
// the CUuuid declaration and the DeviceInfo declaration — TODO confirm).
// Numeric prefixes are extraction artifacts; code left byte-identical.
431void CudaRunnerRuntime::
432fillDevices(bool is_verbose)
433{
434 int nb_device = 0;
435 ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
436 std::ostream& omain = std::cout;
437 if (is_verbose)
438 omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
439 for (int i = 0; i < nb_device; ++i) {
443 std::ostream& o = ostr.stream();
444 o << "Device " << i << " name=" << dp.name << "\n";
445 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
446 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
447 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
448 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
449 o << " warpSize = " << dp.warpSize << "\n";
450 o << " memPitch = " << dp.memPitch << "\n";
451 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
452 o << " totalConstMem = " << dp.totalConstMem << "\n";
453 o << " clockRate = " << dp.clockRate << "\n";
454 o << " deviceOverlap = " << dp.deviceOverlap << "\n";
455 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
456 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
457 o << " integrated = " << dp.integrated << "\n";
458 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
459 o << " computeMode = " << dp.computeMode << "\n";
460 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
461 << " " << dp.maxThreadsDim[2] << "\n";
462 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
463 << " " << dp.maxGridSize[2] << "\n";
464 {
465 int least_val = 0;
466 int greatest_val = 0;
468 o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
469 }
// Retrieves and prints the device UUID through the driver API.
470 {
471 CUdevice device;
472 ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
474 ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
475 o << " deviceUuid=";
476 _printUUID(o, device_uuid.bytes);
477 o << "\n";
478 }
479 String description(ostr.str());
480 if (is_verbose)
481 omain << description;
482
484 device_info.setDescription(description);
485 device_info.setDeviceId(DeviceId(i));
486 device_info.setName(dp.name);
487 m_device_info_list.addDevice(device_info);
488 }
489
490 Int32 global_cupti_level = 0;
491
 492 // Check whether CUPTI should be enabled
493 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
494 global_cupti_level = v.value();
495 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
496 global_cupti_flush = v.value();
497 bool do_print_cupti = true;
498 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
499 do_print_cupti = (v.value() != 0);
500
// Enabling CUPTI without compiled-in support is a hard error.
501 if (global_cupti_level > 0) {
502#ifndef ARCANE_HAS_CUDA_CUPTI
503 ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
504#endif
505 global_cupti_info.init(global_cupti_level, do_print_cupti);
506 global_cupti_info.start();
507 }
508}
509
510/*---------------------------------------------------------------------------*/
511/*---------------------------------------------------------------------------*/
512
// Memory copier using CUDA, registered with the memory-resource manager.
// NOTE(review): extraction artifact — original line 513 (the class name,
// per the registration code below presumably 'class CudaMemoryCopier') and
// line 517 (the 'MutableMemoryView to, eMemoryRessource to_mem,' part of
// the copy() signature, shown by the Doxygen tooltip) are missing.
// Numeric prefixes are extraction artifacts; code left byte-identical.
514: public IMemoryCopier
515{
// Copies 'from' to 'to'; goes through the RunQueue when one is supplied
// (honoring its async setting), otherwise does a direct cudaMemcpy.
 516 void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
 518 const RunQueue* queue) override
 519 {
 520 if (queue) {
 521 queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
 522 return;
 523 }
 524 // 'cudaMemcpyDefault' automatically figures out what to do based solely
 525 // on the pointer values. It would be worth checking whether using
 526 // \a from_mem and \a to_mem could improve performance.
 527 ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
 528 }
529};
530
531/*---------------------------------------------------------------------------*/
532/*---------------------------------------------------------------------------*/
533
534} // End namespace Arcane::Accelerator::Cuda
535
// NOTE(review): extraction artifact — original lines 538-539 are missing.
// Based on the uses of 'global_cuda_runtime' and 'global_cuda_memory_copier'
// in the registration function below, they presumably defined those two
// file-scope globals — TODO confirm against the repository source.
536namespace
537{
540} // namespace
541
542/*---------------------------------------------------------------------------*/
543/*---------------------------------------------------------------------------*/
544
545// Cette fonction est le point d'entrée utilisé lors du chargement
546// dynamique de cette bibliothèque
// Entry point used when this library is loaded dynamically: registers the
// CUDA runtime and its allocators/copier with Arcane, then enumerates the
// devices.
// NOTE(review): extraction artifact — original lines 555-556 are missing;
// given the 'mrm->' calls below, they presumably obtained the internal
// memory-resource manager pointer 'mrm' (see IMemoryRessourceMng::_internal()
// in the tooltips) — TODO confirm. Numeric prefixes are extraction
// artifacts; code left byte-identical.
547extern "C" ARCANE_EXPORT void
548arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
549{
550 using namespace Arcane;
551 using namespace Arcane::Accelerator::Cuda;
552 Arcane::Accelerator::impl::setUsingCUDARuntime(true);
553 Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
554 initializeCudaMemoryAllocators();
557 mrm->setIsAccelerator(true);
558 mrm->setAllocator(eMemoryRessource::UnifiedMemory, getCudaUnifiedMemoryAllocator());
559 mrm->setAllocator(eMemoryRessource::HostPinned, getCudaHostPinnedMemoryAllocator());
560 mrm->setAllocator(eMemoryRessource::Device, getCudaDeviceMemoryAllocator());
561 mrm->setCopier(&global_cuda_memory_copier);
562 global_cuda_runtime.fillDevices(init_info.isVerbose());
563}
564
565/*---------------------------------------------------------------------------*/
566/*---------------------------------------------------------------------------*/
#define ARCANE_CHECK_POINTER(ptr)
Macro retournant le pointeur ptr s'il est non nul ou lancant une exception s'il est nul.
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
void copy(ConstMemoryView from, eMemoryRessource from_mem, MutableMemoryView to, eMemoryRessource to_mem, const RunQueue *queue) override
Copie les données de from vers to avec la queue queue.
void notifyBeginLaunchKernel(impl::RunCommandImpl &c) override
Notification avant le lancement de la commande.
void barrier() override
Bloque jusqu'à ce que toutes les actions associées à cette file soient terminées.
bool _barrierNoException() override
Barrière sans exception. Retourne true en cas d'erreur.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Effectue un pré-chargement d'une zone mémoire.
void * _internalImpl() override
Pointeur sur la structure interne dépendante de l'implémentation.
void notifyEndLaunchKernel(impl::RunCommandImpl &) override
Notification de fin de lancement de la commande.
void copyMemory(const MemoryCopyArgs &args) override
Effectue une copie entre deux zones mémoire.
Identifiant d'un composant du système.
Definition DeviceId.h:33
bool isHost() const
Indique si l'instance est associée à l'hôte.
Definition DeviceId.h:60
Int32 asInt32() const
Valeur numérique du device.
Definition DeviceId.h:69
Information sur un device.
Definition DeviceInfo.h:32
Interface d'une liste de devices.
Arguments pour la copie mémoire.
Definition Memory.h:63
Arguments pour le préfetching mémoire.
Definition Memory.h:125
Informations sur une adresse mémoire.
Informations pour initialiser le runtime accélérateur.
Informations pour créer une RunQueue.
File d'exécution pour un accélérateur.
bool isAsync() const
Indique si la file d'exécution est asynchrone.
Definition RunQueue.cc:313
void copyMemory(const MemoryCopyArgs &args) const
Copie des informations entre deux zones mémoires.
Definition RunQueue.cc:230
Interface d'une liste de devices.
Interface de l'implémentation d'un évènement.
Interface d'un flux d'exécution pour une RunQueue.
Interface du runtime associé à une RunQueue.
Implémentation d'une commande pour accélérateur.
Interface pour les copies mémoire avec support des accélérateurs.
Partie interne à Arcane de 'IMemoryRessourceMng'.
virtual IMemoryRessourceMngInternal * _internal()=0
Interface interne.
Lecteur des fichiers de maillage via la bibliothèque LIMA.
Definition Lima.cc:120
Flot de sortie lié à une String.
Interface du gestionnaire de traces.
Chaîne de caractères unicode.
eMemoryAdvice
Conseils pour la gestion mémoire.
Definition Memory.h:36
@ AccessedByHost
Indique que la zone mémoire est accédée par l'hôte.
@ PreferredLocationDevice
Privilégié le positionnement de la mémoire sur l'accélérateur.
@ MostlyRead
Indique que la zone mémoire est principalement en lecture seule.
@ PreferredLocationHost
Privilégié le positionnement de la mémoire sur l'hôte.
@ AccessedByDevice
Indique que la zone mémoire est accédée par l'accélérateur.
ePointerMemoryType
Type de mémoire pour un pointeur.
eExecutionPolicy
Politique d'exécution pour un Runner.
@ CUDA
Politique d'exécution utilisant l'environnement CUDA.
IMemoryRessourceMng * getDataMemoryRessourceMng()
Gestionnaire de ressource mémoire pour les données.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Positionne l'allocateur spécifique pour les accélérateurs.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
eMemoryRessource
Liste des ressources mémoire disponibles.
@ HostPinned
Alloue sur l'hôte.
@ UnifiedMemory
Alloue en utilisant la mémoire unifiée.
@ Device
Alloue sur le device.
Espace de nom de Arccore.
Definition ArcaneTypes.h:24
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.