CudaAccelerator.cc
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CudaAccelerator.cc                                          (C) 2000-2024 */
/*                                                                           */
/* 'CUDA' backend for the accelerators.                                      */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arcane/utils/PlatformUtils.h"
#include "arcane/utils/Array.h"
#include "arcane/utils/TraceInfo.h"
#include "arcane/utils/NotSupportedException.h"
#include "arcane/utils/FatalErrorException.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/IMemoryAllocator.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ITraceMng.h"
#include "arcane/utils/internal/MemoryPool.h"

#include "arcane/accelerator/core/internal/MemoryTracer.h"

#include <iostream>

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::Cuda
{
using namespace Arcane::impl;
using namespace Arccore;

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

//! Flags selecting which allocators use a MemoryPool.
enum class MemoryPoolFlags
{
  UVM = 1,
  Device = 2,
  HostPinned = 4
};
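// Note: the ARCANE_ACCELERATOR_MEMORY_POOL environment variable read in the
// initialize() methods below is interpreted as a bitmask of these flags.
// For example (illustrative values): "1" enables the pool for unified memory
// only, "3" for unified + device memory, "7" for all three allocators.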

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void arcaneCheckCudaErrors(const TraceInfo& ti, cudaError_t e)
{
  if (e != cudaSuccess)
    ARCANE_FATAL("CUDA Error trace={0} e={1} str={2}", ti, e, cudaGetErrorString(e));
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void arcaneCheckCudaErrorsNoThrow(const TraceInfo& ti, cudaError_t e)
{
  if (e == cudaSuccess)
    return;
  String str = String::format("CUDA Error trace={0} e={1} str={2}", ti, e, cudaGetErrorString(e));
  FatalErrorException ex(ti, str);
  ex.write(std::cerr);
}
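// Usage sketch (for illustration; ARCANE_CHECK_CUDA and ARCANE_CHECK_CUDA_NOTHROW
// are assumed to be defined in CudaAccelerator.h and to forward the current
// TraceInfo to the two functions above):
//
//   ARCANE_CHECK_CUDA(cudaMemcpy(dst, src, size, cudaMemcpyDefault));  // throws on error
//   ARCANE_CHECK_CUDA_NOTHROW(cudaFree(ptr));  // only prints, safe in destructors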

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Common class to handle block allocation.
 */
class BlockAllocatorWrapper
{
 public:

  void initialize(Int64 block_size, bool do_block_alloc)
  {
    m_block_size = block_size;
    if (m_block_size <= 0)
      m_block_size = 128;
    m_do_block_allocate = do_block_alloc;
  }

  void dumpStats(std::ostream& ostr, const String& name)
  {
    ostr << "Allocator '" << name << "' : nb_allocate=" << m_nb_allocate
         << " nb_unaligned=" << m_nb_unaligned_allocate
         << "\n";
  }

  Int64 adjustedCapacity(Int64 wanted_capacity, Int64 element_size) const
  {
    const bool do_page = m_do_block_allocate;
    if (!do_page)
      return wanted_capacity;
    // Allocate a multiple of the block size.
    // For unified memory the block size is one memory page.
    // Since unified-memory transfers are done page by page, this makes it
    // possible to detect which allocations trigger a transfer.
    // We also limit the number of distinct block sizes so that a potential
    // MemoryPool does not end up holding too many different values.
    Int64 orig_capacity = wanted_capacity;
    Int64 new_size = orig_capacity * element_size;
    Int64 block_size = m_block_size;
    Int64 nb_iter = 4 + (4096 / block_size);
    for (Int64 i = 0; i < nb_iter; ++i) {
      if (new_size >= (4 * block_size))
        block_size *= 4;
      else
        break;
    }
    new_size = _computeNextMultiple(new_size, block_size);
    wanted_capacity = new_size / element_size;
    if (wanted_capacity < orig_capacity)
      wanted_capacity = orig_capacity;
    return wanted_capacity;
  }

  void doAllocate(void* ptr, [[maybe_unused]] size_t new_size)
  {
    ++m_nb_allocate;
    if (m_do_block_allocate) {
      uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
      if ((addr % m_block_size) != 0) {
        ++m_nb_unaligned_allocate;
      }
    }
  }

 private:

  //! Block size. Allocations are a multiple of this size.
  Int64 m_block_size = 128;
  //! True if allocation is done in blocks of m_block_size bytes.
  bool m_do_block_allocate = true;
  //! Number of allocations.
  std::atomic<Int32> m_nb_allocate = 0;
  //! Number of unaligned allocations.
  std::atomic<Int32> m_nb_unaligned_allocate = 0;

 private:

  //! Computes the smallest multiple of \a multiple greater than or equal to \a n.
  static Int64 _computeNextMultiple(Int64 n, Int64 multiple)
  {
    Int64 new_n = n / multiple;
    if ((n % multiple) != 0)
      ++new_n;
    return (new_n * multiple);
  }
};
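// Worked example (illustrative numbers): with a 4096-byte block (one memory
// page) and a request for 1000 elements of 8 bytes, adjustedCapacity()
// computes new_size = 8000. The loop keeps block_size at 4096 because
// 8000 < 4 * 4096, _computeNextMultiple(8000, 4096) returns 8192, and the
// adjusted capacity is 8192 / 8 = 1024 elements.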

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Base class of an allocator specific to 'Cuda'.
 */
class CudaMemoryAllocatorBase
: public AlignedMemoryAllocator3
{
 public:

  //! Interface to the concrete CUDA allocation/deallocation routines.
  class ConcreteAllocator
  {
   public:

    virtual ~ConcreteAllocator() = default;

   public:

    virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
    virtual cudaError_t _deallocate(void* ptr) = 0;
  };

  //! Adapter exposing the ConcreteAllocator through the IMemoryPoolAllocator interface.
  class UnderlyingAllocator
  : public IMemoryPoolAllocator
  {
   public:

    explicit UnderlyingAllocator(CudaMemoryAllocatorBase* v)
    : m_base(v)
    {
    }

   public:

    void* allocateMemory(size_t size) override
    {
      void* out = nullptr;
      ARCANE_CHECK_CUDA(m_base->m_concrete_allocator->_allocate(&out, size));
      m_base->m_block_wrapper.doAllocate(out, size);
      return out;
    }
    void freeMemory(void* ptr, [[maybe_unused]] size_t size) override
    {
      ARCANE_CHECK_CUDA_NOTHROW(m_base->m_concrete_allocator->_deallocate(ptr));
    }

   public:

    CudaMemoryAllocatorBase* m_base = nullptr;
  };
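  // The UnderlyingAllocator above adapts the ConcreteAllocator (the raw CUDA
  // malloc/free implementation) to the IMemoryPoolAllocator interface. The
  // base class then points m_sub_allocator either directly at this adapter or
  // at the MemoryPool built on top of it (see _setUseMemoryPool()), so the
  // pool can cache and reuse freed blocks instead of calling cudaFree().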

 public:

  CudaMemoryAllocatorBase(const String& allocator_name, ConcreteAllocator* concrete_allocator)
  : AlignedMemoryAllocator3(128)
  , m_concrete_allocator(concrete_allocator)
  , m_direct_sub_allocator(this)
  , m_memory_pool(&m_direct_sub_allocator, allocator_name)
  , m_sub_allocator(&m_direct_sub_allocator)
  , m_allocator_name(allocator_name)
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_MEMORY_PRINT_LEVEL", true))
      m_print_level = v.value();
  }

  ~CudaMemoryAllocatorBase()
  {
  }

 public:

  void finalize(ITraceMng* tm)
  {
    if (m_print_level >= 1) {
      OStringStream ostr;
      if (m_use_memory_pool) {
        m_memory_pool.dumpStats(ostr());
        m_memory_pool.dumpFreeMap(ostr());
      }
      ostr() << "Allocator '" << m_allocator_name << "' nb_realloc=" << m_nb_reallocate
             << " realloc_copy=" << m_reallocate_size << "\n";
      m_block_wrapper.dumpStats(ostr(), m_allocator_name);
      if (tm)
        tm->info() << ostr.str();
      else
        std::cout << ostr.str();
    }

    m_memory_pool.freeCachedMemory();
  }

 public:

  bool hasRealloc(MemoryAllocationArgs) const final { return true; }
  AllocatedMemoryInfo allocate(MemoryAllocationArgs args, Int64 new_size) final
  {
    void* out = m_sub_allocator->allocateMemory(new_size);
    Int64 a = reinterpret_cast<Int64>(out);
    if ((a % 128) != 0)
      ARCANE_FATAL("Bad alignment for CUDA allocator: offset={0}", (a % 128));
    m_tracer.traceAllocate(out, new_size, args);
    _applyHint(out, new_size, args);
    return { out, new_size };
  }
  AllocatedMemoryInfo reallocate(MemoryAllocationArgs args, AllocatedMemoryInfo current_info, Int64 new_size) final
  {
    ++m_nb_reallocate;
    Int64 current_size = current_info.size();
    m_reallocate_size += current_size;
    String array_name = args.arrayName();
    const bool do_print = (m_print_level >= 2);
    if (do_print) {
      std::cout << "Reallocate allocator=" << m_allocator_name
                << " current_size=" << current_size
                << " current_capacity=" << current_info.capacity()
                << " new_capacity=" << new_size
                << " ptr=" << current_info.baseAddress();
      if (array_name.null() && m_print_level >= 3) {
        std::cout << " stack=" << platform::getStackTrace();
      }
      else {
        std::cout << " name=" << array_name;
        if (m_print_level >= 4)
          std::cout << " stack=" << platform::getStackTrace();
      }
      std::cout << "\n";
    }
    if (m_use_memory_pool)
      _removeHint(current_info.baseAddress(), current_size, args);
    AllocatedMemoryInfo a = allocate(args, new_size);
    // TODO: remove the hints after the deallocate because the memory area may be reused.
    ARCANE_CHECK_CUDA(cudaMemcpy(a.baseAddress(), current_info.baseAddress(), current_size, cudaMemcpyDefault));
    deallocate(args, current_info);
    return a;
  }
  void deallocate(MemoryAllocationArgs args, AllocatedMemoryInfo mem_info) final
  {
    void* ptr = mem_info.baseAddress();
    size_t mem_size = mem_info.capacity();
    if (m_use_memory_pool)
      _removeHint(ptr, mem_size, args);
    // Do not throw exceptions on deallocation errors: they often happen in
    // destructors and throwing there would abort the program through
    // std::terminate().
    m_tracer.traceDeallocate(mem_info, args);
    m_sub_allocator->freeMemory(ptr, mem_size);
  }

  Int64 adjustedCapacity(MemoryAllocationArgs args, Int64 wanted_capacity, Int64 element_size) const final
  {
    wanted_capacity = AlignedMemoryAllocator3::adjustedCapacity(args, wanted_capacity, element_size);
    return m_block_wrapper.adjustedCapacity(wanted_capacity, element_size);
  }

 protected:

  virtual void _applyHint([[maybe_unused]] void* ptr, [[maybe_unused]] size_t new_size,
                          [[maybe_unused]] MemoryAllocationArgs args) {}
  virtual void _removeHint([[maybe_unused]] void* ptr, [[maybe_unused]] size_t new_size,
                           [[maybe_unused]] MemoryAllocationArgs args) {}

 private:

  impl::MemoryTracerWrapper m_tracer;
  std::unique_ptr<ConcreteAllocator> m_concrete_allocator;
  UnderlyingAllocator m_direct_sub_allocator;
  MemoryPool m_memory_pool;
  IMemoryPoolAllocator* m_sub_allocator = nullptr;
  bool m_use_memory_pool = false;
  String m_allocator_name;
  std::atomic<Int32> m_nb_reallocate = 0;
  std::atomic<Int64> m_reallocate_size = 0;
  Int32 m_print_level = 0;

 protected:

  BlockAllocatorWrapper m_block_wrapper;

 protected:

  void _setTraceLevel(Int32 v) { m_tracer.setTraceLevel(v); }
  // IMPORTANT: must be called before any allocation and must not be changed afterwards.
  void _setUseMemoryPool(bool is_used)
  {
    IMemoryPoolAllocator* mem_pool = &m_memory_pool;
    IMemoryPoolAllocator* direct = &m_direct_sub_allocator;
    m_sub_allocator = (is_used) ? mem_pool : direct;
    m_use_memory_pool = is_used;
    if (is_used) {
      if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_MEMORY_POOL_MAX_BLOCK_SIZE", true)) {
        if (v.value() < 0)
          ARCANE_FATAL("Invalid value '{0}' for memory pool max block size", v.value());
        size_t block_size = static_cast<size_t>(v.value());
        m_memory_pool.setMaxCachedBlockSize(block_size);
      }
    }
  }
};
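// Environment variables used by this base class (as read above):
// - ARCANE_ACCELERATOR_MEMORY_PRINT_LEVEL: 1 dumps statistics in finalize(),
//   2 also prints every reallocation, 3 adds a stack trace for unnamed
//   arrays, 4 always adds the stack trace.
// - ARCANE_ACCELERATOR_MEMORY_POOL_MAX_BLOCK_SIZE: size threshold (in bytes)
//   at or above which a freed block is not kept in the memory pool cache.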

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Allocator for CUDA unified memory (cudaMallocManaged).
 */
class UnifiedMemoryCudaMemoryAllocator
: public CudaMemoryAllocatorBase
{
 public:

  class Allocator
  : public ConcreteAllocator
  {
   public:

    Allocator()
    {
      if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
        m_use_ats = v.value();
    }

    cudaError_t _deallocate(void* ptr) final
    {
      if (m_use_ats) {
        ::free(ptr);
        return cudaSuccess;
      }
      //std::cout << "CUDA_MANAGED_FREE ptr=" << ptr << "\n";
      return ::cudaFree(ptr);
    }

    cudaError_t _allocate(void** ptr, size_t new_size) final
    {
      if (m_use_ats) {
        *ptr = ::aligned_alloc(128, new_size);
      }
      else {
        auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
        //std::cout << "CUDA_MANAGED_MALLOC ptr=" << (*ptr) << " size=" << new_size << "\n";
        //if (new_size < 4000)
        //std::cout << "STACK=" << platform::getStackTrace() << "\n";

        if (r != cudaSuccess)
          return r;
      }

      return cudaSuccess;
    }

   public:

    bool m_use_ats = false;
  };
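  // Note on ARCANE_CUDA_USE_ALLOC_ATS: when m_use_ats is set, the allocator
  // assumes the system provides Address Translation Services (ATS), i.e. the
  // GPU can directly access ordinary host allocations, so a plain
  // aligned_alloc(128, ...) / free() pair replaces cudaMallocManaged()/cudaFree().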

 public:

  UnifiedMemoryCudaMemoryAllocator()
  : CudaMemoryAllocatorBase("UnifiedMemoryCudaMemory", new Allocator())
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MALLOC_TRACE", true))
      _setTraceLevel(v.value());
  }

  void initialize()
  {
    bool do_page_allocate = true;
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_UM_PAGE_ALLOC", true))
      do_page_allocate = (v.value() != 0);
    Int64 page_size = platform::getPageSize();
    m_block_wrapper.initialize(page_size, do_page_allocate);

    bool use_memory_pool = false;
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_MEMORY_POOL", true))
      use_memory_pool = (v.value() & static_cast<int>(MemoryPoolFlags::UVM)) != 0;
    _setUseMemoryPool(use_memory_pool);
  }
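  // With ARCANE_CUDA_UM_PAGE_ALLOC enabled (the default), the block size is the
  // host page size returned by platform::getPageSize(), so each allocation
  // covers whole pages and unified-memory page migrations can be attributed to
  // a specific allocation (see BlockAllocatorWrapper::adjustedCapacity()).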

 public:

  void notifyMemoryArgsChanged([[maybe_unused]] MemoryAllocationArgs old_args,
                               MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
  {
    void* p = ptr.baseAddress();
    Int64 s = ptr.capacity();
    if (p && s > 0)
      _applyHint(ptr.baseAddress(), ptr.size(), new_args);
  }

 protected:

  void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)
  {
    eMemoryLocationHint hint = args.memoryLocationHint();

    // Use the currently active device as the default GPU.
    // Only do it when the hint requires it, to avoid calling
    // cudaGetDevice() every time.
    int device_id = 0;
    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      ARCANE_CHECK_CUDA(cudaGetDevice(&device_id));
    }
    //std::cout << "SET_MEMORY_HINT name=" << args.arrayName() << " size=" << new_size << " hint=" << (int)hint << "\n";
    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_id));
    }
    if (hint == eMemoryLocationHint::MainlyHost) {
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
      //ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, 0));
    }
    if (hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_id));
    }
  }
  void _removeHint(void* p, size_t size, MemoryAllocationArgs args)
  {
    eMemoryLocationHint hint = args.memoryLocationHint();
    if (hint == eMemoryLocationHint::None)
      return;
    int device_id = 0;
    ARCANE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, device_id));
  }
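  // Reminder on the advice used above: cudaMemAdviseSetReadMostly lets the
  // driver duplicate read-mostly pages on the processors that read them, and
  // cudaMemAdviseSetPreferredLocation selects the device (or the host, via
  // cudaCpuDeviceId) where the pages should preferably reside. _removeHint()
  // above only undoes the ReadMostly advice before the memory is freed or reused.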

 private:

  bool m_use_ats = false;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
 * \brief Allocator for page-locked (pinned) host memory.
 */
class HostPinnedCudaMemoryAllocator
: public CudaMemoryAllocatorBase
{
 public:

  class Allocator
  : public ConcreteAllocator
  {
   public:

    cudaError_t _allocate(void** ptr, size_t new_size) final
    {
      return ::cudaMallocHost(ptr, new_size);
    }
    cudaError_t _deallocate(void* ptr) final
    {
      return ::cudaFreeHost(ptr);
    }
  };

 public:

  HostPinnedCudaMemoryAllocator()
  : CudaMemoryAllocatorBase("HostPinnedCudaMemory", new Allocator())
  {
  }

 public:

  void initialize()
  {
    bool use_memory_pool = false;
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_MEMORY_POOL", true))
      use_memory_pool = (v.value() & static_cast<int>(MemoryPoolFlags::HostPinned)) != 0;
    _setUseMemoryPool(use_memory_pool);
    m_block_wrapper.initialize(128, use_memory_pool);
  }
};
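// cudaMallocHost() returns page-locked (pinned) host memory. Pinned buffers
// are required for truly asynchronous host<->device copies, which is
// presumably why this allocator is kept separate from the unified and device
// allocators.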

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
 * \brief Allocator for device memory (cudaMalloc).
 */
class DeviceCudaMemoryAllocator
: public CudaMemoryAllocatorBase
{
  class Allocator
  : public ConcreteAllocator
  {
   public:

    Allocator()
    {
      if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
        m_use_ats = v.value();
    }

    cudaError_t _allocate(void** ptr, size_t new_size) final
    {
      if (m_use_ats) {
        // FIXME: it does not work on WIN32
        *ptr = std::aligned_alloc(128, new_size);
        if (*ptr)
          return cudaSuccess;
        return cudaErrorMemoryAllocation;
      }
      cudaError_t r = ::cudaMalloc(ptr, new_size);
      //std::cout << "ALLOCATE_DEVICE ptr=" << (*ptr) << " size=" << new_size << " r=" << (int)r << "\n";
      return r;
    }
    cudaError_t _deallocate(void* ptr) final
    {
      if (m_use_ats) {
        std::free(ptr);
        return cudaSuccess;
      }
      //std::cout << "FREE_DEVICE ptr=" << ptr << "\n";
      return ::cudaFree(ptr);
    }

   private:

    bool m_use_ats = false;
  };

 public:

  DeviceCudaMemoryAllocator()
  : CudaMemoryAllocatorBase("DeviceCudaMemoryAllocator", new Allocator())
  {
  }

 public:

  void initialize()
  {
    bool use_memory_pool = false;
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_MEMORY_POOL", true))
      use_memory_pool = (v.value() & static_cast<int>(MemoryPoolFlags::Device)) != 0;
    _setUseMemoryPool(use_memory_pool);
    m_block_wrapper.initialize(128, use_memory_pool);
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace
{
  UnifiedMemoryCudaMemoryAllocator unified_memory_cuda_memory_allocator;
  HostPinnedCudaMemoryAllocator host_pinned_cuda_memory_allocator;
  DeviceCudaMemoryAllocator device_cuda_memory_allocator;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

Arccore::IMemoryAllocator*
getCudaMemoryAllocator()
{
  return &unified_memory_cuda_memory_allocator;
}

Arccore::IMemoryAllocator*
getCudaDeviceMemoryAllocator()
{
  return &device_cuda_memory_allocator;
}

Arccore::IMemoryAllocator*
getCudaUnifiedMemoryAllocator()
{
  return &unified_memory_cuda_memory_allocator;
}

Arccore::IMemoryAllocator*
getCudaHostPinnedMemoryAllocator()
{
  return &host_pinned_cuda_memory_allocator;
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void initializeCudaMemoryAllocators()
{
  unified_memory_cuda_memory_allocator.initialize();
  device_cuda_memory_allocator.initialize();
  host_pinned_cuda_memory_allocator.initialize();
}

void finalizeCudaMemoryAllocators(ITraceMng* tm)
{
  unified_memory_cuda_memory_allocator.finalize(tm);
  device_cuda_memory_allocator.finalize(tm);
  host_pinned_cuda_memory_allocator.finalize(tm);
}
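// Expected call sequence (sketch, based on the functions above): the CUDA
// backend calls initializeCudaMemoryAllocators() once at startup, before any
// allocation is made, and finalizeCudaMemoryAllocators(tm) at shutdown so
// that each allocator can dump its statistics and release the memory cached
// in its pool.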

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // namespace Arcane::Accelerator::Cuda

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/