14#include "arccore/accelerator_native/HipAccelerator.h"
16#include "arccore/base/FatalErrorException.h"
18#include "arccore/common/internal/MemoryUtilsInternal.h"
19#include "arccore/common/internal/IMemoryResourceMngInternal.h"
21#include "arccore/common/accelerator/RunQueueBuildInfo.h"
22#include "arccore/common/accelerator/Memory.h"
23#include "arccore/common/accelerator/DeviceInfoList.h"
24#include "arccore/common/accelerator/KernelLaunchArgs.h"
25#include "arccore/common/accelerator/RunQueue.h"
26#include "arccore/common/accelerator/DeviceMemoryInfo.h"
27#include "arccore/common/accelerator/NativeStream.h"
28#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
29#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
30#include "arccore/common/accelerator/internal/RunCommandImpl.h"
31#include "arccore/common/accelerator/internal/IRunQueueStream.h"
32#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
33#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
37#ifdef ARCCORE_HAS_ROCTX
43namespace Arcane::Accelerator::Hip
// Abstract interface implemented by each concrete HIP allocator below.
// NOTE(review): the enclosing class header lies outside this extract — confirm its name.
// _allocate() wraps the underlying HIP allocation call and returns its error code.
57 virtual hipError_t _allocate(
void** ptr,
size_t new_size) = 0;
// _deallocate() releases a block previously returned by _allocate().
58 virtual hipError_t _deallocate(
void* ptr) = 0;
// Adapter that plugs a ConcreteAllocatorType (hipMalloc / hipMallocManaged /
// hipHostMalloc wrappers) into the generic accelerator allocator machinery.
64template <
typename ConcreteAllocatorType>
65class UnderlyingAllocator
70 UnderlyingAllocator() =
default;
// allocateMemory: delegates to the concrete _allocate(); ARCCORE_CHECK_HIP
// turns a HIP error code into an exception.
77 ARCCORE_CHECK_HIP(m_concrete_allocator._allocate(&out, size));
// freeMemory: 'size' is not needed by the HIP free functions, only the pointer.
80 void freeMemory(
void* ptr, [[maybe_unused]]
size_t size)
final
// NOTHROW variant: deallocation may run during stack unwinding.
82 ARCCORE_CHECK_HIP_NOTHROW(m_concrete_allocator._deallocate(ptr));
85 void doMemoryCopy(
void* destination,
const void* source,
Int64 size)
final
// hipMemcpyDefault lets the HIP runtime infer the transfer direction
// from the two pointers.
87 ARCCORE_CHECK_HIP(hipMemcpy(destination, source, size, hipMemcpyDefault));
// Reports which eMemoryResource the concrete allocator serves.
92 return m_concrete_allocator.memoryResource();
97 ConcreteAllocatorType m_concrete_allocator;
// Concrete allocator for unified (managed) memory.
// Plain hipFree() is the matching release call for hipMallocManaged().
108 hipError_t _deallocate(
void* ptr)
final
110 return ::hipFree(ptr);
113 hipError_t _allocate(
void** ptr,
size_t new_size)
final
// hipMemAttachGlobal: the managed allocation is accessible from any stream.
115 auto r = ::hipMallocManaged(ptr, new_size, hipMemAttachGlobal);
// Allocator (AcceleratorMemoryAllocatorBase) backed by HIP unified (managed)
// memory. NOTE(review): constructor body lies outside this extract.
125class UnifiedMemoryHipMemoryAllocator
126:
public AcceleratorMemoryAllocatorBase
130 UnifiedMemoryHipMemoryAllocator()
// Concrete allocator for host-pinned (page-locked) memory:
// hipHostMalloc / hipHostFree pair.
151 hipError_t _allocate(
void** ptr,
size_t new_size)
final
153 return ::hipHostMalloc(ptr, new_size);
155 hipError_t _deallocate(
void* ptr)
final
157 return ::hipHostFree(ptr);
// Allocator (AcceleratorMemoryAllocatorBase) backed by host-pinned memory.
// NOTE(review): constructor body lies outside this extract.
165class HostPinnedHipMemoryAllocator
166:
public AcceleratorMemoryAllocatorBase
171 HostPinnedHipMemoryAllocator()
// Concrete allocator for device-only memory (hipMalloc / hipFree pair).
187class DeviceConcreteAllocator
192 DeviceConcreteAllocator()
196 hipError_t _allocate(
void** ptr,
size_t new_size)
final
198 hipError_t r = ::hipMalloc(ptr, new_size);
201 hipError_t _deallocate(
void* ptr)
final
203 return ::hipFree(ptr);
// Allocator (AcceleratorMemoryAllocatorBase) backed by device memory.
// NOTE(review): constructor body lies outside this extract.
212class DeviceHipMemoryAllocator
213:
public AcceleratorMemoryAllocatorBase
218 DeviceHipMemoryAllocator()
244void initializeHipMemoryAllocators()
246 unified_memory_hip_memory_allocator.initialize();
247 device_hip_memory_allocator.initialize();
248 host_pinned_hip_memory_allocator.initialize();
251void finalizeHipMemoryAllocators(
ITraceMng* tm)
253 unified_memory_hip_memory_allocator.finalize(tm);
254 device_hip_memory_allocator.finalize(tm);
255 host_pinned_hip_memory_allocator.finalize(tm);
// IRunQueueStream implementation wrapping a hipStream_t.
261class HipRunQueueStream
// Default-priority stream when the build info only has default values.
270 ARCCORE_CHECK_HIP(hipStreamCreate(&m_hip_stream));
// Otherwise honour the priority requested in the RunQueueBuildInfo.
272 int priority = bi.priority();
273 ARCCORE_CHECK_HIP(hipStreamCreateWithPriority(&m_hip_stream, hipStreamDefault, priority));
276 ~HipRunQueueStream()
override
// NOTHROW: destructors must not throw.
278 ARCCORE_CHECK_HIP_NOTHROW(hipStreamDestroy(m_hip_stream));
// notifyBeginLaunchKernel: push an ROCTX profiling range when compiled in,
// then forward the notification to the runtime.
285#ifdef ARCCORE_HAS_ROCTX
286 auto kname = c.kernelName();
288 roctxRangePush(c.traceInfo().name());
290 roctxRangePush(kname.localstr());
292 return m_runtime->notifyBeginLaunchKernel();
296#ifdef ARCCORE_HAS_ROCTX
299 return m_runtime->notifyEndLaunchKernel();
// barrier: block the host until all work queued on this stream completes.
303 ARCCORE_CHECK_HIP(hipStreamSynchronize(m_hip_stream));
// _barrierNoException: returns true on error instead of throwing.
307 return hipStreamSynchronize(m_hip_stream) != hipSuccess;
// copyMemory: asynchronous copy on this stream; hipMemcpyDefault lets HIP
// infer the direction from the pointers.
311 auto r = hipMemcpyAsync(args.destination().
data(), args.source().
data(),
312 args.source().
bytes().
size(), hipMemcpyDefault, m_hip_stream);
313 ARCCORE_CHECK_HIP(r);
// prefetchMemory: migrate the byte range towards 'device'
// (hipCpuDeviceId when the target is the host).
319 auto src = args.source().
bytes();
323 int device = hipCpuDeviceId;
326 auto r = hipMemPrefetchAsync(src.data(), src.size(), device, m_hip_stream);
327 ARCCORE_CHECK_HIP(r);
// Accessor for the underlying native stream.
338 hipStream_t trueStream()
const
346 hipStream_t m_hip_stream;
// IRunQueueEventImpl implementation wrapping a hipEvent_t.
352class HipRunQueueEvent
357 explicit HipRunQueueEvent(
bool has_timer)
// With a timer: regular event; without: disable timing for lower overhead.
360 ARCCORE_CHECK_HIP(hipEventCreate(&m_hip_event));
362 ARCCORE_CHECK_HIP(hipEventCreateWithFlags(&m_hip_event, hipEventDisableTiming));
364 ~HipRunQueueEvent()
override
366 ARCCORE_CHECK_HIP_NOTHROW(hipEventDestroy(m_hip_event));
// Record this event on the queue's underlying HIP stream.
375 ARCCORE_CHECK_HIP(hipEventRecord(m_hip_event, rq->trueStream()));
// Host-side wait until the recorded event completes.
380 ARCCORE_CHECK_HIP(hipEventSynchronize(m_hip_event));
// Make the stream wait (device-side) on this event.
386 ARCCORE_CHECK_HIP(hipStreamWaitEvent(rq->trueStream(), m_hip_event, 0));
// Elapsed time between 'from_event' and this event, in nanoseconds:
// hipEventElapsedTime reports milliseconds, * 1.0e6 converts ms -> ns.
389 Int64 elapsedTime(IRunQueueEventImpl* from_event)
final
391 auto* true_from_event =
static_cast<HipRunQueueEvent*
>(from_event);
393 float time_in_ms = 0.0;
394 ARCCORE_CHECK_HIP(hipEventElapsedTime(&time_in_ms, true_from_event->m_hip_event, m_hip_event));
395 double x = time_in_ms * 1.0e6;
// hasPendingWork: hipErrorNotReady means work is still queued;
// any other failure is surfaced through ARCCORE_CHECK_HIP.
400 bool hasPendingWork()
final
402 hipError_t v = hipEventQuery(m_hip_event);
403 if (v == hipErrorNotReady)
405 ARCCORE_CHECK_HIP(v);
411 hipEvent_t m_hip_event;
// IRunnerRuntime hooks: per-launch bookkeeping and synchronization.
426 void notifyBeginLaunchKernel()
override
428 ++m_nb_kernel_launched;
// Trace output — presumably guarded by m_is_verbose (guard line not in this
// extract); confirm against the full file.
430 std::cout <<
"BEGIN HIP KERNEL!\n";
432 void notifyEndLaunchKernel()
override
// Surface any asynchronous launch error as early as possible.
434 ARCCORE_CHECK_HIP(hipGetLastError());
436 std::cout <<
"END HIP KERNEL!\n";
438 void barrier()
override
// Device-wide synchronization (all streams on the current device).
440 ARCCORE_CHECK_HIP(hipDeviceSynchronize());
// setMemoryAdvice: translate an eMemoryAdvice value into a hipMemAdvise flag
// and apply it to the buffer's byte range. The branch conditions selecting
// each assignment lie outside this extract.
460 auto v = buffer.
bytes();
461 const void* ptr = v.
data();
462 size_t count = v.size();
463 int device = device_id.
asInt32();
464 hipMemoryAdvise hip_advise;
467 hip_advise = hipMemAdviseSetReadMostly;
469 hip_advise = hipMemAdviseSetPreferredLocation;
471 hip_advise = hipMemAdviseSetAccessedBy;
// Host-targeted advice uses the special hipCpuDeviceId device id.
473 hip_advise = hipMemAdviseSetPreferredLocation;
474 device = hipCpuDeviceId;
477 hip_advise = hipMemAdviseSetAccessedBy;
478 device = hipCpuDeviceId;
483 ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
// unsetMemoryAdvice: mirror of the above using the 'Unset' flags.
487 auto v = buffer.
bytes();
488 const void* ptr = v.
data();
489 size_t count = v.size();
490 int device = device_id.
asInt32();
491 hipMemoryAdvise hip_advise;
494 hip_advise = hipMemAdviseUnsetReadMostly;
496 hip_advise = hipMemAdviseUnsetPreferredLocation;
498 hip_advise = hipMemAdviseUnsetAccessedBy;
500 hip_advise = hipMemAdviseUnsetPreferredLocation;
501 device = hipCpuDeviceId;
504 hip_advise = hipMemAdviseUnsetAccessedBy;
505 device = hipCpuDeviceId;
509 ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
// setCurrentDevice: select the active HIP device for the calling thread.
512 void setCurrentDevice(
DeviceId device_id)
final
516 ARCCORE_CHECK_HIP(hipSetDevice(
id));
518 const IDeviceInfoList* deviceInfoList()
override {
return &m_device_info_list; }
// getPointerAttribute: classify 'ptr' as Managed/Host/Device/Unregistered
// using hipPointerGetAttributes.
520 void getPointerAttribute(
PointerAttribute& attribute,
const void* ptr)
override
522 hipPointerAttribute_t pa;
523 hipError_t ret_value = hipPointerGetAttributes(&pa, ptr);
// Default classification when HIP does not know the pointer.
524 auto mem_type = ePointerMemoryType::Unregistered;
528 if (ret_value==hipSuccess){
// HIP 6 renamed the attribute field 'memoryType' to 'type'.
529#if HIP_VERSION_MAJOR >= 6
530 auto rocm_memory_type = pa.type;
532 auto rocm_memory_type = pa.memoryType;
535 mem_type = ePointerMemoryType::Managed;
536 else if (rocm_memory_type == hipMemoryTypeHost)
537 mem_type = ePointerMemoryType::Host;
538 else if (rocm_memory_type == hipMemoryTypeDevice)
539 mem_type = ePointerMemoryType::Device;
546 _fillPointerAttribute(attribute, mem_type, pa.device,
547 ptr, pa.devicePointer, pa.hostPointer);
// getDeviceMemoryInfo: query free/total memory of 'device_id', temporarily
// switching the current device and restoring it afterwards.
553 int wanted_d = device_id.
asInt32();
554 ARCCORE_CHECK_HIP(hipGetDevice(&d));
556 ARCCORE_CHECK_HIP(hipSetDevice(wanted_d));
558 size_t total_mem = 0;
559 ARCCORE_CHECK_HIP(hipMemGetInfo(&free_mem, &total_mem));
// Restore the previously current device.
561 ARCCORE_CHECK_HIP(hipSetDevice(d));
563 dmi.setFreeMemory(free_mem);
564 dmi.setTotalMemory(total_mem);
// Profiler ranges are effective only when ROCTX support was compiled in;
// otherwise these hooks are no-ops (bodies lie outside this extract).
568 void pushProfilerRange(
const String& name, [[maybe_unused]]
Int32 color)
override
570#ifdef ARCCORE_HAS_ROCTX
574 void popProfilerRange()
override
576#ifdef ARCCORE_HAS_ROCTX
// Runtime finalization: release the global allocators.
583 finalizeHipMemoryAllocators(tm);
// Fills m_device_info_list; defined below.
588 void fillDevices(
bool is_verbose);
// Launch counter and verbosity flag.
592 Int64 m_nb_kernel_launched = 0;
593 bool m_is_verbose =
false;
// Enumerates the available HIP devices, prints their properties and
// registers each one in m_device_info_list.
600void HipRunnerRuntime::
601fillDevices(
bool is_verbose)
604 ARCCORE_CHECK_HIP(hipGetDeviceCount(&nb_device));
605 std::ostream& omain = std::cout;
607 omain <<
"ArcaneHIP: Initialize Arcane HIP runtime nb_available_device=" << nb_device <<
"\n";
608 for (
int i = 0; i < nb_device; ++i) {
// Per-device description is built in a string stream, then printed and
// stored in the DeviceInfo below.
609 std::ostringstream ostr;
610 std::ostream& o = ostr;
613 ARCCORE_CHECK_HIP(hipGetDeviceProperties(&dp, i));
615 int has_managed_memory = 0;
616 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i));
// HIP encodes versions as major*1e7 + minor*1e5 + patch
// (see the /10000000, /100000 % 100 and % 100000 decoding below).
621 int runtime_version = 0;
622 ARCCORE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version));
624 int runtime_major = runtime_version / 10000000;
625 int runtime_minor = (runtime_version / 100000) % 100;
627 int driver_version = 0;
628 ARCCORE_CHECK_HIP(hipDriverGetVersion(&driver_version));
630 int driver_major = driver_version / 10000000;
631 int driver_minor = (driver_version / 100000) % 100;
// Dump of the hipDeviceProp_t fields.
633 o <<
"\nDevice " << i <<
" name=" << dp.name <<
"\n";
634 o <<
" Driver version = " << driver_major <<
"." << (driver_minor) <<
"." << (driver_version % 100000) <<
"\n";
635 o <<
" Runtime version = " << runtime_major <<
"." << (runtime_minor) <<
"." << (runtime_version % 100000) <<
"\n";
636 o <<
" computeCapability = " << dp.major <<
"." << dp.minor <<
"\n";
637 o <<
" totalGlobalMem = " << dp.totalGlobalMem <<
"\n";
638 o <<
" sharedMemPerBlock = " << dp.sharedMemPerBlock <<
"\n";
639 o <<
" regsPerBlock = " << dp.regsPerBlock <<
"\n";
640 o <<
" warpSize = " << dp.warpSize <<
"\n";
641 o <<
" memPitch = " << dp.memPitch <<
"\n";
642 o <<
" maxThreadsPerBlock = " << dp.maxThreadsPerBlock <<
"\n";
643 o <<
" totalConstMem = " << dp.totalConstMem <<
"\n";
644 o <<
" clockRate = " << dp.clockRate <<
"\n";
646 o <<
" multiProcessorCount = " << dp.multiProcessorCount <<
"\n";
647 o <<
" kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled <<
"\n";
648 o <<
" integrated = " << dp.integrated <<
"\n";
649 o <<
" canMapHostMemory = " << dp.canMapHostMemory <<
"\n";
650 o <<
" computeMode = " << dp.computeMode <<
"\n";
651 o <<
" maxThreadsDim = " << dp.maxThreadsDim[0] <<
" " << dp.maxThreadsDim[1]
652 <<
" " << dp.maxThreadsDim[2] <<
"\n";
653 o <<
" maxGridSize = " << dp.maxGridSize[0] <<
" " << dp.maxGridSize[1]
654 <<
" " << dp.maxGridSize[2] <<
"\n";
655 o <<
" concurrentManagedAccess = " << dp.concurrentManagedAccess <<
"\n";
656 o <<
" directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost <<
"\n";
657 o <<
" gcnArchName = " << dp.gcnArchName <<
"\n";
658 o <<
" pageableMemoryAccess = " << dp.pageableMemoryAccess <<
"\n";
659 o <<
" pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables <<
"\n";
660 o <<
" hasManagedMemory = " << has_managed_memory <<
"\n";
// These hipDeviceProp_t fields only exist from HIP 6 onward.
661#if HIP_VERSION_MAJOR >= 6
662 o <<
" gpuDirectRDMASupported = " << dp.gpuDirectRDMASupported <<
"\n";
663 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
664 o <<
" unifiedFunctionPointers = " << dp.unifiedFunctionPointers <<
"\n";
// Append the device UUID to the description.
668 ARCCORE_CHECK_HIP(hipDeviceGet(&device, i));
670 ARCCORE_CHECK_HIP(hipDeviceGetUuid(&device_uuid, device));
672 impl::printUUID(o, device_uuid.bytes);
676 String description(ostr.str());
678 omain << description;
// Register the device in the runtime's device list.
681 device_info.setDescription(description);
682 device_info.setDeviceId(
DeviceId(i));
683 device_info.setName(dp.name);
684 device_info.setWarpSize(dp.warpSize);
685 m_device_info_list.addDevice(device_info);
// Blocking copy between the two memory views; hipMemcpyDefault lets HIP
// infer the transfer direction from the pointers.
706 ARCCORE_CHECK_HIP(hipMemcpy(to.
data(), from.
data(), from.
bytes().
size(), hipMemcpyDefault));
// Exported entry point called by Arcane to register the HIP runtime:
// marks HIP as active, installs the global runtime, initializes the
// allocators and routes memory copies through the HIP-aware copier.
726extern "C" ARCCORE_EXPORT
void
730 using namespace Arcane::Accelerator::Hip;
731 Arcane::Accelerator::impl::setUsingHIPRuntime(
true);
732 Arcane::Accelerator::impl::setHIPRunQueueRuntime(&global_hip_runtime);
733 initializeHipMemoryAllocators();
// 'mrm' is the memory-resource manager; install the HIP copier on it.
741 mrm->
setCopier(&global_hip_memory_copier);
742 global_hip_runtime.fillDevices(init_info.isVerbose());
#define ARCCORE_CHECK_POINTER(ptr)
Macro retournant le pointeur ptr s'il est non nul ou lançant une exception s'il est nul.
#define ARCCORE_FATAL_IF(cond,...)
Macro envoyant une exception FatalErrorException si cond est vrai.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialisation pour la mémoire Device.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialisation pour la mémoire HostPinned.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialisation pour la mémoire UVM.
Identifiant d'un composant du système.
bool isHost() const
Indique si l'instance est associée à l'hôte.
Int32 asInt32() const
Valeur numérique du device.
bool isAccelerator() const
Indique si l'instance est associée à un accélérateur.
Information sur un device.
Information mémoire d'un accélérateur.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copie les données de from vers to avec la queue queue.
void notifyBeginLaunchKernel(impl::RunCommandImpl &c) override
Notification avant le lancement de la commande.
bool _barrierNoException() override
Barrière sans exception. Retourne true en cas d'erreur.
void barrier() override
Bloque jusqu'à ce que toutes les actions associées à cette file soient terminées.
void notifyEndLaunchKernel(impl::RunCommandImpl &) override
Notification de fin de lancement de la commande.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Effectue un pré-chargement d'une zone mémoire.
void copyMemory(const MemoryCopyArgs &args) override
Effectue une copie entre deux zones mémoire.
Impl::NativeStream nativeStream() override
Pointeur sur la structure interne dépendante de l'implémentation.
void * allocateMemory(size_t size) final
Alloue un bloc pour size octets.
void freeMemory(void *ptr, size_t size) final
Libère le bloc situé à l'adresse address contenant size octets.
Interface d'une liste de devices.
Type opaque pour encapsuler une 'stream' native.
Arguments pour la copie mémoire.
Arguments pour le préfetching mémoire.
Informations sur une adresse mémoire.
Informations pour initialiser le runtime accélérateur.
Informations pour créer une RunQueue.
bool isDefault() const
Indique si l'instance a uniquement les valeurs par défaut.
File d'exécution pour un accélérateur.
bool isAsync() const
Indique si la file d'exécution est asynchrone.
void copyMemory(const MemoryCopyArgs &args) const
Copie des informations entre deux zones mémoires.
Interface d'une liste de devices.
Interface de l'implémentation d'un évènement.
Interface d'un flux d'exécution pour une RunQueue.
Interface du runtime associé à un accélérateur.
Implémentation d'une commande pour accélérateur.
Vue constante sur une zone mémoire contiguë contenant des éléments de taille fixe.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr const std::byte * data() const
Pointeur sur la zone mémoire.
Interface pour les copies mémoire avec support des accélérateurs.
Partie interne à Arcane de 'IMemoryRessourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Positionne l'allocateur pour la ressource r.
virtual void setIsAccelerator(bool v)=0
Indique si un accélérateur est disponible.
virtual void setCopier(IMemoryCopier *copier)=0
Positionne l'instance gérant les copies.
virtual IMemoryResourceMngInternal * _internal()=0
Interface interne.
Interface du gestionnaire de traces.
Vue modifiable sur une zone mémoire contiguë contenant des éléments de taille fixe.
constexpr std::byte * data() const
Pointeur sur la zone mémoire.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr __host__ __device__ pointer data() const noexcept
Pointeur sur le début de la vue.
constexpr __host__ __device__ SizeType size() const noexcept
Retourne la taille du tableau.
Chaîne de caractères unicode.
const char * localstr() const
Retourne la conversion de l'instance dans l'encodage UTF-8.
eMemoryAdvice
Conseils pour la gestion mémoire.
@ AccessedByHost
Indique que la zone mémoire est accédée par l'hôte.
@ PreferredLocationDevice
Privilégie le positionnement de la mémoire sur l'accélérateur.
@ MostlyRead
Indique que la zone mémoire est principalement en lecture seule.
@ PreferredLocationHost
Privilégie le positionnement de la mémoire sur l'hôte.
@ AccessedByDevice
Indique que la zone mémoire est accédée par l'accélérateur.
eExecutionPolicy
Politique d'exécution pour un Runner.
@ HIP
Politique d'exécution utilisant l'environnement HIP.
ARCCORE_COMMON_EXPORT IMemoryRessourceMng * getDataMemoryResourceMng()
Gestionnaire de ressource mémoire pour les données.
ARCCORE_COMMON_EXPORT IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Positionne l'allocateur spécifique pour les accélérateurs.
ARCCORE_COMMON_EXPORT void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Positionne la ressource mémoire utilisée pour l'allocateur mémoire des données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
std::int64_t Int64
Type entier signé sur 64 bits.
eMemoryResource
Liste des ressources mémoire disponibles.
@ HostPinned
Alloue sur l'hôte.
@ UnifiedMemory
Alloue en utilisant la mémoire unifiée.
@ Device
Alloue sur le device.
std::int32_t Int32
Type entier signé sur 32 bits.
Espace de nom de Arccore.