#include "arccore/accelerator_native/HipAccelerator.h"

#include "arccore/base/FatalErrorException.h"

#include "arccore/common/internal/MemoryUtilsInternal.h"
#include "arccore/common/internal/IMemoryResourceMngInternal.h"

#include "arccore/common/accelerator/RunQueueBuildInfo.h"
#include "arccore/common/accelerator/Memory.h"
#include "arccore/common/accelerator/DeviceInfoList.h"
#include "arccore/common/accelerator/KernelLaunchArgs.h"
#include "arccore/common/accelerator/RunQueue.h"
#include "arccore/common/accelerator/DeviceMemoryInfo.h"
#include "arccore/common/accelerator/NativeStream.h"
#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
#include "arccore/common/accelerator/internal/RunCommandImpl.h"
#include "arccore/common/accelerator/internal/IRunQueueStream.h"
#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
#ifdef ARCCORE_HAS_ROCTX
// roctx range markers for profiling (header path assumed)
#include <roctracer/roctx.h>
#endif
namespace Arcane::Accelerator::Hip
{
using Impl::KernelLaunchArgs;
// ... (enclosing base-class declaration elided)

  virtual hipError_t _allocate(void** ptr, size_t new_size) = 0;
  virtual hipError_t _deallocate(void* ptr) = 0;

// ...
template <typename ConcreteAllocatorType>
class UnderlyingAllocator
// ...
{
 public:

  UnderlyingAllocator() = default;

 public:

  void* allocateMemory(size_t size) final
  {
    void* out = nullptr;
    ARCCORE_CHECK_HIP(m_concrete_allocator._allocate(&out, size));
    return out;
  }

  void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
  {
    // Deallocation must not throw: report errors without raising exceptions.
    ARCCORE_CHECK_HIP_NOTHROW(m_concrete_allocator._deallocate(ptr));
  }

  void doMemoryCopy(void* destination, const void* source, Int64 size) final
  {
    ARCCORE_CHECK_HIP(hipMemcpy(destination, source, size, hipMemcpyDefault));
  }

  eMemoryResource memoryResource() const final
  {
    return m_concrete_allocator.memoryResource();
  }

 public:

  ConcreteAllocatorType m_concrete_allocator;
};
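
// Illustration only (hypothetical class, not used by the runtime): a minimal
// concrete allocator showing the interface UnderlyingAllocator expects from
// its ConcreteAllocatorType parameter.
class ExampleConcreteAllocator
{
 public:

  hipError_t _allocate(void** ptr, size_t new_size)
  {
    return ::hipMalloc(ptr, new_size);
  }
  hipError_t _deallocate(void* ptr)
  {
    return ::hipFree(ptr);
  }
  eMemoryResource memoryResource() const
  {
    return eMemoryResource::Device;
  }
};
// A matching adaptor would then be UnderlyingAllocator<ExampleConcreteAllocator>.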
// assumption: class name reconstructed by analogy with DeviceConcreteAllocator
class UnifiedMemoryConcreteAllocator
// ...
{
 public:

  hipError_t _deallocate(void* ptr) final
  {
    return ::hipFree(ptr);
  }

  hipError_t _allocate(void** ptr, size_t new_size) final
  {
    auto r = ::hipMallocManaged(ptr, new_size, hipMemAttachGlobal);
    // ...
    return r;
  }

  // ...
};
class UnifiedMemoryHipMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  UnifiedMemoryHipMemoryAllocator()
  // ...

  // assumption: initialize() forwards to the UVM initialization of the base class
  void initialize() { _doInitializeUVM(); }

  // ...
};
// assumption: class name reconstructed by analogy with DeviceConcreteAllocator
class HostPinnedConcreteAllocator
// ...
{
 public:

  hipError_t _allocate(void** ptr, size_t new_size) final
  {
    return ::hipHostMalloc(ptr, new_size);
  }

  hipError_t _deallocate(void* ptr) final
  {
    return ::hipHostFree(ptr);
  }

  // ...
};
class HostPinnedHipMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  HostPinnedHipMemoryAllocator()
  // ...

  // assumption: initialize() forwards to the HostPinned initialization of the base class
  void initialize() { _doInitializeHostPinned(); }

  // ...
};
class DeviceConcreteAllocator
// ...
{
 public:

  DeviceConcreteAllocator()
  // ...

  hipError_t _allocate(void** ptr, size_t new_size) final
  {
    hipError_t r = ::hipMalloc(ptr, new_size);
    // ...
    return r;
  }

  hipError_t _deallocate(void* ptr) final
  {
    return ::hipFree(ptr);
  }

  // ...
};
class DeviceHipMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  DeviceHipMemoryAllocator()
  // ...

  // assumption: initialize() forwards to the Device initialization of the base class
  void initialize() { _doInitializeDevice(); }

  // ...
};
// File-scope allocator instances shared by the runtime (declarations
// reconstructed from their uses below).
UnifiedMemoryHipMemoryAllocator unified_memory_hip_memory_allocator;
HostPinnedHipMemoryAllocator host_pinned_hip_memory_allocator;
DeviceHipMemoryAllocator device_hip_memory_allocator;

void initializeHipMemoryAllocators()
{
  unified_memory_hip_memory_allocator.initialize();
  device_hip_memory_allocator.initialize();
  host_pinned_hip_memory_allocator.initialize();
}

void finalizeHipMemoryAllocators(ITraceMng* tm)
{
  unified_memory_hip_memory_allocator.finalize(tm);
  device_hip_memory_allocator.finalize(tm);
  host_pinned_hip_memory_allocator.finalize(tm);
}
class HipRunQueueStream
: public Impl::IRunQueueStream
{
 public:

  // assumption: constructor parameters reconstructed from their uses below
  HipRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCCORE_CHECK_HIP(hipStreamCreate(&m_hip_stream));
    else {
      int priority = bi.priority();
      ARCCORE_CHECK_HIP(hipStreamCreateWithPriority(&m_hip_stream, hipStreamDefault, priority));
    }
  }

  ~HipRunQueueStream() override
  {
    ARCCORE_CHECK_HIP_NOTHROW(hipStreamDestroy(m_hip_stream));
  }

 public:

  void notifyBeginLaunchKernel(Impl::RunCommandImpl& c) override
  {
#ifdef ARCCORE_HAS_ROCTX
    auto kname = c.kernelName();
    if (kname.null())
      roctxRangePush(c.traceInfo().name());
    else
      roctxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }

  void notifyEndLaunchKernel(Impl::RunCommandImpl&) override
  {
#ifdef ARCCORE_HAS_ROCTX
    roctxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }

  void barrier() override
  {
    ARCCORE_CHECK_HIP(hipStreamSynchronize(m_hip_stream));
  }

  bool _barrierNoException() override
  {
    return hipStreamSynchronize(m_hip_stream) != hipSuccess;
  }

  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto r = hipMemcpyAsync(args.destination().data(), args.source().data(),
                            args.source().bytes().size(), hipMemcpyDefault, m_hip_stream);
    ARCCORE_CHECK_HIP(r);
    // Synchronous copy requested: wait for completion immediately.
    if (!args.isAsync())
      barrier();
  }

  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    // assumption: prefetch targets the CPU unless an accelerator device is requested
    DeviceId d = args.deviceId();
    int device = hipCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    auto r = hipMemPrefetchAsync(src.data(), src.size(), device, m_hip_stream);
    ARCCORE_CHECK_HIP(r);
    if (!args.isAsync())
      barrier();
  }

  Impl::NativeStream nativeStream() override
  {
    // ...
  }

 public:

  hipStream_t trueStream() const
  {
    return m_hip_stream;
  }

 private:

  Impl::IRunnerRuntime* m_runtime = nullptr;
  hipStream_t m_hip_stream;
};
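
// Note: copyMemory()/prefetchMemory() above enqueue asynchronous operations on
// m_hip_stream; a synchronous request simply appends a stream synchronization,
// i.e. it behaves like:
//
//   hipMemcpyAsync(dst, src, n, hipMemcpyDefault, stream);
//   hipStreamSynchronize(stream); // only when the request is not async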
class HipRunQueueEvent
: public Impl::IRunQueueEventImpl
{
 public:

  explicit HipRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCCORE_CHECK_HIP(hipEventCreate(&m_hip_event));
    else
      ARCCORE_CHECK_HIP(hipEventCreateWithFlags(&m_hip_event, hipEventDisableTiming));
  }

  ~HipRunQueueEvent() override
  {
    ARCCORE_CHECK_HIP_NOTHROW(hipEventDestroy(m_hip_event));
  }

 public:

  // assumption: method names reconstructed from the IRunQueueEventImpl interface
  void recordQueue(Impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<HipRunQueueStream*>(stream);
    ARCCORE_CHECK_HIP(hipEventRecord(m_hip_event, rq->trueStream()));
  }

  void wait() final
  {
    ARCCORE_CHECK_HIP(hipEventSynchronize(m_hip_event));
  }

  void waitForEvent(Impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<HipRunQueueStream*>(stream);
    ARCCORE_CHECK_HIP(hipStreamWaitEvent(rq->trueStream(), m_hip_event, 0));
  }

  Int64 elapsedTime(IRunQueueEventImpl* from_event) final
  {
    auto* true_from_event = static_cast<HipRunQueueEvent*>(from_event);
    float time_in_ms = 0.0;
    ARCCORE_CHECK_HIP(hipEventElapsedTime(&time_in_ms, true_from_event->m_hip_event, m_hip_event));
    // hipEventElapsedTime() reports milliseconds; convert to nanoseconds.
    double x = time_in_ms * 1.0e6;
    return static_cast<Int64>(x);
  }

  bool hasPendingWork() final
  {
    hipError_t v = hipEventQuery(m_hip_event);
    if (v == hipErrorNotReady)
      return true;
    ARCCORE_CHECK_HIP(v);
    return false;
  }

 private:

  hipEvent_t m_hip_event;
};
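
// Timing sketch (illustration only): how a pair of these events measures
// elapsed device time. hipEventElapsedTime() reports milliseconds, which
// elapsedTime() above converts to nanoseconds (1 ms = 1.0e6 ns).
//
//   hipEventRecord(start_event, stream);
//   // ... enqueue kernels ...
//   hipEventRecord(stop_event, stream);
//   hipEventSynchronize(stop_event);
//   float ms = 0.0f;
//   hipEventElapsedTime(&ms, start_event, stop_event);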
class HipRunnerRuntime
: public Impl::IRunnerRuntime
{
 public:

  // ...

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN HIP KERNEL!\n";
  }

  void notifyEndLaunchKernel() override
  {
    // Check for errors reported by the latest kernel launch.
    ARCCORE_CHECK_HIP(hipGetLastError());
    if (m_is_verbose)
      std::cout << "END HIP KERNEL!\n";
  }

  void barrier() override
  {
    ARCCORE_CHECK_HIP(hipDeviceSynchronize());
  }
  // assumption: reconstructed from the eExecutionPolicy::HIP reference in this file
  eExecutionPolicy executionPolicy() const override { return eExecutionPolicy::HIP; }

  // ...

  // assumption: signature reconstructed from IRunnerRuntime
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    hipMemoryAdvise hip_advise;
    if (advice == eMemoryAdvice::MostlyRead)
      hip_advise = hipMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      hip_advise = hipMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      hip_advise = hipMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      hip_advise = hipMemAdviseSetPreferredLocation;
      device = hipCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      hip_advise = hipMemAdviseSetAccessedBy;
      device = hipCpuDeviceId;
    }
    else
      return;
    ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
  }
  // assumption: signature reconstructed from IRunnerRuntime
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    hipMemoryAdvise hip_advise;
    if (advice == eMemoryAdvice::MostlyRead)
      hip_advise = hipMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      hip_advise = hipMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      hip_advise = hipMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      hip_advise = hipMemAdviseUnsetPreferredLocation;
      device = hipCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      hip_advise = hipMemAdviseUnsetAccessedBy;
      device = hipCpuDeviceId;
    }
    else
      return;
    ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
  }
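
  // Summary of the advice mapping implemented above (the 'unset' variant is
  // symmetric):
  //
  //   MostlyRead              -> hipMemAdviseSetReadMostly,        target device
  //   PreferredLocationDevice -> hipMemAdviseSetPreferredLocation, target device
  //   AccessedByDevice        -> hipMemAdviseSetAccessedBy,        target device
  //   PreferredLocationHost   -> hipMemAdviseSetPreferredLocation, hipCpuDeviceId
  //   AccessedByHost          -> hipMemAdviseSetAccessedBy,        hipCpuDeviceId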
  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (device_id.isAccelerator())
      ARCCORE_CHECK_HIP(hipSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() override { return &m_device_info_list; }
  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    hipPointerAttribute_t pa;
    hipError_t ret_value = hipPointerGetAttributes(&pa, ptr);
    auto mem_type = ePointerMemoryType::Unregistered;
    // hipPointerGetAttributes() fails for unregistered host pointers; keep
    // 'Unregistered' in that case instead of raising an error.
    if (ret_value == hipSuccess) {
#if HIP_VERSION_MAJOR >= 6
      auto rocm_memory_type = pa.type;
#else
      auto rocm_memory_type = pa.memoryType;
#endif
      // assumption: managed memory is reported as hipMemoryTypeManaged
      if (rocm_memory_type == hipMemoryTypeManaged)
        mem_type = ePointerMemoryType::Managed;
      else if (rocm_memory_type == hipMemoryTypeHost)
        mem_type = ePointerMemoryType::Host;
      else if (rocm_memory_type == hipMemoryTypeDevice)
        mem_type = ePointerMemoryType::Device;
    }
    _fillPointerAttribute(attribute, mem_type, pa.device,
                          ptr, pa.devicePointer, pa.hostPointer);
  }
  // assumption: signature reconstructed from IRunnerRuntime
  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCCORE_CHECK_HIP(hipGetDevice(&d));
    if (d != wanted_d)
      ARCCORE_CHECK_HIP(hipSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCCORE_CHECK_HIP(hipMemGetInfo(&free_mem, &total_mem));
    // Restore the previously active device.
    if (d != wanted_d)
      ARCCORE_CHECK_HIP(hipSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }
  void pushProfilerRange(const String& name, [[maybe_unused]] Int32 color) override
  {
#ifdef ARCCORE_HAS_ROCTX
    roctxRangePush(name.localstr());
#endif
  }

  void popProfilerRange() override
  {
#ifdef ARCCORE_HAS_ROCTX
    roctxRangePop();
#endif
  }
  // assumption: wrapper reconstructed; releases the allocators at shutdown
  void finalize(ITraceMng* tm) override
  {
    finalizeHipMemoryAllocators(tm);
  }
  // assumption: method name and first parameter elided in the listing; the
  // body is reconstructed from the KernelLaunchArgs accessors referenced here.
  KernelLaunchArgs /*...*/(const KernelLaunchArgs& args,
                           const void* kernel_ptr,
                           Int64 total_loop_size) override
  {
    KernelLaunchArgs modified_args = args;
    int nb_thread = args.nbThreadPerBlock();
    int nb_block = args.nbBlockPerGrid();
    int shared_memory = args.sharedMemorySize();
    // ...
    int nb_block_per_sm = 0;
    ARCCORE_CHECK_HIP(hipOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
    // Do not request more blocks than can be resident on the device at once.
    int max_block = nb_block_per_sm * m_multi_processor_count;
    if (nb_block > max_block) {
      modified_args.setNbBlockPerGrid(max_block);
      // ...
    }
    return modified_args;
  }
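
  // Worked example (illustration): if hipOccupancyMaxActiveBlocksPerMultiprocessor()
  // reports 4 resident blocks per multiprocessor on a 120-multiprocessor device,
  // max_block = 4 * 120 = 480, so a requested grid of, say, 10000 blocks is
  // reduced to 480.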
 public:

  void fillDevices(bool is_verbose);

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  Int32 m_multi_processor_count = 0;
  // assumption: member reconstructed from its uses in deviceInfoList() and fillDevices()
  Impl::DeviceInfoList m_device_info_list;
};
void HipRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCCORE_CHECK_HIP(hipGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneHIP: Initialize Arcane HIP runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    std::ostringstream ostr;
    std::ostream& o = ostr;
    hipDeviceProp_t dp;
    ARCCORE_CHECK_HIP(hipGetDeviceProperties(&dp, i));

    int has_managed_memory = 0;
    ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i));

    // HIP encodes a version as 10000000 * major + 100000 * minor + patch.
    int runtime_version = 0;
    ARCCORE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version));
    int runtime_major = runtime_version / 10000000;
    int runtime_minor = (runtime_version / 100000) % 100;

    int driver_version = 0;
    ARCCORE_CHECK_HIP(hipDriverGetVersion(&driver_version));
    int driver_major = driver_version / 10000000;
    int driver_minor = (driver_version / 100000) % 100;
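
    // Worked example (illustration): a version value of 60140092 decodes as
    // major = 60140092 / 10000000 = 6, minor = (60140092 / 100000) % 100 = 1,
    // patch = 60140092 % 100000 = 40092, i.e. version 6.1.40092.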
    o << "\nDevice " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << driver_major << "." << driver_minor << "." << (driver_version % 100000) << "\n";
    o << " Runtime version = " << runtime_major << "." << runtime_minor << "." << (runtime_version % 100000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " clockRate = " << dp.clockRate << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " gcnArchName = " << dp.gcnArchName << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hasManagedMemory = " << has_managed_memory << "\n";
    o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
#if HIP_VERSION_MAJOR >= 6
    o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
    o << " gpuDirectRDMASupported = " << dp.gpuDirectRDMASupported << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " unifiedFunctionPointers = " << dp.unifiedFunctionPointers << "\n";
#endif
    // Keep the multiprocessor count for occupancy-based grid sizing.
    m_multi_processor_count = dp.multiProcessorCount;

    std::ostringstream device_uuid_ostr;
    hipDevice_t device;
    ARCCORE_CHECK_HIP(hipDeviceGet(&device, i));
    hipUUID device_uuid;
    ARCCORE_CHECK_HIP(hipDeviceGetUuid(&device_uuid, device));
    Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
    o << device_uuid_ostr.str();

    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    device_info.setUUIDAsString(device_uuid_ostr.str());
    device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
#if HIP_VERSION_MAJOR >= 6
    device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
    device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
#endif
    device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
    device_info.setPCIDomainID(dp.pciDomainID);
    device_info.setPCIBusID(dp.pciBusID);
    device_info.setPCIDeviceID(dp.pciDeviceID);
    m_device_info_list.addDevice(device_info);
  }
}
// assumption: class name reconstructed; the copy signature is that of IMemoryCopier
class HipMemoryCopier
: public IMemoryCopier
{
 public:

  void copy(ConstMemoryView from, eMemoryResource from_mem,
            MutableMemoryView to, eMemoryResource to_mem,
            const RunQueue* queue) override
  {
    // ...
    // hipMemcpyDefault lets HIP infer the direction, so the same call handles
    // every (from_mem, to_mem) combination.
    ARCCORE_CHECK_HIP(hipMemcpy(to.data(), from.data(), from.bytes().size(), hipMemcpyDefault));
  }
};

// File-scope instances used by the registration entry point below.
HipRunnerRuntime global_hip_runtime;
HipMemoryCopier global_hip_memory_copier;

} // namespace Arcane::Accelerator::Hip
782extern "C" ARCCORE_EXPORT
void
783arcaneRegisterAcceleratorRuntimehip(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
785 using namespace Arcane::Accelerator::Hip;
786 Arcane::Accelerator::Impl::setUsingHIPRuntime(
true);
787 Arcane::Accelerator::Impl::setHIPRunQueueRuntime(&global_hip_runtime);
788 initializeHipMemoryAllocators();
793 _setAllocator(&unified_memory_hip_memory_allocator);
794 _setAllocator(&host_pinned_hip_memory_allocator);
795 _setAllocator(&device_hip_memory_allocator);
796 mrm->
setCopier(&global_hip_memory_copier);
797 global_hip_runtime.fillDevices(init_info.isVerbose());
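
// Usage sketch (illustration only): once this runtime is registered, HIP
// execution is reached through the portable accelerator API, e.g.:
//
//   Arcane::Accelerator::Runner runner(Arcane::Accelerator::eExecutionPolicy::HIP);
//   auto queue = makeQueue(runner);
//   queue.barrier();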