14#include "arccore/accelerator_native/HipAccelerator.h"
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
39#ifdef ARCCORE_HAS_ROCTX
45namespace Arcane::Accelerator::Hip
47using Impl::KernelLaunchArgs;
60 virtual hipError_t _allocate(
void** ptr,
size_t new_size) = 0;
61 virtual hipError_t _deallocate(
void* ptr) = 0;
67template <
typename ConcreteAllocatorType>
68class UnderlyingAllocator
73 UnderlyingAllocator() =
default;
80 ARCCORE_CHECK_HIP(m_concrete_allocator._allocate(&out, size));
85 ARCCORE_CHECK_HIP_NOTHROW(m_concrete_allocator._deallocate(ptr));
88 void doMemoryCopy(
void* destination,
const void* source,
Int64 size)
final
90 ARCCORE_CHECK_HIP(hipMemcpy(destination, source, size, hipMemcpyDefault));
95 return m_concrete_allocator.memoryResource();
100 ConcreteAllocatorType m_concrete_allocator;
111 hipError_t _deallocate(
void* ptr)
final
113 return ::hipFree(ptr);
116 hipError_t _allocate(
void** ptr,
size_t new_size)
final
118 auto r = ::hipMallocManaged(ptr, new_size, hipMemAttachGlobal);
128class UnifiedMemoryHipMemoryAllocator
129:
public AcceleratorMemoryAllocatorBase
133 UnifiedMemoryHipMemoryAllocator()
154 hipError_t _allocate(
void** ptr,
size_t new_size)
final
156 return ::hipHostMalloc(ptr, new_size);
158 hipError_t _deallocate(
void* ptr)
final
160 return ::hipHostFree(ptr);
168class HostPinnedHipMemoryAllocator
169:
public AcceleratorMemoryAllocatorBase
174 HostPinnedHipMemoryAllocator()
190class DeviceConcreteAllocator
195 DeviceConcreteAllocator()
199 hipError_t _allocate(
void** ptr,
size_t new_size)
final
201 hipError_t r = ::hipMalloc(ptr, new_size);
204 hipError_t _deallocate(
void* ptr)
final
206 return ::hipFree(ptr);
215class DeviceHipMemoryAllocator
216:
public AcceleratorMemoryAllocatorBase
221 DeviceHipMemoryAllocator()
247void initializeHipMemoryAllocators()
249 unified_memory_hip_memory_allocator.initialize();
250 device_hip_memory_allocator.initialize();
251 host_pinned_hip_memory_allocator.initialize();
254void finalizeHipMemoryAllocators(
ITraceMng* tm)
256 unified_memory_hip_memory_allocator.finalize(tm);
257 device_hip_memory_allocator.finalize(tm);
258 host_pinned_hip_memory_allocator.finalize(tm);
264class HipRunQueueStream
273 ARCCORE_CHECK_HIP(hipStreamCreate(&m_hip_stream));
275 int priority = bi.priority();
276 ARCCORE_CHECK_HIP(hipStreamCreateWithPriority(&m_hip_stream, hipStreamDefault, priority));
279 ~HipRunQueueStream()
override
281 ARCCORE_CHECK_HIP_NOTHROW(hipStreamDestroy(m_hip_stream));
288#ifdef ARCCORE_HAS_ROCTX
289 auto kname = c.kernelName();
291 roctxRangePush(c.traceInfo().name());
293 roctxRangePush(kname.localstr());
295 return m_runtime->notifyBeginLaunchKernel();
299#ifdef ARCCORE_HAS_ROCTX
302 return m_runtime->notifyEndLaunchKernel();
306 ARCCORE_CHECK_HIP(hipStreamSynchronize(m_hip_stream));
310 return hipStreamSynchronize(m_hip_stream) != hipSuccess;
314 auto r = hipMemcpyAsync(args.destination().
data(), args.source().
data(),
315 args.source().
bytes().
size(), hipMemcpyDefault, m_hip_stream);
316 ARCCORE_CHECK_HIP(r);
322 auto src = args.source().
bytes();
326 int device = hipCpuDeviceId;
329 auto r = hipMemPrefetchAsync(src.data(), src.size(), device, m_hip_stream);
330 ARCCORE_CHECK_HIP(r);
341 hipStream_t trueStream()
const
349 hipStream_t m_hip_stream;
355class HipRunQueueEvent
360 explicit HipRunQueueEvent(
bool has_timer)
363 ARCCORE_CHECK_HIP(hipEventCreate(&m_hip_event));
365 ARCCORE_CHECK_HIP(hipEventCreateWithFlags(&m_hip_event, hipEventDisableTiming));
367 ~HipRunQueueEvent()
override
369 ARCCORE_CHECK_HIP_NOTHROW(hipEventDestroy(m_hip_event));
378 ARCCORE_CHECK_HIP(hipEventRecord(m_hip_event, rq->trueStream()));
383 ARCCORE_CHECK_HIP(hipEventSynchronize(m_hip_event));
389 ARCCORE_CHECK_HIP(hipStreamWaitEvent(rq->trueStream(), m_hip_event, 0));
392 Int64 elapsedTime(IRunQueueEventImpl* from_event)
final
394 auto* true_from_event =
static_cast<HipRunQueueEvent*
>(from_event);
396 float time_in_ms = 0.0;
397 ARCCORE_CHECK_HIP(hipEventElapsedTime(&time_in_ms, true_from_event->m_hip_event, m_hip_event));
398 double x = time_in_ms * 1.0e6;
403 bool hasPendingWork()
final
405 hipError_t v = hipEventQuery(m_hip_event);
406 if (v == hipErrorNotReady)
408 ARCCORE_CHECK_HIP(v);
414 hipEvent_t m_hip_event;
429 void notifyBeginLaunchKernel()
override
431 ++m_nb_kernel_launched;
433 std::cout <<
"BEGIN HIP KERNEL!\n";
435 void notifyEndLaunchKernel()
override
437 ARCCORE_CHECK_HIP(hipGetLastError());
439 std::cout <<
"END HIP KERNEL!\n";
441 void barrier()
override
443 ARCCORE_CHECK_HIP(hipDeviceSynchronize());
463 auto v = buffer.
bytes();
464 const void* ptr = v.
data();
465 size_t count = v.size();
466 int device = device_id.
asInt32();
467 hipMemoryAdvise hip_advise;
470 hip_advise = hipMemAdviseSetReadMostly;
472 hip_advise = hipMemAdviseSetPreferredLocation;
474 hip_advise = hipMemAdviseSetAccessedBy;
476 hip_advise = hipMemAdviseSetPreferredLocation;
477 device = hipCpuDeviceId;
480 hip_advise = hipMemAdviseSetAccessedBy;
481 device = hipCpuDeviceId;
486 ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
490 auto v = buffer.
bytes();
491 const void* ptr = v.
data();
492 size_t count = v.size();
493 int device = device_id.
asInt32();
494 hipMemoryAdvise hip_advise;
497 hip_advise = hipMemAdviseUnsetReadMostly;
499 hip_advise = hipMemAdviseUnsetPreferredLocation;
501 hip_advise = hipMemAdviseUnsetAccessedBy;
503 hip_advise = hipMemAdviseUnsetPreferredLocation;
504 device = hipCpuDeviceId;
507 hip_advise = hipMemAdviseUnsetAccessedBy;
508 device = hipCpuDeviceId;
512 ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
515 void setCurrentDevice(
DeviceId device_id)
final
519 ARCCORE_CHECK_HIP(hipSetDevice(
id));
521 const IDeviceInfoList* deviceInfoList()
override {
return &m_device_info_list; }
523 void getPointerAttribute(
PointerAttribute& attribute,
const void* ptr)
override
525 hipPointerAttribute_t pa;
526 hipError_t ret_value = hipPointerGetAttributes(&pa, ptr);
527 auto mem_type = ePointerMemoryType::Unregistered;
531 if (ret_value == hipSuccess) {
532#if HIP_VERSION_MAJOR >= 6
533 auto rocm_memory_type = pa.type;
535 auto rocm_memory_type = pa.memoryType;
538 mem_type = ePointerMemoryType::Managed;
539 else if (rocm_memory_type == hipMemoryTypeHost)
540 mem_type = ePointerMemoryType::Host;
541 else if (rocm_memory_type == hipMemoryTypeDevice)
542 mem_type = ePointerMemoryType::Device;
549 _fillPointerAttribute(attribute, mem_type, pa.device,
550 ptr, pa.devicePointer, pa.hostPointer);
556 int wanted_d = device_id.
asInt32();
557 ARCCORE_CHECK_HIP(hipGetDevice(&d));
559 ARCCORE_CHECK_HIP(hipSetDevice(wanted_d));
561 size_t total_mem = 0;
562 ARCCORE_CHECK_HIP(hipMemGetInfo(&free_mem, &total_mem));
564 ARCCORE_CHECK_HIP(hipSetDevice(d));
566 dmi.setFreeMemory(free_mem);
567 dmi.setTotalMemory(total_mem);
571 void pushProfilerRange(
const String& name, [[maybe_unused]]
Int32 color)
override
573#ifdef ARCCORE_HAS_ROCTX
577 void popProfilerRange()
override
579#ifdef ARCCORE_HAS_ROCTX
586 finalizeHipMemoryAllocators(tm);
590 const void* kernel_ptr,
591 Int64 total_loop_size)
override
599 int nb_block_per_sm = 0;
600 ARCCORE_CHECK_HIP(hipOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
602 int max_block =
static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
603 max_block = std::max(max_block, 1);
604 if (nb_block > max_block) {
607 return modified_args;
615 void fillDevices(
bool is_verbose);
621 x = std::clamp(x, 10, 100);
622 m_cooperative_ratio = x / 100.0;
628 Int64 m_nb_kernel_launched = 0;
629 bool m_is_verbose =
false;
630 Int32 m_multi_processor_count = 0;
631 double m_cooperative_ratio = 1.0;
638void HipRunnerRuntime::
639fillDevices(
bool is_verbose)
642 ARCCORE_CHECK_HIP(hipGetDeviceCount(&nb_device));
643 std::ostream& omain = std::cout;
645 omain <<
"ArcaneHIP: Initialize Arcane HIP runtime nb_available_device=" << nb_device <<
"\n";
646 for (
int i = 0; i < nb_device; ++i) {
647 std::ostringstream ostr;
648 std::ostream& o = ostr;
651 ARCCORE_CHECK_HIP(hipGetDeviceProperties(&dp, i));
653 int has_managed_memory = 0;
654 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i));
659 int runtime_version = 0;
660 ARCCORE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version));
662 int runtime_major = runtime_version / 10000000;
663 int runtime_minor = (runtime_version / 100000) % 100;
665 int driver_version = 0;
666 ARCCORE_CHECK_HIP(hipDriverGetVersion(&driver_version));
668 int driver_major = driver_version / 10000000;
669 int driver_minor = (driver_version / 100000) % 100;
671 o <<
"\nDevice " << i <<
" name=" << dp.name <<
"\n";
672 o <<
" Driver version = " << driver_major <<
"." << (driver_minor) <<
"." << (driver_version % 100000) <<
"\n";
673 o <<
" Runtime version = " << runtime_major <<
"." << (runtime_minor) <<
"." << (runtime_version % 100000) <<
"\n";
674 o <<
" computeCapability = " << dp.major <<
"." << dp.minor <<
"\n";
675 o <<
" totalGlobalMem = " << dp.totalGlobalMem <<
"\n";
676 o <<
" regsPerBlock = " << dp.regsPerBlock <<
"\n";
677 o <<
" warpSize = " << dp.warpSize <<
"\n";
678 o <<
" memPitch = " << dp.memPitch <<
"\n";
679 o <<
" maxThreadsPerBlock = " << dp.maxThreadsPerBlock <<
"\n";
680 o <<
" maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor <<
"\n";
681 o <<
" maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor <<
"\n";
682 o <<
" totalConstMem = " << dp.totalConstMem <<
"\n";
683 o <<
" clockRate = " << dp.clockRate <<
"\n";
685 o <<
" multiProcessorCount = " << dp.multiProcessorCount <<
"\n";
686 o <<
" kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled <<
"\n";
687 o <<
" integrated = " << dp.integrated <<
"\n";
688 o <<
" canMapHostMemory = " << dp.canMapHostMemory <<
"\n";
689 o <<
" computeMode = " << dp.computeMode <<
"\n";
690 o <<
" maxThreadsDim = " << dp.maxThreadsDim[0] <<
" " << dp.maxThreadsDim[1]
691 <<
" " << dp.maxThreadsDim[2] <<
"\n";
692 o <<
" maxGridSize = " << dp.maxGridSize[0] <<
" " << dp.maxGridSize[1]
693 <<
" " << dp.maxGridSize[2] <<
"\n";
694 o <<
" concurrentManagedAccess = " << dp.concurrentManagedAccess <<
"\n";
695 o <<
" directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost <<
"\n";
696 o <<
" gcnArchName = " << dp.gcnArchName <<
"\n";
697 o <<
" pageableMemoryAccess = " << dp.pageableMemoryAccess <<
"\n";
698 o <<
" pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables <<
"\n";
699 o <<
" hasManagedMemory = " << has_managed_memory <<
"\n";
700 o <<
" pciInfo = " << dp.pciDomainID <<
" " << dp.pciBusID <<
" " << dp.pciDeviceID <<
"\n";
701 o <<
" memoryBusWitdh = " << dp.memoryBusWidth <<
" bits\n";
704 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeClockRate, i));
705 o <<
" clockRate = " << (clock_rate / 1000) <<
" MHz\n";
707 int memory_clock_rate = 0;
708 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&memory_clock_rate, hipDeviceAttributeMemoryClockRate, i));
709 o <<
" memoryClockRate = " << (memory_clock_rate / 1000) <<
" MHz\n";
714 Real memory_bandwith = (dp.memoryBusWidth * memory_clock_rate * 2.0) / 1.0e6;
715 o <<
" MemoryBandwith = " << memory_bandwith <<
" GB/s\n";
717#if HIP_VERSION_MAJOR >= 6
718 o <<
" sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor <<
"\n";
719 o <<
" sharedMemPerBlock = " << dp.sharedMemPerBlock <<
"\n";
720 o <<
" sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin <<
"\n";
721 o <<
" gpuDirectRDMASupported = " << dp.gpuDirectRDMASupported <<
"\n";
722 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
723 o <<
" unifiedFunctionPointers = " << dp.unifiedFunctionPointers <<
"\n";
729 m_multi_processor_count = dp.multiProcessorCount;
731 std::ostringstream device_uuid_ostr;
734 ARCCORE_CHECK_HIP(hipDeviceGet(&device, i));
736 ARCCORE_CHECK_HIP(hipDeviceGetUuid(&device_uuid, device));
738 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
739 o << device_uuid_ostr.str();
743 String description(ostr.str());
745 omain << description;
748 device_info.setDescription(description);
749 device_info.setDeviceId(
DeviceId(i));
750 device_info.setName(dp.name);
751 device_info.setWarpSize(dp.warpSize);
752 device_info.setUUIDAsString(device_uuid_ostr.str());
753 device_info.setSharedMemoryPerBlock(
static_cast<Int32>(dp.sharedMemPerBlock));
754#if HIP_VERSION_MAJOR >= 6
755 device_info.setSharedMemoryPerMultiprocessor(
static_cast<Int32>(dp.sharedMemPerMultiprocessor));
756 device_info.setSharedMemoryPerBlockOptin(
static_cast<Int32>(dp.sharedMemPerBlockOptin));
758 device_info.setTotalConstMemory(
static_cast<Int32>(dp.totalConstMem));
759 device_info.setPCIDomainID(dp.pciDomainID);
760 device_info.setPCIBusID(dp.pciBusID);
761 device_info.setPCIDeviceID(dp.pciDeviceID);
762 m_device_info_list.addDevice(device_info);
783 ARCCORE_CHECK_HIP(hipMemcpy(to.
data(), from.
data(), from.
bytes().
size(), hipMemcpyDefault));
813extern "C" ARCCORE_EXPORT
void
814arcaneRegisterAcceleratorRuntimehip(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
816 using namespace Arcane::Accelerator::Hip;
817 global_hip_runtime.build();
818 Arcane::Accelerator::Impl::setUsingHIPRuntime(
true);
819 Arcane::Accelerator::Impl::setHIPRunQueueRuntime(&global_hip_runtime);
820 initializeHipMemoryAllocators();
825 _setAllocator(&unified_memory_hip_memory_allocator);
826 _setAllocator(&host_pinned_hip_memory_allocator);
827 _setAllocator(&device_hip_memory_allocator);
828 mrm->
setCopier(&global_hip_memory_copier);
829 global_hip_runtime.fillDevices(init_info.isVerbose());
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
#define ARCCORE_FATAL_IF(cond,...)
Macro throwing a FatalErrorException if cond is true.
Base class of a specific allocator for accelerator.
eMemoryResource memoryResource() const final
Memory resource provided by the allocator.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialization for Device memory.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialization for HostPinned memory.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialization for UVM memory.
Identifier of a system component.
bool isHost() const
Indicates if the instance is associated with the host.
Int32 asInt32() const
Numerical value of the device.
bool isAccelerator() const
Indicates if the instance is associated with an accelerator.
Information about an accelerator.
Accelerator memory information.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copies the data from from to to with the queue queue.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification before command launch.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification of command launch completion.
bool _barrierNoException() override
Barrier without exception. Returns true in case of error.
void barrier() override
Blocks until all actions associated with this queue are finished.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Performs a prefetch of a memory region.
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory regions.
Impl::NativeStream nativeStream() override
Pointer to the internal structure dependent on the implementation.
void freeMemory(void *ptr, Int64 size) final
Frees the block located at address address containing size bytes.
void * allocateMemory(Int64 size) final
Allocates a block for size bytes.
Interface of a list of devices.
Interface for a list of devices.
Interface for event implementation.
Interface of an execution stream for a RunQueue.
Interface of the runtime associated with an accelerator.
Arguments for launching a kernel.
bool isCooperative() const
Indicates if running in cooperative mode (i.e. cudaLaunchCooperativeKernel).
Int32 nbBlockPerGrid() const
Number of grid blocks.
void setNbBlockPerGrid(Int32 v)
Number of grid blocks.
Int32 nbThreadPerBlock() const
Number of threads per block.
Int32 sharedMemorySize() const
Shared memory to allocate for the kernel.
Opaque type to encapsulate a native 'stream'.
Implementation of a command for accelerator.
Memory prefetching arguments.
Information about a memory address.
Information to create a RunQueue.
bool isDefault() const
Indicates if the instance only has default values.
Execution queue for an accelerator.
bool isAsync() const
Indicates if the execution queue is asynchronous.
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory regions.
Constant view on a contiguous memory region containing fixed-size elements.
constexpr SpanType bytes() const
View in byte form.
constexpr const std::byte * data() const
Pointer to the memory region.
Template class for converting a type.
Interface for memory copies with accelerator support.
Internal part of Arcane's 'IMemoryResourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for resource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Sets the memory pool for resource r.
virtual void setIsAccelerator(bool v)=0
Indicates if an accelerator is available.
virtual void setCopier(IMemoryCopier *copier)=0
Sets the copying instance.
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.
Mutable view on a contiguous memory region containing fixed-size elements.
constexpr std::byte * data() const
Pointer to the memory region.
constexpr SpanType bytes() const
View in byte form.
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Unicode character string.
const char * localstr() const
Returns the conversion of the instance into UTF-8 encoding.
eMemoryAdvice
Memory management advice.
@ AccessedByHost
Indicates that the memory region is accessed by the host.
@ PreferredLocationDevice
Prefers memory placement on the accelerator.
@ MostlyRead
Indicates that the memory region is primarily read-only.
@ PreferredLocationHost
Prefers memory placement on the host.
@ AccessedByDevice
Indicates that the memory region is accessed by the device.
eExecutionPolicy
Execution policy for a Runner.
@ HIP
Execution policy using the HIP environment.
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the specific allocator for accelerators.
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
double Real
Type representing a real number.
eMemoryResource
List of available memory resources.
@ HostPinned
Allocates on the host.
@ UnifiedMemory
Allocates using unified memory.
@ Device
Allocates on the device.
std::int32_t Int32
Signed integer type of 32 bits.