Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
HipAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* HipAcceleratorRuntime.cc (C) 2000-2026 */
9/* */
10/* Runtime for 'HIP'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/accelerator_native/HipAccelerator.h"
15
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
18
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
21
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
35
36#include <sstream>
37#include <algorithm>
38
39#ifdef ARCCORE_HAS_ROCTX
40#include <roctx.h>
41#endif
42
43using namespace Arccore;
44
45namespace Arcane::Accelerator::Hip
46{
47using Impl::KernelLaunchArgs;
48
49/*---------------------------------------------------------------------------*/
50/*---------------------------------------------------------------------------*/
51
53{
54 public:
55
56 virtual ~ConcreteAllocator() = default;
57
58 public:
59
60 virtual hipError_t _allocate(void** ptr, size_t new_size) = 0;
61 virtual hipError_t _deallocate(void* ptr) = 0;
62};
63
64/*---------------------------------------------------------------------------*/
65/*---------------------------------------------------------------------------*/
66
67template <typename ConcreteAllocatorType>
68class UnderlyingAllocator
70{
71 public:
72
73 UnderlyingAllocator() = default;
74
75 public:
76
77 void* allocateMemory(Int64 size) final
78 {
79 void* out = nullptr;
80 ARCCORE_CHECK_HIP(m_concrete_allocator._allocate(&out, size));
81 return out;
82 }
83 void freeMemory(void* ptr, [[maybe_unused]] Int64 size) final
84 {
85 ARCCORE_CHECK_HIP_NOTHROW(m_concrete_allocator._deallocate(ptr));
86 }
87
88 void doMemoryCopy(void* destination, const void* source, Int64 size) final
89 {
90 ARCCORE_CHECK_HIP(hipMemcpy(destination, source, size, hipMemcpyDefault));
91 }
92
93 eMemoryResource memoryResource() const final
94 {
95 return m_concrete_allocator.memoryResource();
96 }
97
98 public:
99
100 ConcreteAllocatorType m_concrete_allocator;
101};
102
103/*---------------------------------------------------------------------------*/
104/*---------------------------------------------------------------------------*/
105
107: public ConcreteAllocator
108{
109 public:
110
111 hipError_t _deallocate(void* ptr) final
112 {
113 return ::hipFree(ptr);
114 }
115
116 hipError_t _allocate(void** ptr, size_t new_size) final
117 {
118 auto r = ::hipMallocManaged(ptr, new_size, hipMemAttachGlobal);
119 return r;
120 }
121
122 constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }
123};
124
125/*---------------------------------------------------------------------------*/
126/*---------------------------------------------------------------------------*/
127
128class UnifiedMemoryHipMemoryAllocator
129: public AcceleratorMemoryAllocatorBase
130{
131 public:
132
133 UnifiedMemoryHipMemoryAllocator()
134 : AcceleratorMemoryAllocatorBase("UnifiedMemoryHipMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
135 {
136 }
137
138 public:
139
140 void initialize()
141 {
142 _doInitializeUVM(true);
143 }
144};
145
146/*---------------------------------------------------------------------------*/
147/*---------------------------------------------------------------------------*/
148
150: public ConcreteAllocator
151{
152 public:
153
154 hipError_t _allocate(void** ptr, size_t new_size) final
155 {
156 return ::hipHostMalloc(ptr, new_size);
157 }
158 hipError_t _deallocate(void* ptr) final
159 {
160 return ::hipHostFree(ptr);
161 }
162 constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
163};
164
165/*---------------------------------------------------------------------------*/
166/*---------------------------------------------------------------------------*/
167
168class HostPinnedHipMemoryAllocator
169: public AcceleratorMemoryAllocatorBase
170{
171 public:
172 public:
173
174 HostPinnedHipMemoryAllocator()
175 : AcceleratorMemoryAllocatorBase("HostPinnedHipMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
176 {
177 }
178
179 public:
180
181 void initialize()
182 {
184 }
185};
186
187/*---------------------------------------------------------------------------*/
188/*---------------------------------------------------------------------------*/
189
190class DeviceConcreteAllocator
191: public ConcreteAllocator
192{
193 public:
194
195 DeviceConcreteAllocator()
196 {
197 }
198
199 hipError_t _allocate(void** ptr, size_t new_size) final
200 {
201 hipError_t r = ::hipMalloc(ptr, new_size);
202 return r;
203 }
204 hipError_t _deallocate(void* ptr) final
205 {
206 return ::hipFree(ptr);
207 }
208
209 constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }
210};
211
212/*---------------------------------------------------------------------------*/
213/*---------------------------------------------------------------------------*/
214
215class DeviceHipMemoryAllocator
216: public AcceleratorMemoryAllocatorBase
217{
218
219 public:
220
221 DeviceHipMemoryAllocator()
222 : AcceleratorMemoryAllocatorBase("DeviceHipMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
223 {
224 }
225
226 public:
227
228 void initialize()
229 {
231 }
232};
233
234/*---------------------------------------------------------------------------*/
235/*---------------------------------------------------------------------------*/
236
237namespace
238{
239 UnifiedMemoryHipMemoryAllocator unified_memory_hip_memory_allocator;
240 HostPinnedHipMemoryAllocator host_pinned_hip_memory_allocator;
241 DeviceHipMemoryAllocator device_hip_memory_allocator;
242} // namespace
243
244/*---------------------------------------------------------------------------*/
245/*---------------------------------------------------------------------------*/
246
247void initializeHipMemoryAllocators()
248{
249 unified_memory_hip_memory_allocator.initialize();
250 device_hip_memory_allocator.initialize();
251 host_pinned_hip_memory_allocator.initialize();
252}
253
254void finalizeHipMemoryAllocators(ITraceMng* tm)
255{
256 unified_memory_hip_memory_allocator.finalize(tm);
257 device_hip_memory_allocator.finalize(tm);
258 host_pinned_hip_memory_allocator.finalize(tm);
259}
260
261/*---------------------------------------------------------------------------*/
262/*---------------------------------------------------------------------------*/
263
264class HipRunQueueStream
266{
267 public:
268
269 HipRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
270 : m_runtime(runtime)
271 {
272 if (bi.isDefault())
273 ARCCORE_CHECK_HIP(hipStreamCreate(&m_hip_stream));
274 else {
275 int priority = bi.priority();
276 ARCCORE_CHECK_HIP(hipStreamCreateWithPriority(&m_hip_stream, hipStreamDefault, priority));
277 }
278 }
279 ~HipRunQueueStream() override
280 {
281 ARCCORE_CHECK_HIP_NOTHROW(hipStreamDestroy(m_hip_stream));
282 }
283
284 public:
285
286 void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override
287 {
288#ifdef ARCCORE_HAS_ROCTX
289 auto kname = c.kernelName();
290 if (kname.empty())
291 roctxRangePush(c.traceInfo().name());
292 else
293 roctxRangePush(kname.localstr());
294#endif
295 return m_runtime->notifyBeginLaunchKernel();
296 }
298 {
299#ifdef ARCCORE_HAS_ROCTX
300 roctxRangePop();
301#endif
302 return m_runtime->notifyEndLaunchKernel();
303 }
304 void barrier() override
305 {
306 ARCCORE_CHECK_HIP(hipStreamSynchronize(m_hip_stream));
307 }
308 bool _barrierNoException() override
309 {
310 return hipStreamSynchronize(m_hip_stream) != hipSuccess;
311 }
312 void copyMemory(const MemoryCopyArgs& args) override
313 {
314 auto r = hipMemcpyAsync(args.destination().data(), args.source().data(),
315 args.source().bytes().size(), hipMemcpyDefault, m_hip_stream);
316 ARCCORE_CHECK_HIP(r);
317 if (!args.isAsync())
318 barrier();
319 }
320 void prefetchMemory(const MemoryPrefetchArgs& args) override
321 {
322 auto src = args.source().bytes();
323 if (src.size() == 0)
324 return;
325 DeviceId d = args.deviceId();
326 int device = hipCpuDeviceId;
327 if (!d.isHost())
328 device = d.asInt32();
329 auto r = hipMemPrefetchAsync(src.data(), src.size(), device, m_hip_stream);
330 ARCCORE_CHECK_HIP(r);
331 if (!args.isAsync())
332 barrier();
333 }
335 {
336 return Impl::NativeStream(&m_hip_stream);
337 }
338
339 public:
340
341 hipStream_t trueStream() const
342 {
343 return m_hip_stream;
344 }
345
346 private:
347
348 Impl::IRunnerRuntime* m_runtime;
349 hipStream_t m_hip_stream;
350};
351
352/*---------------------------------------------------------------------------*/
353/*---------------------------------------------------------------------------*/
354
355class HipRunQueueEvent
357{
358 public:
359
360 explicit HipRunQueueEvent(bool has_timer)
361 {
362 if (has_timer)
363 ARCCORE_CHECK_HIP(hipEventCreate(&m_hip_event));
364 else
365 ARCCORE_CHECK_HIP(hipEventCreateWithFlags(&m_hip_event, hipEventDisableTiming));
366 }
367 ~HipRunQueueEvent() override
368 {
369 ARCCORE_CHECK_HIP_NOTHROW(hipEventDestroy(m_hip_event));
370 }
371
372 public:
373
374 // Register the event within a RunQueue
375 void recordQueue(Impl::IRunQueueStream* stream) final
376 {
377 auto* rq = static_cast<HipRunQueueStream*>(stream);
378 ARCCORE_CHECK_HIP(hipEventRecord(m_hip_event, rq->trueStream()));
379 }
380
381 void wait() final
382 {
383 ARCCORE_CHECK_HIP(hipEventSynchronize(m_hip_event));
384 }
385
386 void waitForEvent(Impl::IRunQueueStream* stream) final
387 {
388 auto* rq = static_cast<HipRunQueueStream*>(stream);
389 ARCCORE_CHECK_HIP(hipStreamWaitEvent(rq->trueStream(), m_hip_event, 0));
390 }
391
392 Int64 elapsedTime(IRunQueueEventImpl* from_event) final
393 {
394 auto* true_from_event = static_cast<HipRunQueueEvent*>(from_event);
395 ARCCORE_CHECK_POINTER(true_from_event);
396 float time_in_ms = 0.0;
397 ARCCORE_CHECK_HIP(hipEventElapsedTime(&time_in_ms, true_from_event->m_hip_event, m_hip_event));
398 double x = time_in_ms * 1.0e6;
399 Int64 nano_time = static_cast<Int64>(x);
400 return nano_time;
401 }
402
403 bool hasPendingWork() final
404 {
405 hipError_t v = hipEventQuery(m_hip_event);
406 if (v == hipErrorNotReady)
407 return true;
408 ARCCORE_CHECK_HIP(v);
409 return false;
410 }
411
412 private:
413
414 hipEvent_t m_hip_event;
415};
416
417/*---------------------------------------------------------------------------*/
418/*---------------------------------------------------------------------------*/
419
422{
423 public:
424
425 ~HipRunnerRuntime() override = default;
426
427 public:
428
429 void notifyBeginLaunchKernel() override
430 {
431 ++m_nb_kernel_launched;
432 if (m_is_verbose)
433 std::cout << "BEGIN HIP KERNEL!\n";
434 }
435 void notifyEndLaunchKernel() override
436 {
437 ARCCORE_CHECK_HIP(hipGetLastError());
438 if (m_is_verbose)
439 std::cout << "END HIP KERNEL!\n";
440 }
441 void barrier() override
442 {
443 ARCCORE_CHECK_HIP(hipDeviceSynchronize());
444 }
445 eExecutionPolicy executionPolicy() const override
446 {
448 }
449 Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
450 {
451 return new HipRunQueueStream(this, bi);
452 }
453 Impl::IRunQueueEventImpl* createEventImpl() override
454 {
455 return new HipRunQueueEvent(false);
456 }
457 Impl::IRunQueueEventImpl* createEventImplWithTimer() override
458 {
459 return new HipRunQueueEvent(true);
460 }
461 void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
462 {
463 auto v = buffer.bytes();
464 const void* ptr = v.data();
465 size_t count = v.size();
466 int device = device_id.asInt32();
467 hipMemoryAdvise hip_advise;
468
469 if (advice == eMemoryAdvice::MostlyRead)
470 hip_advise = hipMemAdviseSetReadMostly;
472 hip_advise = hipMemAdviseSetPreferredLocation;
473 else if (advice == eMemoryAdvice::AccessedByDevice)
474 hip_advise = hipMemAdviseSetAccessedBy;
475 else if (advice == eMemoryAdvice::PreferredLocationHost) {
476 hip_advise = hipMemAdviseSetPreferredLocation;
477 device = hipCpuDeviceId;
478 }
479 else if (advice == eMemoryAdvice::AccessedByHost) {
480 hip_advise = hipMemAdviseSetAccessedBy;
481 device = hipCpuDeviceId;
482 }
483 else
484 return;
485 //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << hip_advise << " id = " << device << "\n";
486 ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
487 }
488 void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
489 {
490 auto v = buffer.bytes();
491 const void* ptr = v.data();
492 size_t count = v.size();
493 int device = device_id.asInt32();
494 hipMemoryAdvise hip_advise;
495
496 if (advice == eMemoryAdvice::MostlyRead)
497 hip_advise = hipMemAdviseUnsetReadMostly;
499 hip_advise = hipMemAdviseUnsetPreferredLocation;
500 else if (advice == eMemoryAdvice::AccessedByDevice)
501 hip_advise = hipMemAdviseUnsetAccessedBy;
502 else if (advice == eMemoryAdvice::PreferredLocationHost) {
503 hip_advise = hipMemAdviseUnsetPreferredLocation;
504 device = hipCpuDeviceId;
505 }
506 else if (advice == eMemoryAdvice::AccessedByHost) {
507 hip_advise = hipMemAdviseUnsetAccessedBy;
508 device = hipCpuDeviceId;
509 }
510 else
511 return;
512 ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
513 }
514
515 void setCurrentDevice(DeviceId device_id) final
516 {
517 Int32 id = device_id.asInt32();
518 ARCCORE_FATAL_IF(!device_id.isAccelerator(), "Device {0} is not an accelerator device", id);
519 ARCCORE_CHECK_HIP(hipSetDevice(id));
520 }
521 const IDeviceInfoList* deviceInfoList() override { return &m_device_info_list; }
522
523 void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
524 {
525 hipPointerAttribute_t pa;
526 hipError_t ret_value = hipPointerGetAttributes(&pa, ptr);
527 auto mem_type = ePointerMemoryType::Unregistered;
528 // If ptr has not been dynamically allocated (i.e.: it is on the stack),
529 // hipPointerGetAttribute() returns an error. In this case, we consider
530 // the memory as unregistered.
531 if (ret_value == hipSuccess) {
532#if HIP_VERSION_MAJOR >= 6
533 auto rocm_memory_type = pa.type;
534#else
535 auto rocm_memory_type = pa.memoryType;
536#endif
537 if (pa.isManaged)
538 mem_type = ePointerMemoryType::Managed;
539 else if (rocm_memory_type == hipMemoryTypeHost)
540 mem_type = ePointerMemoryType::Host;
541 else if (rocm_memory_type == hipMemoryTypeDevice)
542 mem_type = ePointerMemoryType::Device;
543 }
544
545 //std::cout << "HIP Info: hip_memory_type=" << (int)pa.memoryType << " is_managed?=" << pa.isManaged
546 // << " flags=" << pa.allocationFlags
547 // << " my_memory_type=" << (int)mem_type
548 // << "\n";
549 _fillPointerAttribute(attribute, mem_type, pa.device,
550 ptr, pa.devicePointer, pa.hostPointer);
551 }
552
553 DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
554 {
555 int d = 0;
556 int wanted_d = device_id.asInt32();
557 ARCCORE_CHECK_HIP(hipGetDevice(&d));
558 if (d != wanted_d)
559 ARCCORE_CHECK_HIP(hipSetDevice(wanted_d));
560 size_t free_mem = 0;
561 size_t total_mem = 0;
562 ARCCORE_CHECK_HIP(hipMemGetInfo(&free_mem, &total_mem));
563 if (d != wanted_d)
564 ARCCORE_CHECK_HIP(hipSetDevice(d));
566 dmi.setFreeMemory(free_mem);
567 dmi.setTotalMemory(total_mem);
568 return dmi;
569 }
570
571 void pushProfilerRange(const String& name, [[maybe_unused]] Int32 color) override
572 {
573#ifdef ARCCORE_HAS_ROCTX
574 roctxRangePush(name.localstr());
575#endif
576 }
577 void popProfilerRange() override
578 {
579#ifdef ARCCORE_HAS_ROCTX
580 roctxRangePop();
581#endif
582 }
583
584 void finalize(ITraceMng* tm) override
585 {
586 finalizeHipMemoryAllocators(tm);
587 }
588
589 KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
590 const void* kernel_ptr,
591 Int64 total_loop_size) override
592 {
593 Int32 shared_memory = orig_args.sharedMemorySize();
594 if (orig_args.isCooperative()) {
595 // In cooperative mode, ensure that we do not launch more blocks
596 // than the maximum that can reside on the GPU.
597 Int32 nb_thread = orig_args.nbThreadPerBlock();
598 Int32 nb_block = orig_args.nbBlockPerGrid();
599 int nb_block_per_sm = 0;
600 ARCCORE_CHECK_HIP(hipOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
601
602 int max_block = static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
603 max_block = std::max(max_block, 1);
604 if (nb_block > max_block) {
605 KernelLaunchArgs modified_args(orig_args);
606 modified_args.setNbBlockPerGrid(max_block);
607 return modified_args;
608 }
609 }
610 return orig_args;
611 }
612
613 public:
614
615 void fillDevices(bool is_verbose);
616
617 void build()
618 {
619 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_COOPERATIVE_RATIO", true)) {
620 Int32 x = v.value();
621 x = std::clamp(x, 10, 100);
622 m_cooperative_ratio = x / 100.0;
623 }
624 }
625
626 private:
627
628 Int64 m_nb_kernel_launched = 0;
629 bool m_is_verbose = false;
630 Int32 m_multi_processor_count = 0;
631 double m_cooperative_ratio = 1.0;
632 Impl::DeviceInfoList m_device_info_list;
633};
634
635/*---------------------------------------------------------------------------*/
636/*---------------------------------------------------------------------------*/
637
638void HipRunnerRuntime::
639fillDevices(bool is_verbose)
640{
641 int nb_device = 0;
642 ARCCORE_CHECK_HIP(hipGetDeviceCount(&nb_device));
643 std::ostream& omain = std::cout;
644 if (is_verbose)
645 omain << "ArcaneHIP: Initialize Arcane HIP runtime nb_available_device=" << nb_device << "\n";
646 for (int i = 0; i < nb_device; ++i) {
647 std::ostringstream ostr;
648 std::ostream& o = ostr;
649
650 hipDeviceProp_t dp;
651 ARCCORE_CHECK_HIP(hipGetDeviceProperties(&dp, i));
652
653 int has_managed_memory = 0;
654 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i));
655
656 // The format of versions in HIP is:
657 // HIP_VERSION = (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
658
659 int runtime_version = 0;
660 ARCCORE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version));
661 //runtime_version /= 10000;
662 int runtime_major = runtime_version / 10000000;
663 int runtime_minor = (runtime_version / 100000) % 100;
664
665 int driver_version = 0;
666 ARCCORE_CHECK_HIP(hipDriverGetVersion(&driver_version));
667 //driver_version /= 10000;
668 int driver_major = driver_version / 10000000;
669 int driver_minor = (driver_version / 100000) % 100;
670
671 o << "\nDevice " << i << " name=" << dp.name << "\n";
672 o << " Driver version = " << driver_major << "." << (driver_minor) << "." << (driver_version % 100000) << "\n";
673 o << " Runtime version = " << runtime_major << "." << (runtime_minor) << "." << (runtime_version % 100000) << "\n";
674 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
675 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
676 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
677 o << " warpSize = " << dp.warpSize << "\n";
678 o << " memPitch = " << dp.memPitch << "\n";
679 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
680 o << " maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor << "\n";
681 o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
682 o << " totalConstMem = " << dp.totalConstMem << "\n";
683 o << " clockRate = " << dp.clockRate << "\n";
684 //o << " deviceOverlap = " << dp.deviceOverlap<< "\n";
685 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
686 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
687 o << " integrated = " << dp.integrated << "\n";
688 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
689 o << " computeMode = " << dp.computeMode << "\n";
690 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
691 << " " << dp.maxThreadsDim[2] << "\n";
692 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
693 << " " << dp.maxGridSize[2] << "\n";
694 o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
695 o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
696 o << " gcnArchName = " << dp.gcnArchName << "\n";
697 o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
698 o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
699 o << " hasManagedMemory = " << has_managed_memory << "\n";
700 o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
701 o << " memoryBusWitdh = " << dp.memoryBusWidth << " bits\n";
702
703 int clock_rate = 0;
704 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeClockRate, i));
705 o << " clockRate = " << (clock_rate / 1000) << " MHz\n";
706
707 int memory_clock_rate = 0;
708 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&memory_clock_rate, hipDeviceAttributeMemoryClockRate, i));
709 o << " memoryClockRate = " << (memory_clock_rate / 1000) << " MHz\n";
710
711 // On AMD, the frequency given for memory must be multiplied by 8
712 // to get the bandwidth of a bit of the bus (since we also have to divide by 8
713 // to get the value in bytes, we simply omit this division)
714 Real memory_bandwith = (dp.memoryBusWidth * memory_clock_rate * 2.0) / 1.0e6;
715 o << " MemoryBandwith = " << memory_bandwith << " GB/s\n";
716
717#if HIP_VERSION_MAJOR >= 6
718 o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
719 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
720 o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
721 o << " gpuDirectRDMASupported = " << dp.gpuDirectRDMASupported << "\n";
722 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
723 o << " unifiedFunctionPointers = " << dp.unifiedFunctionPointers << "\n";
724#endif
725
726 // TODO: We assume that all GPUs are the same and therefore
727 // that the number of SMs per GPU is the same. This is used to
728 // calculate the number of blocks in cooperative mode.
729 m_multi_processor_count = dp.multiProcessorCount;
730
731 std::ostringstream device_uuid_ostr;
732 {
733 hipDevice_t device;
734 ARCCORE_CHECK_HIP(hipDeviceGet(&device, i));
735 hipUUID device_uuid;
736 ARCCORE_CHECK_HIP(hipDeviceGetUuid(&device_uuid, device));
737 o << " deviceUuid=";
738 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
739 o << device_uuid_ostr.str();
740 o << "\n";
741 }
742
743 String description(ostr.str());
744 if (is_verbose)
745 omain << description;
746
747 DeviceInfo device_info;
748 device_info.setDescription(description);
749 device_info.setDeviceId(DeviceId(i));
750 device_info.setName(dp.name);
751 device_info.setWarpSize(dp.warpSize);
752 device_info.setUUIDAsString(device_uuid_ostr.str());
753 device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
754#if HIP_VERSION_MAJOR >= 6
755 device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
756 device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
757#endif
758 device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
759 device_info.setPCIDomainID(dp.pciDomainID);
760 device_info.setPCIBusID(dp.pciBusID);
761 device_info.setPCIDeviceID(dp.pciDeviceID);
762 m_device_info_list.addDevice(device_info);
763 }
764}
765
766/*---------------------------------------------------------------------------*/
767/*---------------------------------------------------------------------------*/
768
770: public IMemoryCopier
771{
772 void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
773 MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
774 const RunQueue* queue) override
775 {
776 if (queue) {
777 queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
778 return;
779 }
780 // 'hipMemcpyDefault' automatically knows what to do by only considering
781 // the value of the pointers. We should see if
782 // using from_mem and to_mem can improve performance.
783 ARCCORE_CHECK_HIP(hipMemcpy(to.data(), from.data(), from.bytes().size(), hipMemcpyDefault));
784 }
785};
786
787/*---------------------------------------------------------------------------*/
788/*---------------------------------------------------------------------------*/
789
790} // End namespace Arcane::Accelerator::Hip
791
792using namespace Arcane;
793
794namespace
795{
797Arcane::Accelerator::Hip::HipMemoryCopier global_hip_memory_copier;
798
799void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
800{
802 eMemoryResource mem = allocator->memoryResource();
803 mrm->setAllocator(mem, allocator);
804 mrm->setMemoryPool(mem, allocator->memoryPool());
805}
806} // namespace
807
808/*---------------------------------------------------------------------------*/
809/*---------------------------------------------------------------------------*/
810
811// This function is the entry point used when dynamically loading
812// this library
813extern "C" ARCCORE_EXPORT void
814arcaneRegisterAcceleratorRuntimehip(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
815{
816 using namespace Arcane::Accelerator::Hip;
817 global_hip_runtime.build();
818 Arcane::Accelerator::Impl::setUsingHIPRuntime(true);
819 Arcane::Accelerator::Impl::setHIPRunQueueRuntime(&global_hip_runtime);
820 initializeHipMemoryAllocators();
822 MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_hip_memory_allocator);
823 IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
824 mrm->setIsAccelerator(true);
825 _setAllocator(&unified_memory_hip_memory_allocator);
826 _setAllocator(&host_pinned_hip_memory_allocator);
827 _setAllocator(&device_hip_memory_allocator);
828 mrm->setCopier(&global_hip_memory_copier);
829 global_hip_runtime.fillDevices(init_info.isVerbose());
830}
831
832/*---------------------------------------------------------------------------*/
833/*---------------------------------------------------------------------------*/
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
#define ARCCORE_FATAL_IF(cond,...)
Macro throwing a FatalErrorException if cond is true.
Base class of a specific allocator for accelerator.
eMemoryResource memoryResource() const final
Memory resource provided by the allocator.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialization for Device memory.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialization for HostPinned memory.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialization for UVM memory.
bool isHost() const
Indicates if the instance is associated with the host.
bool isAccelerator() const
Indicates if the instance is associated with an accelerator.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copies the data from from to to with the queue queue.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification before command launch.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification of command launch completion.
bool _barrierNoException() override
Barrier without exception. Returns true in case of error.
void barrier() override
Blocks until all actions associated with this queue are finished.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Performs a prefetch of a memory region.
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory regions.
Impl::NativeStream nativeStream() override
Pointer to the internal structure dependent on the implementation.
void freeMemory(void *ptr, Int64 size) final
Frees the block located at address address containing size bytes.
void * allocateMemory(Int64 size) final
Allocates a block for size bytes.
Interface for event implementation.
Interface of an execution stream for a RunQueue.
Interface of the runtime associated with an accelerator.
bool isCooperative() const
Indicates if running in cooperative mode (i.e. cudaLaunchCooperativeKernel).
bool isDefault() const
Indicates if the instance only has default values.
bool isAsync() const
Indicates if the execution queue is asynchronous.
Definition RunQueue.cc:320
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory regions.
Definition RunQueue.cc:237
Constant view on a contiguous memory region containing fixed-size elements.
constexpr SpanType bytes() const
View in byte form.
constexpr const std::byte * data() const
Pointer to the memory region.
Template class for converting a type.
Interface for memory copies with accelerator support.
Internal part of Arcane's 'IMemoryResourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for resource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Sets the memory pool for resource r.
virtual void setIsAccelerator(bool v)=0
Indicates if an accelerator is available.
virtual void setCopier(IMemoryCopier *copier)=0
Sets the copying instance.
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.
Mutable view on a contiguous memory region containing fixed-size elements.
constexpr std::byte * data() const
Pointer to the memory region.
constexpr SpanType bytes() const
View in byte form.
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
Definition Span.h:539
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:327
const char * localstr() const
Returns the conversion of the instance into UTF-8 encoding.
Definition String.cc:229
@ AccessedByHost
Indicates that the memory region is accessed by the host.
@ PreferredLocationDevice
Prefers memory placement on the accelerator.
@ MostlyRead
Indicates that the memory region is primarily read-only.
@ AccessedByDevice
Indicates that the memory region is accessed by the device.
eExecutionPolicy
Execution policy for a Runner.
@ HIP
Execution policy using the HIP environment.
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the specific allocator for accelerators.
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
double Real
Type representing a real number.
eMemoryResource
List of available memory resources.
@ HostPinned
Allocates on the host.
@ UnifiedMemory
Allocates using unified memory.
@ Device
Allocates on the device.
std::int32_t Int32
Signed integer type of 32 bits.
Namespace of Arccore.