Arcane  v4.1.7.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
HipAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* HipAcceleratorRuntime.cc (C) 2000-2026 */
9/* */
10/* Runtime pour 'HIP'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/accelerator_native/HipAccelerator.h"
15
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
18
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
21
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
35
36#include <sstream>
37#include <algorithm>
38
39#ifdef ARCCORE_HAS_ROCTX
40#include <roctx.h>
41#endif
42
43using namespace Arccore;
44
45namespace Arcane::Accelerator::Hip
46{
47using Impl::KernelLaunchArgs;
48
49/*---------------------------------------------------------------------------*/
50/*---------------------------------------------------------------------------*/
51
// NOTE(review): the declaration line of this class (source line 52,
// presumably "class ConcreteAllocator") is missing from this capture — confirm
// against the original file. The body defines an abstract interface wrapping
// the raw HIP allocate/deallocate entry points; each concrete subclass maps to
// one memory resource (unified, host-pinned, device).
53{
 54 public:
 55
// Virtual destructor: instances are owned and destroyed through this base.
 56  virtual ~ConcreteAllocator() = default;
 57
 58 public:
 59
// Allocates new_size bytes, storing the address in *ptr; returns the HIP status.
 60  virtual hipError_t _allocate(void** ptr, size_t new_size) = 0;
// Releases memory previously returned by _allocate(); returns the HIP status.
 61  virtual hipError_t _deallocate(void* ptr) = 0;
 62};
63
64/*---------------------------------------------------------------------------*/
65/*---------------------------------------------------------------------------*/
66
// Adapter that plugs a ConcreteAllocator implementation into the generic
// accelerator-allocator machinery (allocate/free/copy + resource kind).
// NOTE(review): the base-class line (source line 69) is missing from this
// capture; the 'final' overrides below imply a base interface declaring
// allocateMemory/freeMemory/doMemoryCopy/memoryResource — confirm its name.
67template <typename ConcreteAllocatorType>
68class UnderlyingAllocator
70{
 71 public:
 72
 73  UnderlyingAllocator() = default;
 74
 75 public:
 76
// Allocates a block of \a size bytes; throws (via ARCCORE_CHECK_HIP) on failure.
 77  void* allocateMemory(size_t size) final
 78  {
 79    void* out = nullptr;
 80    ARCCORE_CHECK_HIP(m_concrete_allocator._allocate(&out, size));
 81    return out;
 82  }
// Frees a block; the NOTHROW variant is used because deallocation paths
// (e.g. destructors) must not throw.
 83  void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
 84  {
 85    ARCCORE_CHECK_HIP_NOTHROW(m_concrete_allocator._deallocate(ptr));
 86  }
 87
// Synchronous copy; hipMemcpyDefault lets HIP infer the direction from the
// pointer kinds.
 88  void doMemoryCopy(void* destination, const void* source, Int64 size) final
 89  {
 90    ARCCORE_CHECK_HIP(hipMemcpy(destination, source, size, hipMemcpyDefault));
 91  }
 92
// Memory resource (Device/HostPinned/UnifiedMemory) served by the wrapped allocator.
 93  eMemoryResource memoryResource() const final
 94  {
 95    return m_concrete_allocator.memoryResource();
 96  }
 97
 98 public:
 99
 100  ConcreteAllocatorType m_concrete_allocator;
 101};
102
103/*---------------------------------------------------------------------------*/
104/*---------------------------------------------------------------------------*/
105
// Concrete allocator backed by HIP unified (managed) memory.
// NOTE(review): the class-name line (source line 106) is missing from this
// capture; usage at the UnifiedMemoryHipMemoryAllocator constructor shows it
// is "class UnifiedMemoryConcreteAllocator" — confirm against the original.
107: public ConcreteAllocator
108{
 109 public:
 110
 111  hipError_t _deallocate(void* ptr) final
 112  {
 113    return ::hipFree(ptr);
 114  }
 115
// hipMemAttachGlobal: the managed allocation is accessible from any stream
// on any device.
 116  hipError_t _allocate(void** ptr, size_t new_size) final
 117  {
 118    auto r = ::hipMallocManaged(ptr, new_size, hipMemAttachGlobal);
 119    return r;
 120  }
 121
 122  constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }
 123};
124
125/*---------------------------------------------------------------------------*/
126/*---------------------------------------------------------------------------*/
127
128class UnifiedMemoryHipMemoryAllocator
129: public AcceleratorMemoryAllocatorBase
130{
131 public:
132
133 UnifiedMemoryHipMemoryAllocator()
134 : AcceleratorMemoryAllocatorBase("UnifiedMemoryHipMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
135 {
136 }
137
138 public:
139
140 void initialize()
141 {
142 _doInitializeUVM(true);
143 }
144};
145
146/*---------------------------------------------------------------------------*/
147/*---------------------------------------------------------------------------*/
148
// Concrete allocator backed by page-locked (pinned) host memory.
// NOTE(review): the class-name line (source line 149) is missing from this
// capture; usage at the HostPinnedHipMemoryAllocator constructor shows it is
// "class HostPinnedConcreteAllocator" — confirm against the original.
150: public ConcreteAllocator
151{
 152 public:
 153
 154  hipError_t _allocate(void** ptr, size_t new_size) final
 155  {
 156    return ::hipHostMalloc(ptr, new_size);
 157  }
 158  hipError_t _deallocate(void* ptr) final
 159  {
 160    return ::hipHostFree(ptr);
 161  }
 162  constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
 163};
164
165/*---------------------------------------------------------------------------*/
166/*---------------------------------------------------------------------------*/
167
// High-level allocator for page-locked host memory, built on
// HostPinnedConcreteAllocator (hipHostMalloc/hipHostFree).
168class HostPinnedHipMemoryAllocator
169: public AcceleratorMemoryAllocatorBase
170{
 171 public:
 172 public:
 173
 174  HostPinnedHipMemoryAllocator()
 175  : AcceleratorMemoryAllocatorBase("HostPinnedHipMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
 176  {
 177  }
 178
 179 public:
 180
// NOTE(review): the body line of initialize() (source line 183) is missing
// from this capture — presumably a call to _doInitializeHostPinned(); verify
// against the original file before relying on this listing.
 181  void initialize()
 182  {
 184  }
 185};
186
187/*---------------------------------------------------------------------------*/
188/*---------------------------------------------------------------------------*/
189
190class DeviceConcreteAllocator
191: public ConcreteAllocator
192{
193 public:
194
195 DeviceConcreteAllocator()
196 {
197 }
198
199 hipError_t _allocate(void** ptr, size_t new_size) final
200 {
201 hipError_t r = ::hipMalloc(ptr, new_size);
202 return r;
203 }
204 hipError_t _deallocate(void* ptr) final
205 {
206 return ::hipFree(ptr);
207 }
208
209 constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }
210};
211
212/*---------------------------------------------------------------------------*/
213/*---------------------------------------------------------------------------*/
214
// High-level allocator for device-only memory, built on
// DeviceConcreteAllocator (hipMalloc/hipFree).
215class DeviceHipMemoryAllocator
216: public AcceleratorMemoryAllocatorBase
217{
218
 219 public:
 220
 221  DeviceHipMemoryAllocator()
 222  : AcceleratorMemoryAllocatorBase("DeviceHipMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
 223  {
 224  }
 225
 226 public:
 227
// NOTE(review): the body line of initialize() (source line 230) is missing
// from this capture — presumably a call to _doInitializeDevice(); verify
// against the original file before relying on this listing.
 228  void initialize()
 229  {
 231  }
 232};
233
234/*---------------------------------------------------------------------------*/
235/*---------------------------------------------------------------------------*/
236
// File-local singleton instances of the three HIP allocators. They are
// registered with the memory-resource manager at runtime registration time
// (see arcaneRegisterAcceleratorRuntimehip) and must outlive all allocations.
237namespace
238{
 239  UnifiedMemoryHipMemoryAllocator unified_memory_hip_memory_allocator;
 240  HostPinnedHipMemoryAllocator host_pinned_hip_memory_allocator;
 241  DeviceHipMemoryAllocator device_hip_memory_allocator;
 242} // namespace
243
244/*---------------------------------------------------------------------------*/
245/*---------------------------------------------------------------------------*/
246
// Initializes the three singleton HIP allocators. Called once from the
// runtime registration entry point, before any accelerator allocation.
247void initializeHipMemoryAllocators()
248{
 249  unified_memory_hip_memory_allocator.initialize();
 250  device_hip_memory_allocator.initialize();
 251  host_pinned_hip_memory_allocator.initialize();
 252}
253
// Finalizes the three singleton HIP allocators (mirror of
// initializeHipMemoryAllocators()); \a tm may be used for trace output.
254void finalizeHipMemoryAllocators(ITraceMng* tm)
255{
 256  unified_memory_hip_memory_allocator.finalize(tm);
 257  device_hip_memory_allocator.finalize(tm);
 258  host_pinned_hip_memory_allocator.finalize(tm);
 259}
260
261/*---------------------------------------------------------------------------*/
262/*---------------------------------------------------------------------------*/
263
// Execution-stream implementation for a RunQueue on top of a hipStream_t.
// NOTE(review): source lines 265 (base-class, presumably
// ": public Impl::IRunQueueStream"), 297 (the notifyEndLaunchKernel
// declaration) and 334 (the nativeStream() declaration) are missing from
// this capture — confirm against the original file.
264class HipRunQueueStream
266{
 267 public:
 268
// Creates the underlying HIP stream; a non-default RunQueueBuildInfo carries
// a priority that is forwarded to hipStreamCreateWithPriority().
 269  HipRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
 270  : m_runtime(runtime)
 271  {
 272    if (bi.isDefault())
 273      ARCCORE_CHECK_HIP(hipStreamCreate(&m_hip_stream));
 274    else {
 275      int priority = bi.priority();
 276      ARCCORE_CHECK_HIP(hipStreamCreateWithPriority(&m_hip_stream, hipStreamDefault, priority));
 277    }
 278  }
 279  ~HipRunQueueStream() override
 280  {
// NOTHROW variant: destructors must not throw.
 281    ARCCORE_CHECK_HIP_NOTHROW(hipStreamDestroy(m_hip_stream));
 282  }
 283
 284 public:
 285
// Opens a ROCTx profiling range named after the kernel (when ROCTX is
// enabled), then forwards to the runtime's kernel-launch notification.
 286  void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override
 287  {
288#ifdef ARCCORE_HAS_ROCTX
 289    auto kname = c.kernelName();
 290    if (kname.empty())
 291      roctxRangePush(c.traceInfo().name());
 292    else
 293      roctxRangePush(kname.localstr());
294#endif
 295    return m_runtime->notifyBeginLaunchKernel();
 296  }
 298  {
299#ifdef ARCCORE_HAS_ROCTX
 300    roctxRangePop();
301#endif
 302    return m_runtime->notifyEndLaunchKernel();
 303  }
// Blocks until all work queued on this stream has completed.
 304  void barrier() override
 305  {
 306    ARCCORE_CHECK_HIP(hipStreamSynchronize(m_hip_stream));
 307  }
// Non-throwing barrier: returns true on error.
 308  bool _barrierNoException() override
 309  {
 310    return hipStreamSynchronize(m_hip_stream) != hipSuccess;
 311  }
// Asynchronous copy on this stream; falls back to a barrier for sync requests.
 312  void copyMemory(const MemoryCopyArgs& args) override
 313  {
 314    auto r = hipMemcpyAsync(args.destination().data(), args.source().data(),
 315                            args.source().bytes().size(), hipMemcpyDefault, m_hip_stream);
 316    ARCCORE_CHECK_HIP(r);
 317    if (!args.isAsync())
 318      barrier();
 319  }
// Prefetches a memory range to the target device (or to the CPU when the
// DeviceId designates the host). No-op on empty ranges.
 320  void prefetchMemory(const MemoryPrefetchArgs& args) override
 321  {
 322    auto src = args.source().bytes();
 323    if (src.size() == 0)
 324      return;
 325    DeviceId d = args.deviceId();
 326    int device = hipCpuDeviceId;
 327    if (!d.isHost())
 328      device = d.asInt32();
 329    auto r = hipMemPrefetchAsync(src.data(), src.size(), device, m_hip_stream);
 330    ARCCORE_CHECK_HIP(r);
 331    if (!args.isAsync())
 332      barrier();
 333  }
 335  {
 336    return Impl::NativeStream(&m_hip_stream);
 337  }
 338
 339 public:
 340
// Raw HIP stream, used by HipRunQueueEvent for record/wait operations.
 341  hipStream_t trueStream() const
 342  {
 343    return m_hip_stream;
 344  }
 345
 346 private:
 347
 348  Impl::IRunnerRuntime* m_runtime;
 349  hipStream_t m_hip_stream;
 350};
351
352/*---------------------------------------------------------------------------*/
353/*---------------------------------------------------------------------------*/
354
// Event implementation on top of hipEvent_t, used for stream synchronization
// and (optionally) kernel timing.
// NOTE(review): the base-class line (source line 356, presumably
// ": public Impl::IRunQueueEventImpl") is missing from this capture.
355class HipRunQueueEvent
357{
 358 public:
 359
// Timing-capable events are created plain; otherwise timing is disabled,
// which makes record/wait cheaper.
 360  explicit HipRunQueueEvent(bool has_timer)
 361  {
 362    if (has_timer)
 363      ARCCORE_CHECK_HIP(hipEventCreate(&m_hip_event));
 364    else
 365      ARCCORE_CHECK_HIP(hipEventCreateWithFlags(&m_hip_event, hipEventDisableTiming));
 366  }
 367  ~HipRunQueueEvent() override
 368  {
 369    ARCCORE_CHECK_HIP_NOTHROW(hipEventDestroy(m_hip_event));
 370  }
 371
 372 public:
 373
 374  // Records this event on the stream of the given RunQueue
 375  void recordQueue(Impl::IRunQueueStream* stream) final
 376  {
 377    auto* rq = static_cast<HipRunQueueStream*>(stream);
 378    ARCCORE_CHECK_HIP(hipEventRecord(m_hip_event, rq->trueStream()));
 379  }
 380
// Host-side wait until the event has completed.
 381  void wait() final
 382  {
 383    ARCCORE_CHECK_HIP(hipEventSynchronize(m_hip_event));
 384  }
 385
// Makes \a stream wait (device-side) for this event; does not block the host.
 386  void waitForEvent(Impl::IRunQueueStream* stream) final
 387  {
 388    auto* rq = static_cast<HipRunQueueStream*>(stream);
 389    ARCCORE_CHECK_HIP(hipStreamWaitEvent(rq->trueStream(), m_hip_event, 0));
 390  }
 391
// Elapsed time between \a from_event and this event, converted from the
// millisecond float reported by HIP to nanoseconds.
 392  Int64 elapsedTime(IRunQueueEventImpl* from_event) final
 393  {
 394    auto* true_from_event = static_cast<HipRunQueueEvent*>(from_event);
 395    ARCCORE_CHECK_POINTER(true_from_event);
 396    float time_in_ms = 0.0;
 397    ARCCORE_CHECK_HIP(hipEventElapsedTime(&time_in_ms, true_from_event->m_hip_event, m_hip_event));
 398    double x = time_in_ms * 1.0e6;
 399    Int64 nano_time = static_cast<Int64>(x);
 400    return nano_time;
 401  }
 402
// True while work recorded before the event has not yet completed;
// any status other than hipSuccess/hipErrorNotReady is raised as an error.
 403  bool hasPendingWork() final
 404  {
 405    hipError_t v = hipEventQuery(m_hip_event);
 406    if (v == hipErrorNotReady)
 407      return true;
 408    ARCCORE_CHECK_HIP(v);
 409    return false;
 410  }
 411
 412 private:
 413
 414  hipEvent_t m_hip_event;
 415};
416
417/*---------------------------------------------------------------------------*/
418/*---------------------------------------------------------------------------*/
419
// Runner runtime for HIP: creates streams/events, applies memory advice and
// exposes device information.
// NOTE(review): the class declaration lines (source lines 420-421, presumably
// "class HipRunnerRuntime : public Impl::IRunnerRuntime") are missing from
// this capture — confirm against the original file.
422{
 423 public:
 424
 425  ~HipRunnerRuntime() override = default;
 426
 427 public:
 428
// Counts launches; verbose tracing is controlled by m_is_verbose.
 429  void notifyBeginLaunchKernel() override
 430  {
 431    ++m_nb_kernel_launched;
 432    if (m_is_verbose)
 433      std::cout << "BEGIN HIP KERNEL!\n";
 434  }
// Surfaces any asynchronous launch error recorded by the HIP runtime.
 435  void notifyEndLaunchKernel() override
 436  {
 437    ARCCORE_CHECK_HIP(hipGetLastError());
 438    if (m_is_verbose)
 439      std::cout << "END HIP KERNEL!\n";
 440  }
// Device-wide synchronization (all streams).
 441  void barrier() override
 442  {
 443    ARCCORE_CHECK_HIP(hipDeviceSynchronize());
 444  }
// NOTE(review): the return statement (source line 447, presumably
// "return eExecutionPolicy::HIP;") is missing from this capture.
 445  eExecutionPolicy executionPolicy() const override
 446  {
 448  }
 449  Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
 450  {
 451    return new HipRunQueueStream(this, bi);
 452  }
 453  Impl::IRunQueueEventImpl* createEventImpl() override
 454  {
 455    return new HipRunQueueEvent(false);
 456  }
 457  Impl::IRunQueueEventImpl* createEventImplWithTimer() override
 458  {
 459    return new HipRunQueueEvent(true);
 460  }
// Maps the portable eMemoryAdvice values onto hipMemAdvise() "Set" advice;
// host-targeted advice redirects the device id to hipCpuDeviceId.
// NOTE(review): the branch line for PreferredLocationDevice (source line 471)
// is missing from this capture.
 461  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
 462  {
 463    auto v = buffer.bytes();
 464    const void* ptr = v.data();
 465    size_t count = v.size();
 466    int device = device_id.asInt32();
 467    hipMemoryAdvise hip_advise;
 468
 469    if (advice == eMemoryAdvice::MostlyRead)
 470      hip_advise = hipMemAdviseSetReadMostly;
 472      hip_advise = hipMemAdviseSetPreferredLocation;
 473    else if (advice == eMemoryAdvice::AccessedByDevice)
 474      hip_advise = hipMemAdviseSetAccessedBy;
 475    else if (advice == eMemoryAdvice::PreferredLocationHost) {
 476      hip_advise = hipMemAdviseSetPreferredLocation;
 477      device = hipCpuDeviceId;
 478    }
 479    else if (advice == eMemoryAdvice::AccessedByHost) {
 480      hip_advise = hipMemAdviseSetAccessedBy;
 481      device = hipCpuDeviceId;
 482    }
 483    else
 484      return;
 485    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << hip_advise << " id = " << device << "\n";
 486    ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
 487  }
// Mirror of setMemoryAdvice() using the hipMemAdvise() "Unset" advice values.
// NOTE(review): the branch line for PreferredLocationDevice (source line 498)
// is missing from this capture.
 488  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
 489  {
 490    auto v = buffer.bytes();
 491    const void* ptr = v.data();
 492    size_t count = v.size();
 493    int device = device_id.asInt32();
 494    hipMemoryAdvise hip_advise;
 495
 496    if (advice == eMemoryAdvice::MostlyRead)
 497      hip_advise = hipMemAdviseUnsetReadMostly;
 499      hip_advise = hipMemAdviseUnsetPreferredLocation;
 500    else if (advice == eMemoryAdvice::AccessedByDevice)
 501      hip_advise = hipMemAdviseUnsetAccessedBy;
 502    else if (advice == eMemoryAdvice::PreferredLocationHost) {
 503      hip_advise = hipMemAdviseUnsetPreferredLocation;
 504      device = hipCpuDeviceId;
 505    }
 506    else if (advice == eMemoryAdvice::AccessedByHost) {
 507      hip_advise = hipMemAdviseUnsetAccessedBy;
 508      device = hipCpuDeviceId;
 509    }
 510    else
 511      return;
 512    ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
 513  }
514
// Selects the current HIP device; rejects non-accelerator device ids.
 515  void setCurrentDevice(DeviceId device_id) final
 516  {
 517    Int32 id = device_id.asInt32();
 518    ARCCORE_FATAL_IF(!device_id.isAccelerator(), "Device {0} is not an accelerator device", id);
 519    ARCCORE_CHECK_HIP(hipSetDevice(id));
 520  }
 521  const IDeviceInfoList* deviceInfoList() override { return &m_device_info_list; }
 522
// Classifies \a ptr (managed/host/device/unregistered) via
// hipPointerGetAttributes() and fills \a attribute.
 523  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
 524  {
 525    hipPointerAttribute_t pa;
 526    hipError_t ret_value = hipPointerGetAttributes(&pa, ptr);
 527    auto mem_type = ePointerMemoryType::Unregistered;
 528    // If \a ptr was not dynamically allocated (i.e. it lives on the stack),
 529    // hipPointerGetAttribute() returns an error. In that case the memory is
 530    // treated as unregistered.
 531    if (ret_value == hipSuccess) {
532#if HIP_VERSION_MAJOR >= 6
 533      auto rocm_memory_type = pa.type;
534#else
 535      auto rocm_memory_type = pa.memoryType;
536#endif
 537      if (pa.isManaged)
 538        mem_type = ePointerMemoryType::Managed;
 539      else if (rocm_memory_type == hipMemoryTypeHost)
 540        mem_type = ePointerMemoryType::Host;
 541      else if (rocm_memory_type == hipMemoryTypeDevice)
 542        mem_type = ePointerMemoryType::Device;
 543    }
 544
 545    //std::cout << "HIP Info: hip_memory_type=" << (int)pa.memoryType << " is_managed?=" << pa.isManaged
 546    //          << " flags=" << pa.allocationFlags
 547    //          << " my_memory_type=" << (int)mem_type
 548    //          << "\n";
 549    _fillPointerAttribute(attribute, mem_type, pa.device,
 550                          ptr, pa.devicePointer, pa.hostPointer);
 551  }
 552
// Queries free/total memory of \a device_id, temporarily switching the
// current device when it differs, then restoring it.
// NOTE(review): the declaration of 'dmi' (source line 565, presumably
// "DeviceMemoryInfo dmi;") is missing from this capture.
 553  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
 554  {
 555    int d = 0;
 556    int wanted_d = device_id.asInt32();
 557    ARCCORE_CHECK_HIP(hipGetDevice(&d));
 558    if (d != wanted_d)
 559      ARCCORE_CHECK_HIP(hipSetDevice(wanted_d));
 560    size_t free_mem = 0;
 561    size_t total_mem = 0;
 562    ARCCORE_CHECK_HIP(hipMemGetInfo(&free_mem, &total_mem));
 563    if (d != wanted_d)
 564      ARCCORE_CHECK_HIP(hipSetDevice(d));
 566    dmi.setFreeMemory(free_mem);
 567    dmi.setTotalMemory(total_mem);
 568    return dmi;
 569  }
 570
// Profiler ranges map to ROCTx when available; the color hint is unused.
 571  void pushProfilerRange(const String& name, [[maybe_unused]] Int32 color) override
 572  {
573#ifdef ARCCORE_HAS_ROCTX
 574    roctxRangePush(name.localstr());
575#endif
 576  }
 577  void popProfilerRange() override
 578  {
579#ifdef ARCCORE_HAS_ROCTX
 580    roctxRangePop();
581#endif
 582  }
 583
 584  void finalize(ITraceMng* tm) override
 585  {
 586    finalizeHipMemoryAllocators(tm);
 587  }
588
// Adjusts launch arguments; in cooperative mode the grid is capped to the
// number of blocks that can be resident on the GPU simultaneously
// (required by hipLaunchCooperativeKernel-style launches).
 589  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
 590                                           const void* kernel_ptr,
 591                                           Int64 total_loop_size) override
 592  {
 593    Int32 shared_memory = orig_args.sharedMemorySize();
 594    if (orig_args.isCooperative()) {
 595      // In cooperative mode, make sure we do not launch more blocks
 596      // than the maximum number that can reside on the GPU.
 597      Int32 nb_thread = orig_args.nbThreadPerBlock();
 598      Int32 nb_block = orig_args.nbBlockPerGrid();
 599      int nb_block_per_sm = 0;
 600      ARCCORE_CHECK_HIP(hipOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
 601
// m_cooperative_ratio (10%..100%, see build()) scales the theoretical maximum.
 602      int max_block = static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
 603      max_block = std::max(max_block, 1);
 604      if (nb_block > max_block) {
 605        KernelLaunchArgs modified_args(orig_args);
 606        modified_args.setNbBlockPerGrid(max_block);
 607        return modified_args;
 608      }
 609    }
 610    return orig_args;
 611  }
 612
 613 public:
 614
 615  void fillDevices(bool is_verbose);
 616
// Reads ARCANE_ACCELERATOR_COOPERATIVE_RATIO (a percentage, clamped to
// [10,100]) to tune the cooperative-launch block cap.
 617  void build()
 618  {
 619    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_COOPERATIVE_RATIO", true)) {
 620      Int32 x = v.value();
 621      x = std::clamp(x, 10, 100);
 622      m_cooperative_ratio = x / 100.0;
 623    }
 624  }
 625
 626 private:
 627
 628  Int64 m_nb_kernel_launched = 0;
 629  bool m_is_verbose = false;
// Number of compute units of the (assumed homogeneous) GPUs; set by fillDevices().
 630  Int32 m_multi_processor_count = 0;
 631  double m_cooperative_ratio = 1.0;
 632  Impl::DeviceInfoList m_device_info_list;
 633};
634
635/*---------------------------------------------------------------------------*/
636/*---------------------------------------------------------------------------*/
637
638void HipRunnerRuntime::
639fillDevices(bool is_verbose)
640{
641 int nb_device = 0;
642 ARCCORE_CHECK_HIP(hipGetDeviceCount(&nb_device));
643 std::ostream& omain = std::cout;
644 if (is_verbose)
645 omain << "ArcaneHIP: Initialize Arcane HIP runtime nb_available_device=" << nb_device << "\n";
646 for (int i = 0; i < nb_device; ++i) {
647 std::ostringstream ostr;
648 std::ostream& o = ostr;
649
650 hipDeviceProp_t dp;
651 ARCCORE_CHECK_HIP(hipGetDeviceProperties(&dp, i));
652
653 int has_managed_memory = 0;
654 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i));
655
656 // Le format des versions dans HIP est:
657 // HIP_VERSION = (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)
658
659 int runtime_version = 0;
660 ARCCORE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version));
661 //runtime_version /= 10000;
662 int runtime_major = runtime_version / 10000000;
663 int runtime_minor = (runtime_version / 100000) % 100;
664
665 int driver_version = 0;
666 ARCCORE_CHECK_HIP(hipDriverGetVersion(&driver_version));
667 //driver_version /= 10000;
668 int driver_major = driver_version / 10000000;
669 int driver_minor = (driver_version / 100000) % 100;
670
671 o << "\nDevice " << i << " name=" << dp.name << "\n";
672 o << " Driver version = " << driver_major << "." << (driver_minor) << "." << (driver_version % 100000) << "\n";
673 o << " Runtime version = " << runtime_major << "." << (runtime_minor) << "." << (runtime_version % 100000) << "\n";
674 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
675 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
676 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
677 o << " warpSize = " << dp.warpSize << "\n";
678 o << " memPitch = " << dp.memPitch << "\n";
679 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
680 o << " maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor << "\n";
681 o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
682 o << " totalConstMem = " << dp.totalConstMem << "\n";
683 o << " clockRate = " << dp.clockRate << "\n";
684 //o << " deviceOverlap = " << dp.deviceOverlap<< "\n";
685 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
686 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
687 o << " integrated = " << dp.integrated << "\n";
688 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
689 o << " computeMode = " << dp.computeMode << "\n";
690 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
691 << " " << dp.maxThreadsDim[2] << "\n";
692 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
693 << " " << dp.maxGridSize[2] << "\n";
694 o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
695 o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
696 o << " gcnArchName = " << dp.gcnArchName << "\n";
697 o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
698 o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
699 o << " hasManagedMemory = " << has_managed_memory << "\n";
700 o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
701 o << " memoryBusWitdh = " << dp.memoryBusWidth << " bits\n";
702
703 int clock_rate = 0;
704 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&clock_rate, hipDeviceAttributeClockRate, i));
705 o << " clockRate = " << (clock_rate / 1000) << " MHz\n";
706
707 int memory_clock_rate = 0;
708 ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&memory_clock_rate, hipDeviceAttributeMemoryClockRate, i));
709 o << " memoryClockRate = " << (memory_clock_rate / 1000) << " MHz\n";
710
711 // Sur AMD, la fréquence donnée pour la mémoire doit être multipliée par 8
712 // pour avoir la bande passante d'un bit du bus (comme il faut aussi diviser par 8
713 // pour avoir la valeur en octet, on supprime simplement cette division)
714 Real memory_bandwith = (dp.memoryBusWidth * memory_clock_rate * 2.0) / 1.0e6;
715 o << " MemoryBandwith = " << memory_bandwith << " GB/s\n";
716
717#if HIP_VERSION_MAJOR >= 6
718 o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
719 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
720 o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
721 o << " gpuDirectRDMASupported = " << dp.gpuDirectRDMASupported << "\n";
722 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
723 o << " unifiedFunctionPointers = " << dp.unifiedFunctionPointers << "\n";
724#endif
725
726 // TODO: On suppose que tous les GPUs sont les mêmes et donc
727 // que le nombre de SM par GPU est le même. Cela est utilisé pour
728 // calculer le nombre de blocs en mode coopératif.
729 m_multi_processor_count = dp.multiProcessorCount;
730
731 std::ostringstream device_uuid_ostr;
732 {
733 hipDevice_t device;
734 ARCCORE_CHECK_HIP(hipDeviceGet(&device, i));
735 hipUUID device_uuid;
736 ARCCORE_CHECK_HIP(hipDeviceGetUuid(&device_uuid, device));
737 o << " deviceUuid=";
738 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
739 o << device_uuid_ostr.str();
740 o << "\n";
741 }
742
743 String description(ostr.str());
744 if (is_verbose)
745 omain << description;
746
747 DeviceInfo device_info;
748 device_info.setDescription(description);
749 device_info.setDeviceId(DeviceId(i));
750 device_info.setName(dp.name);
751 device_info.setWarpSize(dp.warpSize);
752 device_info.setUUIDAsString(device_uuid_ostr.str());
753 device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
754#if HIP_VERSION_MAJOR >= 6
755 device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
756 device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
757#endif
758 device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
759 device_info.setPCIDomainID(dp.pciDomainID);
760 device_info.setPCIBusID(dp.pciBusID);
761 device_info.setPCIDeviceID(dp.pciDeviceID);
762 m_device_info_list.addDevice(device_info);
763 }
764}
765
766/*---------------------------------------------------------------------------*/
767/*---------------------------------------------------------------------------*/
768
// Memory copier used by the memory-resource manager: goes through the
// RunQueue when one is provided, otherwise performs a direct blocking copy.
// NOTE(review): the class-name line (source line 769, presumably
// "class HipMemoryCopier") is missing from this capture; usage at the end of
// the file ("Hip::HipMemoryCopier global_hip_memory_copier") confirms the name.
770: public IMemoryCopier
771{
 772  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
 773            MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
 774            const RunQueue* queue) override
 775  {
 776    if (queue) {
 777      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
 778      return;
 779    }
 780    // 'hipMemcpyDefault' automatically figures out what to do based solely
 781    // on the pointer values. It would be worth checking whether using
 782    // \a from_mem and \a to_mem could improve performance.
 783    ARCCORE_CHECK_HIP(hipMemcpy(to.data(), from.data(), from.bytes().size(), hipMemcpyDefault));
 784  }
};
786
787/*---------------------------------------------------------------------------*/
788/*---------------------------------------------------------------------------*/
789
790} // End namespace Arcane::Accelerator::Hip
791
792using namespace Arcane;
793
// File-local runtime singletons and registration helper.
// NOTE(review): source line 796 (presumably the declaration of
// "global_hip_runtime", a Hip::HipRunnerRuntime instance used by the
// registration entry point) and source line 801 (presumably obtaining the
// IMemoryResourceMngInternal* 'mrm' used below) are missing from this
// capture — confirm against the original file.
namespace
{
797Arcane::Accelerator::Hip::HipMemoryCopier global_hip_memory_copier;
 798
// Registers \a allocator (and its memory pool) for its memory resource with
// the memory-resource manager.
 799void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
 800{
 802  eMemoryResource mem = allocator->memoryResource();
 803  mrm->setAllocator(mem, allocator);
 804  mrm->setMemoryPool(mem, allocator->memoryPool());
 805}
}
807
808/*---------------------------------------------------------------------------*/
809/*---------------------------------------------------------------------------*/
810
// This function is the entry point used when this library is loaded
// dynamically.
// NOTE(review): source line 821 is missing from this capture — presumably a
// call to Arcane::Accelerator::setDefaultDataMemoryResource(...) (see the
// Doxygen index for this file); confirm against the original source.
813extern "C" ARCCORE_EXPORT void
814arcaneRegisterAcceleratorRuntimehip(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
815{
 816  using namespace Arcane::Accelerator::Hip;
 817  global_hip_runtime.build();
 818  Arcane::Accelerator::Impl::setUsingHIPRuntime(true);
 819  Arcane::Accelerator::Impl::setHIPRunQueueRuntime(&global_hip_runtime);
 820  initializeHipMemoryAllocators();
 822  MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_hip_memory_allocator);
 823  IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
 824  mrm->setIsAccelerator(true);
 825  _setAllocator(&unified_memory_hip_memory_allocator);
 826  _setAllocator(&host_pinned_hip_memory_allocator);
 827  _setAllocator(&device_hip_memory_allocator);
 828  mrm->setCopier(&global_hip_memory_copier);
 829  global_hip_runtime.fillDevices(init_info.isVerbose());
 830}
831
832/*---------------------------------------------------------------------------*/
833/*---------------------------------------------------------------------------*/
#define ARCCORE_CHECK_POINTER(ptr)
Macro retournant le pointeur ptr s'il est non nul ou lançant une exception s'il est nul.
#define ARCCORE_FATAL_IF(cond,...)
Macro envoyant une exception FatalErrorException si cond est vrai.
Classe de base d'un allocateur spécifique pour accélérateur.
eMemoryResource memoryResource() const final
Ressource mémoire fournie par l'allocateur.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialisation pour la mémoire Device.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialisation pour la mémoire HostPinned.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialisation pour la mémoire UVM.
bool isHost() const
Indique si l'instance est associée à l'hôte.
bool isAccelerator() const
Indique si l'instance est associée à un accélérateur.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copie les données de from vers to avec la queue queue.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification avant le lancement de la commande.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification de fin de lancement de la commande.
bool _barrierNoException() override
Barrière sans exception. Retourne true en cas d'erreur.
void barrier() override
Bloque jusqu'à ce que toutes les actions associées à cette file soient terminées.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Effectue un pré-chargement d'une zone mémoire.
void copyMemory(const MemoryCopyArgs &args) override
Effectue une copie entre deux zones mémoire.
Impl::NativeStream nativeStream() override
Pointeur sur la structure interne dépendante de l'implémentation.
void * allocateMemory(size_t size) final
Alloue un bloc pour size octets.
void freeMemory(void *ptr, size_t size) final
Libère le bloc situé à l'adresse address contenant size octets.
Interface de l'implémentation d'un évènement.
Interface d'un flux d'exécution pour une RunQueue.
Interface du runtime associé à un accélérateur.
bool isCooperative() const
Indique si on lance en mode coopératif (i.e. cudaLaunchCooperativeKernel)
bool isDefault() const
Indique si l'instance a uniquement les valeurs par défaut.
bool isAsync() const
Indique si la file d'exécution est asynchrone.
Definition RunQueue.cc:320
void copyMemory(const MemoryCopyArgs &args) const
Copie des informations entre deux zones mémoires.
Definition RunQueue.cc:237
Vue constante sur une zone mémoire contigue contenant des éléments de taille fixe.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr const std::byte * data() const
Pointeur sur la zone mémoire.
Classe template pour convertir un type.
Interface pour les copies mémoire avec support des accélérateurs.
Partie interne à Arcane de 'IMemoryRessourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Positionne l'allocateur pour la ressource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Positionne le pool mémoire pour la ressource r.
virtual void setIsAccelerator(bool v)=0
Indique si un accélérateur est disponible.
virtual void setCopier(IMemoryCopier *copier)=0
Positionne l'instance gérant les copies.
virtual IMemoryResourceMngInternal * _internal()=0
Interface interne.
Interface du gestionnaire de traces.
Vue modifiable sur une zone mémoire contigue contenant des éléments de taille fixe.
constexpr std::byte * data() const
Pointeur sur la zone mémoire.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr __host__ __device__ pointer data() const noexcept
Pointeur sur le début de la vue.
Definition Span.h:537
constexpr __host__ __device__ SizeType size() const noexcept
Retourne la taille du tableau.
Definition Span.h:325
Chaîne de caractères unicode.
const char * localstr() const
Retourne la conversion de l'instance dans l'encodage UTF-8.
Definition String.cc:228
@ AccessedByHost
Indique que la zone mémoire est accédée par l'hôte.
@ PreferredLocationDevice
Privilégie le positionnement de la mémoire sur l'accélérateur.
@ MostlyRead
Indique que la zone mémoire est principalement en lecture seule.
@ PreferredLocationHost
Privilégie le positionnement de la mémoire sur l'hôte.
@ AccessedByDevice
Indique que la zone mémoire est accédée par l'accélérateur.
eExecutionPolicy
Politique d'exécution pour un Runner.
@ HIP
Politique d'exécution utilisant l'environnement HIP.
ARCCORE_COMMON_EXPORT IMemoryRessourceMng * getDataMemoryResourceMng()
Gestionnaire de ressource mémoire pour les données.
ARCCORE_COMMON_EXPORT IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Positionne l'allocateur spécifique pour les accélérateurs.
ARCCORE_COMMON_EXPORT void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Positionne la ressource mémoire utilisée pour l'allocateur mémoire des données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
std::int64_t Int64
Type entier signé sur 64 bits.
double Real
Type représentant un réel.
eMemoryResource
Liste des ressources mémoire disponibles.
@ HostPinned
Alloue sur l'hôte.
@ UnifiedMemory
Alloue en utilisant la mémoire unifiée.
@ Device
Alloue sur le device.
std::int32_t Int32
Type entier signé sur 32 bits.
Espace de nom de Arccore.