HipAcceleratorRuntime.cc
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* HipAcceleratorRuntime.cc                                    (C) 2000-2026 */
/*                                                                           */
/* Runtime for 'HIP'.                                                        */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arccore/accelerator_native/HipAccelerator.h"

#include "arccore/base/FatalErrorException.h"

#include "arccore/common/internal/MemoryUtilsInternal.h"
#include "arccore/common/internal/IMemoryResourceMngInternal.h"

#include "arccore/common/accelerator/RunQueueBuildInfo.h"
#include "arccore/common/accelerator/Memory.h"
#include "arccore/common/accelerator/DeviceInfoList.h"
#include "arccore/common/accelerator/KernelLaunchArgs.h"
#include "arccore/common/accelerator/RunQueue.h"
#include "arccore/common/accelerator/DeviceMemoryInfo.h"
#include "arccore/common/accelerator/NativeStream.h"
#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
#include "arccore/common/accelerator/internal/RunCommandImpl.h"
#include "arccore/common/accelerator/internal/IRunQueueStream.h"
#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"

#include <sstream>

#ifdef ARCCORE_HAS_ROCTX
#include <roctx.h>
#endif

using namespace Arccore;

namespace Arcane::Accelerator::Hip
{
using Impl::KernelLaunchArgs;

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

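// Pure-virtual interface over the raw HIP allocation and deallocation calls.
// Each memory kind (unified, host-pinned, device) provides a concrete
// implementation below, and UnderlyingAllocator adapts it to the generic
// allocator interface consumed by AcceleratorMemoryAllocatorBase.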
class ConcreteAllocator
{
 public:

  virtual ~ConcreteAllocator() = default;

 public:

  virtual hipError_t _allocate(void** ptr, size_t new_size) = 0;
  virtual hipError_t _deallocate(void* ptr) = 0;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

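// Adapter template: wraps a ConcreteAllocator and turns its hipError_t
// return codes into exceptions through the ARCCORE_CHECK_HIP macros;
// memory copies go through hipMemcpy with hipMemcpyDefault.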
template <typename ConcreteAllocatorType>
class UnderlyingAllocator
: public Impl::IUnderlyingAllocator // (assumed base interface; original line lost)
{
 public:

  UnderlyingAllocator() = default;

 public:

  void* allocateMemory(size_t size) final
  {
    void* out = nullptr;
    ARCCORE_CHECK_HIP(m_concrete_allocator._allocate(&out, size));
    return out;
  }
  void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
  {
    ARCCORE_CHECK_HIP_NOTHROW(m_concrete_allocator._deallocate(ptr));
  }

  void doMemoryCopy(void* destination, const void* source, Int64 size) final
  {
    ARCCORE_CHECK_HIP(hipMemcpy(destination, source, size, hipMemcpyDefault));
  }

  eMemoryResource memoryResource() const final
  {
    return m_concrete_allocator.memoryResource();
  }

 public:

  ConcreteAllocatorType m_concrete_allocator;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

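// Unified (managed) memory: hipMallocManaged() with hipMemAttachGlobal
// yields allocations that are accessible from both the host and the device.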
class UnifiedMemoryConcreteAllocator
: public ConcreteAllocator
{
 public:

  hipError_t _deallocate(void* ptr) final
  {
    return ::hipFree(ptr);
  }

  hipError_t _allocate(void** ptr, size_t new_size) final
  {
    auto r = ::hipMallocManaged(ptr, new_size, hipMemAttachGlobal);
    return r;
  }

  constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class UnifiedMemoryHipMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  UnifiedMemoryHipMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("UnifiedMemoryHipMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())
  {
  }

 public:

  void initialize()
  {
    _doInitializeUVM(true);
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

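// Host-pinned memory: hipHostMalloc() returns page-locked host memory,
// which enables faster and truly asynchronous transfers with the device.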
class HostPinnedConcreteAllocator
: public ConcreteAllocator
{
 public:

  hipError_t _allocate(void** ptr, size_t new_size) final
  {
    return ::hipHostMalloc(ptr, new_size);
  }
  hipError_t _deallocate(void* ptr) final
  {
    return ::hipHostFree(ptr);
  }
  constexpr eMemoryResource memoryResource() const { return eMemoryResource::HostPinned; }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class HostPinnedHipMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  HostPinnedHipMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("HostPinnedHipMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
  {
  }

 public:

  void initialize()
  {
    _doInitializeHostPinned(true); // (argument assumed; original line lost)
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

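// Device memory: plain hipMalloc()/hipFree(); allocations are not directly
// addressable from host code.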
class DeviceConcreteAllocator
: public ConcreteAllocator
{
 public:

  DeviceConcreteAllocator()
  {
  }

  hipError_t _allocate(void** ptr, size_t new_size) final
  {
    hipError_t r = ::hipMalloc(ptr, new_size);
    return r;
  }
  hipError_t _deallocate(void* ptr) final
  {
    return ::hipFree(ptr);
  }

  constexpr eMemoryResource memoryResource() const { return eMemoryResource::Device; }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class DeviceHipMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  DeviceHipMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("DeviceHipMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
  {
  }

 public:

  void initialize()
  {
    _doInitializeDevice(true); // (argument assumed; original line lost)
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace
{
  UnifiedMemoryHipMemoryAllocator unified_memory_hip_memory_allocator;
  HostPinnedHipMemoryAllocator host_pinned_hip_memory_allocator;
  DeviceHipMemoryAllocator device_hip_memory_allocator;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void initializeHipMemoryAllocators()
{
  unified_memory_hip_memory_allocator.initialize();
  device_hip_memory_allocator.initialize();
  host_pinned_hip_memory_allocator.initialize();
}

void finalizeHipMemoryAllocators(ITraceMng* tm)
{
  unified_memory_hip_memory_allocator.finalize(tm);
  device_hip_memory_allocator.finalize(tm);
  host_pinned_hip_memory_allocator.finalize(tm);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

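// Wraps a native hipStream_t and implements Impl::IRunQueueStream:
// kernel-launch notifications (with optional ROCTX ranges), barriers and
// asynchronous copy/prefetch operations all go through this stream.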
class HipRunQueueStream
: public Impl::IRunQueueStream
{
 public:

  HipRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCCORE_CHECK_HIP(hipStreamCreate(&m_hip_stream));
    else {
      int priority = bi.priority();
      ARCCORE_CHECK_HIP(hipStreamCreateWithPriority(&m_hip_stream, hipStreamDefault, priority));
    }
  }
  ~HipRunQueueStream() override
  {
    ARCCORE_CHECK_HIP_NOTHROW(hipStreamDestroy(m_hip_stream));
  }

 public:

  void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override
  {
#ifdef ARCCORE_HAS_ROCTX
    auto kname = c.kernelName();
    if (kname.empty())
      roctxRangePush(c.traceInfo().name());
    else
      roctxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel(Impl::RunCommandImpl&) override
  {
#ifdef ARCCORE_HAS_ROCTX
    roctxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCCORE_CHECK_HIP(hipStreamSynchronize(m_hip_stream));
  }
  bool _barrierNoException() override
  {
    return hipStreamSynchronize(m_hip_stream) != hipSuccess;
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto r = hipMemcpyAsync(args.destination().data(), args.source().data(),
                            args.source().bytes().size(), hipMemcpyDefault, m_hip_stream);
    ARCCORE_CHECK_HIP(r);
    if (!args.isAsync())
      barrier();
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    DeviceId d = args.deviceId();
    int device = hipCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    auto r = hipMemPrefetchAsync(src.data(), src.size(), device, m_hip_stream);
    ARCCORE_CHECK_HIP(r);
    if (!args.isAsync())
      barrier();
  }
  Impl::NativeStream nativeStream() override
  {
    return Impl::NativeStream(&m_hip_stream);
  }

 public:

  hipStream_t trueStream() const
  {
    return m_hip_stream;
  }

 private:

  Impl::IRunnerRuntime* m_runtime;
  hipStream_t m_hip_stream;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

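// Wraps a hipEvent_t. When no timer is requested the event is created with
// hipEventDisableTiming, which reduces recording overhead but makes it
// unusable with hipEventElapsedTime().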
class HipRunQueueEvent
: public Impl::IRunQueueEventImpl
{
 public:

  explicit HipRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCCORE_CHECK_HIP(hipEventCreate(&m_hip_event));
    else
      ARCCORE_CHECK_HIP(hipEventCreateWithFlags(&m_hip_event, hipEventDisableTiming));
  }
  ~HipRunQueueEvent() override
  {
    ARCCORE_CHECK_HIP_NOTHROW(hipEventDestroy(m_hip_event));
  }

 public:

  // Records the event within a RunQueue
  void recordQueue(Impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<HipRunQueueStream*>(stream);
    ARCCORE_CHECK_HIP(hipEventRecord(m_hip_event, rq->trueStream()));
  }

  void wait() final
  {
    ARCCORE_CHECK_HIP(hipEventSynchronize(m_hip_event));
  }

  void waitForEvent(Impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<HipRunQueueStream*>(stream);
    ARCCORE_CHECK_HIP(hipStreamWaitEvent(rq->trueStream(), m_hip_event, 0));
  }

  Int64 elapsedTime(IRunQueueEventImpl* from_event) final
  {
    auto* true_from_event = static_cast<HipRunQueueEvent*>(from_event);
    ARCCORE_CHECK_POINTER(true_from_event);
    float time_in_ms = 0.0;
    ARCCORE_CHECK_HIP(hipEventElapsedTime(&time_in_ms, true_from_event->m_hip_event, m_hip_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }

  bool hasPendingWork() final
  {
    hipError_t v = hipEventQuery(m_hip_event);
    if (v == hipErrorNotReady)
      return true;
    ARCCORE_CHECK_HIP(v);
    return false;
  }

 private:

  hipEvent_t m_hip_event;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

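// HIP implementation of the runner runtime. A single global instance is
// registered when the library is loaded (see
// arcaneRegisterAcceleratorRuntimehip() below) and acts as the factory for
// HIP streams and events.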
class HipRunnerRuntime
: public Impl::IRunnerRuntime
{
 public:

  ~HipRunnerRuntime() override = default;

 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN HIP KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCCORE_CHECK_HIP(hipGetLastError());
    if (m_is_verbose)
      std::cout << "END HIP KERNEL!\n";
  }
  void barrier() override
  {
    ARCCORE_CHECK_HIP(hipDeviceSynchronize());
  }
  eExecutionPolicy executionPolicy() const override
  {
    return eExecutionPolicy::HIP;
  }
  Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
  {
    return new HipRunQueueStream(this, bi);
  }
  Impl::IRunQueueEventImpl* createEventImpl() override
  {
    return new HipRunQueueEvent(false);
  }
  Impl::IRunQueueEventImpl* createEventImplWithTimer() override
  {
    return new HipRunQueueEvent(true);
  }
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    hipMemoryAdvise hip_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      hip_advise = hipMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      hip_advise = hipMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      hip_advise = hipMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      hip_advise = hipMemAdviseSetPreferredLocation;
      device = hipCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      hip_advise = hipMemAdviseSetAccessedBy;
      device = hipCpuDeviceId;
    }
    else
      return;
    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << hip_advise << " id = " << device << "\n";
    ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
  }
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    hipMemoryAdvise hip_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      hip_advise = hipMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      hip_advise = hipMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      hip_advise = hipMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      hip_advise = hipMemAdviseUnsetPreferredLocation;
      device = hipCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      hip_advise = hipMemAdviseUnsetAccessedBy;
      device = hipCpuDeviceId;
    }
    else
      return;
    ARCCORE_CHECK_HIP(hipMemAdvise(ptr, count, hip_advise, device));
  }

  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    ARCCORE_FATAL_IF(!device_id.isAccelerator(), "Device {0} is not an accelerator device", id);
    ARCCORE_CHECK_HIP(hipSetDevice(id));
  }
  const IDeviceInfoList* deviceInfoList() override { return &m_device_info_list; }

  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    hipPointerAttribute_t pa;
    hipError_t ret_value = hipPointerGetAttributes(&pa, ptr);
    auto mem_type = ePointerMemoryType::Unregistered;
    // If \a ptr was not allocated dynamically (i.e. it lives on the stack),
    // hipPointerGetAttributes() returns an error. In that case the memory
    // is treated as unregistered.
    if (ret_value == hipSuccess) {
#if HIP_VERSION_MAJOR >= 6
      auto rocm_memory_type = pa.type;
#else
      auto rocm_memory_type = pa.memoryType;
#endif
      if (pa.isManaged)
        mem_type = ePointerMemoryType::Managed;
      else if (rocm_memory_type == hipMemoryTypeHost)
        mem_type = ePointerMemoryType::Host;
      else if (rocm_memory_type == hipMemoryTypeDevice)
        mem_type = ePointerMemoryType::Device;
    }

    //std::cout << "HIP Info: hip_memory_type=" << (int)pa.memoryType << " is_managed?=" << pa.isManaged
    //          << " flags=" << pa.allocationFlags
    //          << " my_memory_type=" << (int)mem_type
    //          << "\n";
    _fillPointerAttribute(attribute, mem_type, pa.device,
                          ptr, pa.devicePointer, pa.hostPointer);
  }

  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCCORE_CHECK_HIP(hipGetDevice(&d));
    if (d != wanted_d)
      ARCCORE_CHECK_HIP(hipSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCCORE_CHECK_HIP(hipMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCCORE_CHECK_HIP(hipSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }

  void pushProfilerRange(const String& name, [[maybe_unused]] Int32 color) override
  {
#ifdef ARCCORE_HAS_ROCTX
    roctxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCCORE_HAS_ROCTX
    roctxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeHipMemoryAllocators(tm);
  }

  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
                                           const void* kernel_ptr,
                                           Int64 total_loop_size) override
  {
    Int32 shared_memory = orig_args.sharedMemorySize();
    if (orig_args.isCooperative()) {
      // In cooperative mode, make sure we do not launch more blocks
      // than the maximum number that can reside on the GPU.
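      // Example (illustrative numbers): with nb_block_per_sm = 4 on a GPU
      // with 304 multiprocessors, max_block = 4 * 304 = 1216, so a requested
      // grid of 2048 blocks is clamped to 1216 below.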
      Int32 nb_thread = orig_args.nbThreadPerBlock();
      Int32 nb_block = orig_args.nbBlockPerGrid();
      int nb_block_per_sm = 0;
      ARCCORE_CHECK_HIP(hipOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));

      int max_block = nb_block_per_sm * m_multi_processor_count;
      if (nb_block > max_block) {
        KernelLaunchArgs modified_args(orig_args);
        modified_args.setNbBlockPerGrid(max_block);
        return modified_args;
      }
    }
    return orig_args;
  }

 public:

  void fillDevices(bool is_verbose);

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  Int32 m_multi_processor_count = 0;
  Impl::DeviceInfoList m_device_info_list;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

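// Queries every visible HIP device, prints its properties when verbose mode
// is active and records a DeviceInfo entry for each device in
// m_device_info_list.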
void HipRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCCORE_CHECK_HIP(hipGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneHIP: Initialize Arcane HIP runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    std::ostringstream ostr;
    std::ostream& o = ostr;

    hipDeviceProp_t dp;
    ARCCORE_CHECK_HIP(hipGetDeviceProperties(&dp, i));

    int has_managed_memory = 0;
    ARCCORE_CHECK_HIP(hipDeviceGetAttribute(&has_managed_memory, hipDeviceAttributeManagedMemory, i));

    // HIP encodes versions as:
    // HIP_VERSION = (HIP_VERSION_MAJOR * 10000000 + HIP_VERSION_MINOR * 100000 + HIP_VERSION_PATCH)

    int runtime_version = 0;
    ARCCORE_CHECK_HIP(hipRuntimeGetVersion(&runtime_version));
    //runtime_version /= 10000;
    int runtime_major = runtime_version / 10000000;
    int runtime_minor = (runtime_version / 100000) % 100;

    int driver_version = 0;
    ARCCORE_CHECK_HIP(hipDriverGetVersion(&driver_version));
    //driver_version /= 10000;
    int driver_major = driver_version / 10000000;
    int driver_minor = (driver_version / 100000) % 100;

    o << "\nDevice " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << driver_major << "." << (driver_minor) << "." << (driver_version % 100000) << "\n";
    o << " Runtime version = " << runtime_major << "." << (runtime_minor) << "." << (runtime_version % 100000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " clockRate = " << dp.clockRate << "\n";
    //o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " gcnArchName = " << dp.gcnArchName << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hasManagedMemory = " << has_managed_memory << "\n";
    o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";
#if HIP_VERSION_MAJOR >= 6
    o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";
    o << " gpuDirectRDMASupported = " << dp.gpuDirectRDMASupported << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " unifiedFunctionPointers = " << dp.unifiedFunctionPointers << "\n";
#endif

    // TODO: We assume all GPUs are identical, so the number of SMs
    // per GPU is the same. This is used to compute the number of
    // blocks in cooperative mode.
    m_multi_processor_count = dp.multiProcessorCount;

    std::ostringstream device_uuid_ostr;
    {
      hipDevice_t device;
      ARCCORE_CHECK_HIP(hipDeviceGet(&device, i));
      hipUUID device_uuid;
      ARCCORE_CHECK_HIP(hipDeviceGetUuid(&device_uuid, device));
      o << " deviceUuid=";
      Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
      o << device_uuid_ostr.str();
      o << "\n";
    }

    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    device_info.setUUIDAsString(device_uuid_ostr.str());
    device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));
#if HIP_VERSION_MAJOR >= 6
    device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));
    device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));
#endif
    device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));
    device_info.setPCIDomainID(dp.pciDomainID);
    device_info.setPCIBusID(dp.pciBusID);
    device_info.setPCIDeviceID(dp.pciDeviceID);
    m_device_info_list.addDevice(device_info);
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

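// Memory copier registered with the memory-resource manager: the copy goes
// through the given RunQueue when one is supplied, otherwise it falls back
// to a synchronous hipMemcpy().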
class HipMemoryCopier
: public IMemoryCopier
{
  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,
            MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,
            const RunQueue* queue) override
  {
    if (queue) {
      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
      return;
    }
    // 'hipMemcpyDefault' automatically figures out what to do based solely
    // on the pointer values. It would be worth checking whether using
    // \a from_mem and \a to_mem could improve performance.
    ARCCORE_CHECK_HIP(hipMemcpy(to.data(), from.data(), from.bytes().size(), hipMemcpyDefault));
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::Hip

using namespace Arcane;

namespace
{
Arcane::Accelerator::Hip::HipRunnerRuntime global_hip_runtime;
Arcane::Accelerator::Hip::HipMemoryCopier global_hip_memory_copier;

void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
{
  IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
  eMemoryResource mem = allocator->memoryResource();
  mrm->setAllocator(mem, allocator);
  mrm->setMemoryPool(mem, allocator->memoryPool());
}
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This function is the entry point used when this library
// is loaded dynamically
extern "C" ARCCORE_EXPORT void
arcaneRegisterAcceleratorRuntimehip(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane::Accelerator::Hip;
  Arcane::Accelerator::Impl::setUsingHIPRuntime(true);
  Arcane::Accelerator::Impl::setHIPRunQueueRuntime(&global_hip_runtime);
  initializeHipMemoryAllocators();
  MemoryUtils::setDefaultDataMemoryResource(eMemoryResource::UnifiedMemory); // (argument assumed; original line lost)
  MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_hip_memory_allocator);
  IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
  mrm->setIsAccelerator(true);
  _setAllocator(&unified_memory_hip_memory_allocator);
  _setAllocator(&host_pinned_hip_memory_allocator);
  _setAllocator(&device_hip_memory_allocator);
  mrm->setCopier(&global_hip_memory_copier);
  global_hip_runtime.fillDevices(init_info.isVerbose());
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/