#include "arccore/accelerator_native/CudaAccelerator.h"

#include "arccore/base/CheckedConvert.h"
#include "arccore/base/FatalErrorException.h"

#include "arccore/common/internal/MemoryUtilsInternal.h"
#include "arccore/common/internal/IMemoryResourceMngInternal.h"

#include "arccore/common/accelerator/RunQueueBuildInfo.h"
#include "arccore/common/accelerator/Memory.h"
#include "arccore/common/accelerator/DeviceInfoList.h"
#include "arccore/common/accelerator/KernelLaunchArgs.h"
#include "arccore/common/accelerator/RunQueue.h"
#include "arccore/common/accelerator/DeviceMemoryInfo.h"
#include "arccore/common/accelerator/NativeStream.h"
#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
#include "arccore/common/accelerator/internal/RunCommandImpl.h"
#include "arccore/common/accelerator/internal/IRunQueueStream.h"
#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"

#include "arccore/accelerator_native/runtime/Cupti.h"

#include <unordered_map>

#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif
namespace Arcane::Accelerator::Cuda
{
using Impl::KernelLaunchArgs;

Int32 global_cupti_flush = 0;
#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
cudaMemLocation
_getMemoryLocation(int device_id)
{
  cudaMemLocation mem_location;
  mem_location.type = cudaMemLocationTypeDevice;
  mem_location.id = device_id;
  if (device_id == cudaCpuDeviceId)
    mem_location.type = cudaMemLocationTypeHost;
  else {
    mem_location.type = cudaMemLocationTypeDevice;
    mem_location.id = device_id;
  }
  return mem_location;
}
#else
int
_getMemoryLocation(int device_id)
{
  return device_id;
}
#endif
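// Starting with CUDA 13 the memory-advice and prefetch APIs take a cudaMemLocation instead of
// a plain device id; _getMemoryLocation() hides that difference for the cudaMemAdvise() and
// cudaMemPrefetchAsync() call sites below.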
// Interface of the concrete allocators used below (enclosing abstract class elided in this
// listing): one primitive to allocate and one to release memory.
  virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
  virtual cudaError_t _deallocate(void* ptr) = 0;
template <typename ConcreteAllocatorType>
class UnderlyingAllocator
{
 public:

  UnderlyingAllocator() = default;

 public:

  void* allocateMemory(size_t size) final
  {
    void* out = nullptr;
    ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
    return out;
  }
  void freeMemory(void* ptr, [[maybe_unused]] size_t size) final
  {
    ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
  }
  void doMemoryCopy(void* destination, const void* source, Int64 size) final
  {
    ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
  }
  // (enclosing method elided) the memory resource reported is the concrete allocator's:
    return m_concrete_allocator.memoryResource();

 public:

  ConcreteAllocatorType m_concrete_allocator;
};
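// The concrete allocators below (unified memory, device, host-pinned) only decide how memory
// is obtained and released; UnderlyingAllocator<T> layers the CUDA error checks and the
// memcpy-based copy on top. (Assumption: the *CudaMemoryAllocator classes further down reach
// these concrete allocators through AcceleratorMemoryAllocatorBase; the wiring is not shown
// in this listing.)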
class UnifiedMemoryConcreteAllocator
{
 public:
  UnifiedMemoryConcreteAllocator()
  {
    // (elided) reads an optional value 'v' from the environment to enable ATS mode:
      m_use_ats = v.value();
  }
  cudaError_t _deallocate(void* ptr) final
  {
    if (m_use_ats) {
      ::free(ptr); // ATS allocations come from ::aligned_alloc()
      return cudaSuccess;
    }
    return ::cudaFree(ptr);
  }
  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    if (m_use_ats) {
      // With ATS a plain host allocation is directly accessible from the device.
      *ptr = ::aligned_alloc(128, new_size);
      return (*ptr) ? cudaSuccess : cudaErrorMemoryAllocation;
    }
    auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
    if (r != cudaSuccess)
      return r;
    // (elided) depending on the page policy, hint the driver about the preferred location:
    void* p = *ptr;
    int device_id = 0;
    cudaGetDevice(&device_id);
    ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
    ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
    return cudaSuccess;
  }
 public:
  bool m_use_ats = false;
};
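// A managed allocation obtained this way can later be re-advised when the caller attaches an
// eMemoryLocationHint to it (see UnifiedMemoryCudaMemoryAllocator::_applyHint below);
// cudaMemAdviseSetPreferredLocation only steers page placement, it does not forbid access
// from the other side.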
class UnifiedMemoryCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:
  UnifiedMemoryCudaMemoryAllocator()
  {
    // (elided) an optional trace level 'v' is read from the environment:
      _setTraceLevel(v.value());
  }
  void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
  {
    void* p = ptr.baseAddress();
    Int64 s = ptr.capacity();
    // (elided) the hints of 'old_args' are removed first, then:
    _applyHint(ptr.baseAddress(), ptr.size(), new_args);
  }
 private:
  void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)
  {
    eMemoryLocationHint hint = args.memoryLocationHint();
    int device_id = 0;
    cudaGetDevice(&device_id);
    auto device_memory_location = _getMemoryLocation(device_id);
    auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);
    if (hint == eMemoryLocationHint::MainlyDevice) {
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
    }
    if (hint == eMemoryLocationHint::MainlyHost)
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
    if (hint == eMemoryLocationHint::HostAndDeviceMostlyRead)
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
  }
  void _removeHint(void* p, size_t size, MemoryAllocationArgs args)
  {
    if (args.memoryLocationHint() == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      int device_id = 0;
      cudaGetDevice(&device_id);
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
    }
  }
 private:
  bool m_use_ats = false;
};
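// Typical flow: the container owning the memory calls notifyMemoryArgsChanged() when the user
// changes the eMemoryLocationHint of an array (for example switching a field to
// eMemoryLocationHint::MainlyDevice before a GPU-heavy phase); only the driver advice changes,
// the allocation itself is left untouched.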
// Concrete allocator for host-pinned memory (enclosing class name elided in this listing):

  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    return ::cudaMallocHost(ptr, new_size);
  }
  cudaError_t _deallocate(void* ptr) final
  {
    return ::cudaFreeHost(ptr);
  }

class HostPinnedCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:
  HostPinnedCudaMemoryAllocator()
  {
    // (constructor body elided)
  }
};
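// cudaMallocHost() returns page-locked (pinned) host memory: it stays host-resident but gives
// the GPU DMA-friendly pages, which is what lets the cudaMemcpyAsync() calls issued by
// CudaRunQueueStream::copyMemory() overlap with computation for host<->device transfers.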
class DeviceConcreteAllocator
{
 public:
  DeviceConcreteAllocator()
  {
    // (elided) reads an optional value 'v' from the environment to enable ATS mode:
      m_use_ats = v.value();
  }
  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    if (m_use_ats) {
      // With ATS, "device" memory can come from a plain host allocation.
      *ptr = std::aligned_alloc(128, new_size);
      if (!(*ptr))
        return cudaErrorMemoryAllocation;
      return cudaSuccess;
    }
    cudaError_t r = ::cudaMalloc(ptr, new_size);
    return r;
  }
  cudaError_t _deallocate(void* ptr) final
  {
    if (m_use_ats) {
      std::free(ptr);
      return cudaSuccess;
    }
    return ::cudaFree(ptr);
  }
 public:
  bool m_use_ats = false;
};

class DeviceCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:
  DeviceCudaMemoryAllocator()
  {
    // (constructor body elided)
  }
};
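// Note that the concrete allocators return raw cudaError_t values: deciding whether a failure
// is fatal is left to the wrapper (ARCCORE_CHECK_CUDA versus ARCCORE_CHECK_CUDA_NOTHROW in
// UnderlyingAllocator above).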
void
initializeCudaMemoryAllocators()
{
  unified_memory_cuda_memory_allocator.initialize();
  device_cuda_memory_allocator.initialize();
  host_pinned_cuda_memory_allocator.initialize();
}

void
finalizeCudaMemoryAllocators(ITraceMng* tm)
{
  unified_memory_cuda_memory_allocator.finalize(tm);
  device_cuda_memory_allocator.finalize(tm);
  host_pinned_cuda_memory_allocator.finalize(tm);
}
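// These two functions bracket the lifetime of the three global allocator instances (unified
// memory, device, host-pinned): initializeCudaMemoryAllocators() is called from the runtime
// registration entry point at the end of this file, and finalizeCudaMemoryAllocators() from
// the runtime finalization; the ITraceMng argument is presumably used for end-of-run
// reporting by each allocator's finalize().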
void
arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;

  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";

  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";

  ARCCORE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}", ti, e, error_name, error_message);
}
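// This overload handles driver-API return codes (CUresult). The ARCCORE_CHECK_CUDA macro used
// throughout this file presumably dispatches to it for driver calls such as cuDeviceGet() and
// cuDeviceGetUuid() in fillDevices(), and to a cudaError_t overload for runtime-API calls.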
  //! Map containing the ideal occupancy for a given kernel (enclosing class elided in this listing).

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;

    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;

    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);

    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;

    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs;
#if CUDART_VERSION >= 12040
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    std::cout << " name=" << func_name << "\n";
#else
    std::cout << "\n";
#endif
    return computed_block_size;
  }

 private:

  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
  std::mutex m_mutex;
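// cudaOccupancyMaxPotentialBlockSize() returns the block size that maximizes theoretical
// occupancy for the given kernel; the result is cached per kernel pointer so the query and
// the diagnostic printout only run on the first launch of each kernel.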
class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:
  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

 public:
  void notifyBeginLaunchKernel(impl::RunCommandImpl& c) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.null())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCCORE_CHECK_CUDA(r);
    // (elided) a synchronous copy request is followed by a barrier().
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    int device = cudaCpuDeviceId;
    // (elided) if the target DeviceId of 'args' is not the host, 'device' is set to its asInt32() value.
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCCORE_CHECK_CUDA(r);
  }

 public:
  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:
  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};
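// Each RunQueue owns one such stream: kernel launches, copyMemory() and prefetchMemory() are
// all enqueued on m_cuda_stream, so work submitted to different RunQueue objects may overlap
// while work on the same queue stays ordered.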
class CudaRunQueueEvent
: public IRunQueueEventImpl
{
 public:
  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

 public:
  // (signature elided) record the event on the stream of queue 'rq':
    ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));

  // (signature elided) host-side wait until the recorded event completes:
    ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));

  // (signature elided) make the stream of queue 'rq' wait for this event:
    ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));

  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    ARCCORE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;
    ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    // cudaEventElapsedTime() reports milliseconds; convert to nanoseconds.
    double x = time_in_ms * 1.0e6;
    return static_cast<Int64>(x);
  }

  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCCORE_CHECK_CUDA(v);
    return false;
  }

 private:
  cudaEvent_t m_cuda_event;
};
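// Events created without the timer use cudaEventDisableTiming, which makes event record/query
// cheaper; cudaEventElapsedTime() requires both events to have been created with timing
// enabled, so only timed events should be passed to elapsedTime().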
// Implementation of the accelerator runtime interface for CUDA (class declaration partially
// elided in this listing).
class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:
  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCCORE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  void barrier() override
  {
    ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  // (exact signature elided in this listing) apply a memory advice to a memory view:
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;
    switch (advice) {
    case eMemoryAdvice::MostlyRead:
      cuda_advise = cudaMemAdviseSetReadMostly;
      break;
    case eMemoryAdvice::PreferredLocationDevice:
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      break;
    case eMemoryAdvice::AccessedByDevice:
      cuda_advise = cudaMemAdviseSetAccessedBy;
      break;
    case eMemoryAdvice::PreferredLocationHost:
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
      break;
    case eMemoryAdvice::AccessedByHost:
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
      break;
    default:
      return;
    }
    ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  // (exact signature elided in this listing) remove a previously applied memory advice:
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;
    switch (advice) {
    case eMemoryAdvice::MostlyRead:
      cuda_advise = cudaMemAdviseUnsetReadMostly;
      break;
    case eMemoryAdvice::PreferredLocationDevice:
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      break;
    case eMemoryAdvice::AccessedByDevice:
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      break;
    case eMemoryAdvice::PreferredLocationHost:
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
      break;
    case eMemoryAdvice::AccessedByHost:
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
      break;
    default:
      return;
    }
    ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
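// These advice calls target managed (unified) memory; on memory the CUDA driver does not
// manage, cudaMemAdvise() typically fails, and the ARCCORE_CHECK_CUDA wrapper turns that
// failure into a fatal error.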
  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCCORE_FATAL("Device {0} is not an accelerator device", id);
    ARCCORE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }
  void stopProfiling() override
  {
    global_cupti_info.stop();
  }
  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }

  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // (elided) ca.type is converted to the corresponding ePointerMemoryType 'mem_type', then:
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  // (exact signature elided in this listing) query the free/total memory of a device:
  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCCORE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }
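// getDeviceMemoryInfo() temporarily switches the current device because cudaMemGetInfo()
// always reports the memory of the device that is current on the calling thread, then
// restores the previous device so the query has no side effect.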
  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // A color was requested: use the extended NVTX range attributes.
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }
  // (enclosing method signature elided) runtime finalization, given the trace manager 'tm':
    finalizeCudaMemoryAllocators(tm);

  // (method name elided in this listing) computes the kernel launch arguments from the cached occupancy:
  KernelLaunchArgs /*name elided*/ (const KernelLaunchArgs& orig_args,
                                    const void* kernel_ptr,
                                    Int64 total_loop_size) override
  {
    if (!m_use_computed_occupancy)
      return orig_args;
    Int32 wanted_shared_memory = orig_args.sharedMemorySize();
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // The occupancy-based computation is skipped when dynamic shared memory is requested.
    if (wanted_shared_memory != 0)
      return orig_args;
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return orig_args;
    // blocks_per_grid = ceil(total_loop_size / computed_block_size)
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size, wanted_shared_memory };
  }

 public:
  void fillDevices(bool is_verbose);

  // (enclosing method elided, probably the build()/initialization step) an optional value 'v'
  // read from the environment enables the computed occupancy:
    m_use_computed_occupancy = v.value();

 private:
  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
};
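// Worked example for the launch-argument computation above: with total_loop_size = 1'000'000
// and computed_block_size = 256, big_b = (1'000'000 + 255) / 256 = 3907 blocks, i.e. a
// 3907 x 256 launch (slightly more threads than iterations, so the kernel body must guard
// against the excess indices).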
void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    std::ostringstream ostr;
    std::ostream& o = ostr;
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif
    int least_val = 0;
    int greatest_val = 0;
    ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
    o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";

    CUdevice device;
    ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
    CUuuid device_uuid;
    ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
    o << " uuid=";
    impl::printUUID(o, device_uuid.bytes);
    o << "\n";

    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    m_device_info_list.addDevice(device_info);
  }
  // CUPTI profiling is configured from environment variables (reads of the optional values 'v'
  // are elided in this listing):
  Int32 global_cupti_level = 0;
    global_cupti_level = v.value();
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCCORE_HAS_CUDA_CUPTI
    ARCCORE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}
  ARCCORE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
// Registration entry point of the CUDA runtime (function name elided in this listing).
extern "C" ARCCORE_EXPORT void
/*name elided*/ (Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  // (elided) the data memory resource manager 'mrm' is retrieved and the accelerator
  // allocators are registered on it, then:
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}
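// At this point the CUDA backend is fully wired in: RunQueue objects created with
// eExecutionPolicy::CUDA go through CudaRunnerRuntime, data can be allocated from the
// UnifiedMemory, Device and HostPinned resources, and cross-resource copies go through
// global_cuda_memory_copier.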