14#include "arccore/accelerator_native/CudaAccelerator.h"
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
36#include "arccore/accelerator_native/runtime/Cupti.h"
39#include <unordered_map>
48#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
49#include <nvtx3/nvToolsExt.h>
52namespace Arcane::Accelerator::Cuda
54using Impl::KernelLaunchArgs;
58 Int32 global_cupti_flush = 0;
67#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
69_getMemoryLocation(
int device_id)
71 cudaMemLocation mem_location;
72 mem_location.type = cudaMemLocationTypeDevice;
73 mem_location.id = device_id;
74 if (device_id == cudaCpuDeviceId)
75 mem_location.type = cudaMemLocationTypeHost;
77 mem_location.type = cudaMemLocationTypeDevice;
78 mem_location.id = device_id;
84_getMemoryLocation(
int device_id)
101 virtual cudaError_t _allocate(
void** ptr,
size_t new_size) = 0;
102 virtual cudaError_t _deallocate(
void* ptr) = 0;
108template <
typename ConcreteAllocatorType>
109class UnderlyingAllocator
114 UnderlyingAllocator() =
default;
121 ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
124 void freeMemory(
void* ptr, [[maybe_unused]]
size_t size)
final
126 ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
129 void doMemoryCopy(
void* destination,
const void* source,
Int64 size)
final
131 ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
136 return m_concrete_allocator.memoryResource();
141 ConcreteAllocatorType m_concrete_allocator;
147class UnifiedMemoryConcreteAllocator
152 UnifiedMemoryConcreteAllocator()
155 m_use_ats = v.value();
160 cudaError_t _deallocate(
void* ptr)
final
167 return ::cudaFree(ptr);
170 cudaError_t _allocate(
void** ptr,
size_t new_size)
final
173 *ptr = ::aligned_alloc(128, new_size);
176 auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
181 if (r != cudaSuccess)
194 cudaGetDevice(&device_id);
195 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
196 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
207 bool m_use_ats =
false;
221class UnifiedMemoryCudaMemoryAllocator
222:
public AcceleratorMemoryAllocatorBase
227 UnifiedMemoryCudaMemoryAllocator()
231 _setTraceLevel(v.value());
244 void* p = ptr.baseAddress();
245 Int64 s = ptr.capacity();
247 _applyHint(ptr.baseAddress(), ptr.size(), new_args);
260 cudaGetDevice(&device_id);
262 auto device_memory_location = _getMemoryLocation(device_id);
263 auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);
267 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
268 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
271 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
275 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
278 void _removeHint(
void* p,
size_t size, MemoryAllocationArgs args)
284 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
289 bool m_use_ats =
false;
300 cudaError_t _allocate(
void** ptr,
size_t new_size)
final
302 return ::cudaMallocHost(ptr, new_size);
304 cudaError_t _deallocate(
void* ptr)
final
306 return ::cudaFreeHost(ptr);
314class HostPinnedCudaMemoryAllocator
315:
public AcceleratorMemoryAllocatorBase
320 HostPinnedCudaMemoryAllocator()
336class DeviceConcreteAllocator
341 DeviceConcreteAllocator()
344 m_use_ats = v.value();
347 cudaError_t _allocate(
void** ptr,
size_t new_size)
final
351 *ptr = std::aligned_alloc(128, new_size);
354 return cudaErrorMemoryAllocation;
356 cudaError_t r = ::cudaMalloc(ptr, new_size);
360 cudaError_t _deallocate(
void* ptr)
final
367 return ::cudaFree(ptr);
374 bool m_use_ats =
false;
380class DeviceCudaMemoryAllocator
381:
public AcceleratorMemoryAllocatorBase
386 DeviceCudaMemoryAllocator()
412void initializeCudaMemoryAllocators()
414 unified_memory_cuda_memory_allocator.initialize();
415 device_cuda_memory_allocator.initialize();
416 host_pinned_cuda_memory_allocator.initialize();
419void finalizeCudaMemoryAllocators(
ITraceMng* tm)
421 unified_memory_cuda_memory_allocator.finalize(tm);
422 device_cuda_memory_allocator.finalize(tm);
423 host_pinned_cuda_memory_allocator.finalize(tm);
429void arcaneCheckCudaErrors(
const TraceInfo& ti, CUresult e)
431 if (e == CUDA_SUCCESS)
433 const char* error_name =
nullptr;
434 CUresult e2 = cuGetErrorName(e, &error_name);
435 if (e2 != CUDA_SUCCESS)
436 error_name =
"Unknown";
438 const char* error_message =
nullptr;
439 CUresult e3 = cuGetErrorString(e, &error_message);
440 if (e3 != CUDA_SUCCESS)
441 error_message =
"Unknown";
443 ARCCORE_FATAL(
"CUDA Error trace={0} e={1} name={2} message={3}",
444 ti, e, error_name, error_message);
461 Int32 getNbThreadPerBlock(
const void* kernel_ptr)
463 std::scoped_lock lock(m_mutex);
464 auto x = m_nb_thread_per_block_map.find(kernel_ptr);
465 if (x != m_nb_thread_per_block_map.end())
467 int min_grid_size = 0;
468 int computed_block_size = 0;
469 int wanted_shared_memory = 0;
470 cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
471 if (r != cudaSuccess)
472 computed_block_size = 0;
474 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
476 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
478 cudaFuncAttributes func_attr;
479 cudaFuncGetAttributes(&func_attr, kernel_ptr);
480 m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
481 std::cout <<
"ComputedBlockSize=" << computed_block_size <<
" n0=" << num_block_0 <<
" n1=" << num_block_1
482 <<
" min_grid_size=" << min_grid_size <<
" nb_reg=" << func_attr.numRegs;
484#if CUDART_VERSION >= 12040
486 const char* func_name =
nullptr;
487 cudaFuncGetName(&func_name, kernel_ptr);
488 std::cout <<
" name=" << func_name <<
"\n";
491 return computed_block_size;
496 std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
503class CudaRunQueueStream
512 ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
514 int priority = bi.priority();
515 ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
518 ~CudaRunQueueStream()
override
520 ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
527#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
528 auto kname = c.kernelName();
530 nvtxRangePush(c.traceInfo().name());
532 nvtxRangePush(kname.localstr());
534 return m_runtime->notifyBeginLaunchKernel();
538#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
541 return m_runtime->notifyEndLaunchKernel();
545 ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
546 if (global_cupti_flush > 0)
547 global_cupti_info.flush();
551 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
555 auto source_bytes = args.source().
bytes();
556 auto r = cudaMemcpyAsync(args.destination().
data(), source_bytes.data(),
557 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
558 ARCCORE_CHECK_CUDA(r);
564 auto src = args.source().
bytes();
568 int device = cudaCpuDeviceId;
573 auto mem_location = _getMemoryLocation(device);
574#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
575 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
577 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
579 ARCCORE_CHECK_CUDA(r);
590 cudaStream_t trueStream()
const
592 return m_cuda_stream;
598 cudaStream_t m_cuda_stream =
nullptr;
604class CudaRunQueueEvent
609 explicit CudaRunQueueEvent(
bool has_timer)
612 ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
614 ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
616 ~CudaRunQueueEvent()
override
618 ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
627 ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
632 ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
638 ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
641 Int64 elapsedTime(IRunQueueEventImpl* start_event)
final
645 auto* true_start_event =
static_cast<CudaRunQueueEvent*
>(start_event);
646 float time_in_ms = 0.0;
651 ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
652 double x = time_in_ms * 1.0e6;
657 bool hasPendingWork()
final
659 cudaError_t v = cudaEventQuery(m_cuda_event);
660 if (v == cudaErrorNotReady)
662 ARCCORE_CHECK_CUDA(v);
668 cudaEvent_t m_cuda_event;
683 void notifyBeginLaunchKernel()
override
685 ++m_nb_kernel_launched;
687 std::cout <<
"BEGIN CUDA KERNEL!\n";
689 void notifyEndLaunchKernel()
override
691 ARCCORE_CHECK_CUDA(cudaGetLastError());
693 std::cout <<
"END CUDA KERNEL!\n";
695 void barrier()
override
697 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
717 auto v = buffer.
bytes();
718 const void* ptr = v.
data();
719 size_t count = v.size();
720 int device = device_id.
asInt32();
721 cudaMemoryAdvise cuda_advise;
724 cuda_advise = cudaMemAdviseSetReadMostly;
726 cuda_advise = cudaMemAdviseSetPreferredLocation;
728 cuda_advise = cudaMemAdviseSetAccessedBy;
730 cuda_advise = cudaMemAdviseSetPreferredLocation;
731 device = cudaCpuDeviceId;
734 cuda_advise = cudaMemAdviseSetAccessedBy;
735 device = cudaCpuDeviceId;
740 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
744 auto v = buffer.
bytes();
745 const void* ptr = v.
data();
746 size_t count = v.size();
747 int device = device_id.
asInt32();
748 cudaMemoryAdvise cuda_advise;
751 cuda_advise = cudaMemAdviseUnsetReadMostly;
753 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
755 cuda_advise = cudaMemAdviseUnsetAccessedBy;
757 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
758 device = cudaCpuDeviceId;
761 cuda_advise = cudaMemAdviseUnsetAccessedBy;
762 device = cudaCpuDeviceId;
766 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
769 void setCurrentDevice(
DeviceId device_id)
final
773 ARCCORE_FATAL(
"Device {0} is not an accelerator device",
id);
774 ARCCORE_CHECK_CUDA(cudaSetDevice(
id));
777 const IDeviceInfoList* deviceInfoList()
final {
return &m_device_info_list; }
779 void startProfiling()
override
781 global_cupti_info.start();
784 void stopProfiling()
override
786 global_cupti_info.stop();
789 bool isProfilingActive()
override
791 return global_cupti_info.isActive();
794 void getPointerAttribute(
PointerAttribute& attribute,
const void* ptr)
override
796 cudaPointerAttributes ca;
797 ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
801 _fillPointerAttribute(attribute, mem_type, ca.device,
802 ptr, ca.devicePointer, ca.hostPointer);
808 int wanted_d = device_id.
asInt32();
809 ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
811 ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
813 size_t total_mem = 0;
814 ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
816 ARCCORE_CHECK_CUDA(cudaSetDevice(d));
818 dmi.setFreeMemory(free_mem);
819 dmi.setTotalMemory(total_mem);
823 void pushProfilerRange(
const String& name,
Int32 color_rgb)
override
825#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
826 if (color_rgb >= 0) {
829 nvtxEventAttributes_t eventAttrib;
830 std::memset(&eventAttrib, 0,
sizeof(nvtxEventAttributes_t));
831 eventAttrib.version = NVTX_VERSION;
832 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
833 eventAttrib.colorType = NVTX_COLOR_ARGB;
834 eventAttrib.color = color_rgb;
835 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
836 eventAttrib.message.ascii = name.
localstr();
837 nvtxRangePushEx(&eventAttrib);
843 void popProfilerRange()
override
845#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
852 finalizeCudaMemoryAllocators(tm);
856 const void* kernel_ptr,
857 Int64 total_loop_size)
override
865 int nb_block_per_sm = 0;
866 ARCCORE_CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
868 int max_block =
static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
869 max_block = std::max(max_block, 1);
870 if (nb_block > max_block) {
873 return modified_args;
878 if (!m_use_computed_occupancy)
880 if (shared_memory < 0)
883 if (shared_memory != 0)
885 Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
886 if (computed_block_size == 0)
892 Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
893 int blocks_per_grid = CheckedConvert::toInt32(big_b);
896 return modified_args;
901 void fillDevices(
bool is_verbose);
905 m_use_computed_occupancy = v.value();
908 x = std::clamp(x, 10, 100);
909 m_cooperative_ratio = x / 100.0;
915 Int64 m_nb_kernel_launched = 0;
916 bool m_is_verbose =
false;
917 bool m_use_computed_occupancy =
false;
918 Int32 m_multi_processor_count = 0;
919 double m_cooperative_ratio = 1.0;
927void CudaRunnerRuntime::
928fillDevices(
bool is_verbose)
931 ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
932 std::ostream& omain = std::cout;
934 omain <<
"ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device <<
"\n";
935 for (
int i = 0; i < nb_device; ++i) {
937 cudaGetDeviceProperties(&dp, i);
938 int runtime_version = 0;
939 cudaRuntimeGetVersion(&runtime_version);
940 int driver_version = 0;
941 cudaDriverGetVersion(&driver_version);
942 std::ostringstream ostr;
943 std::ostream& o = ostr;
944 o <<
"Device " << i <<
" name=" << dp.name <<
"\n";
945 o <<
" Driver version = " << (driver_version / 1000) <<
"." << (driver_version % 1000) <<
"\n";
946 o <<
" Runtime version = " << (runtime_version / 1000) <<
"." << (runtime_version % 1000) <<
"\n";
947 o <<
" computeCapability = " << dp.major <<
"." << dp.minor <<
"\n";
948 o <<
" totalGlobalMem = " << dp.totalGlobalMem <<
"\n";
949 o <<
" sharedMemPerBlock = " << dp.sharedMemPerBlock <<
"\n";
950 o <<
" sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor <<
"\n";
951 o <<
" sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin <<
"\n";
952 o <<
" regsPerBlock = " << dp.regsPerBlock <<
"\n";
953 o <<
" warpSize = " << dp.warpSize <<
"\n";
954 o <<
" memPitch = " << dp.memPitch <<
"\n";
955 o <<
" maxThreadsPerBlock = " << dp.maxThreadsPerBlock <<
"\n";
956 o <<
" maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor <<
"\n";
957 o <<
" maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor <<
"\n";
958 o <<
" totalConstMem = " << dp.totalConstMem <<
"\n";
959 o <<
" cooperativeLaunch = " << dp.cooperativeLaunch <<
"\n";
960 o <<
" multiProcessorCount = " << dp.multiProcessorCount <<
"\n";
961 o <<
" integrated = " << dp.integrated <<
"\n";
962 o <<
" canMapHostMemory = " << dp.canMapHostMemory <<
"\n";
963 o <<
" directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost <<
"\n";
964 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
965 o <<
" pageableMemoryAccess = " << dp.pageableMemoryAccess <<
"\n";
966 o <<
" concurrentManagedAccess = " << dp.concurrentManagedAccess <<
"\n";
967 o <<
" pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables <<
"\n";
968 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
969 o <<
" maxThreadsDim = " << dp.maxThreadsDim[0] <<
" " << dp.maxThreadsDim[1]
970 <<
" " << dp.maxThreadsDim[2] <<
"\n";
971 o <<
" maxGridSize = " << dp.maxGridSize[0] <<
" " << dp.maxGridSize[1]
972 <<
" " << dp.maxGridSize[2] <<
"\n";
973 o <<
" pciInfo = " << dp.pciDomainID <<
" " << dp.pciBusID <<
" " << dp.pciDeviceID <<
"\n";
974 o <<
" memoryBusWitdh = " << dp.memoryBusWidth <<
" bits\n";
977 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, i));
978 o <<
" clockRate = " << (clock_rate / 1000) <<
" MHz\n";
980 int memory_clock_rate = 0;
981 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&memory_clock_rate, cudaDevAttrMemoryClockRate, i));
982 o <<
" memoryClockRate = " << (memory_clock_rate / 1000) <<
" MHz\n";
984 Real memory_bandwith = ((dp.memoryBusWidth * memory_clock_rate * 2.0) / 8.0) / 1.0e6;
985 o <<
" MemoryBandwith = " << memory_bandwith <<
" GB/s\n";
987#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
988 o <<
" deviceOverlap = " << dp.deviceOverlap <<
"\n";
989 o <<
" computeMode = " << dp.computeMode <<
"\n";
990 o <<
" kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled <<
"\n";
996 m_multi_processor_count = dp.multiProcessorCount;
1000 int greatest_val = 0;
1001 ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
1002 o <<
" leastPriority = " << least_val <<
" greatestPriority = " << greatest_val <<
"\n";
1004 std::ostringstream device_uuid_ostr;
1007 ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
1009 ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
1010 o <<
" deviceUuid=";
1011 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
1012 o << device_uuid_ostr.str();
1015 String description(ostr.str());
1017 omain << description;
1020 device_info.setDescription(description);
1021 device_info.setDeviceId(
DeviceId(i));
1022 device_info.setName(dp.name);
1023 device_info.setWarpSize(dp.warpSize);
1024 device_info.setUUIDAsString(device_uuid_ostr.str());
1025 device_info.setSharedMemoryPerBlock(
static_cast<Int32>(dp.sharedMemPerBlock));
1026 device_info.setSharedMemoryPerMultiprocessor(
static_cast<Int32>(dp.sharedMemPerMultiprocessor));
1027 device_info.setSharedMemoryPerBlockOptin(
static_cast<Int32>(dp.sharedMemPerBlockOptin));
1028 device_info.setTotalConstMemory(
static_cast<Int32>(dp.totalConstMem));
1029 device_info.setPCIDomainID(dp.pciDomainID);
1030 device_info.setPCIBusID(dp.pciBusID);
1031 device_info.setPCIDeviceID(dp.pciDeviceID);
1032 m_device_info_list.addDevice(device_info);
1035 Int32 global_cupti_level = 0;
1039 global_cupti_level = v.value();
1041 global_cupti_flush = v.value();
1042 bool do_print_cupti =
true;
1044 do_print_cupti = (v.value() != 0);
1046 if (global_cupti_level > 0) {
1047#ifndef ARCCORE_HAS_CUDA_CUPTI
1048 ARCCORE_FATAL(
"Trying to enable CUPTI but Arcane is not compiled with cupti support");
1050 global_cupti_info.init(global_cupti_level, do_print_cupti);
1051 global_cupti_info.start();
1072 ARCCORE_CHECK_CUDA(cudaMemcpy(to.
data(), from.
data(), from.
bytes().
size(), cudaMemcpyDefault));
1103extern "C" ARCCORE_EXPORT
void
1104arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
1106 using namespace Arcane::Accelerator::Cuda;
1107 global_cuda_runtime.build();
1108 Accelerator::Impl::setUsingCUDARuntime(
true);
1109 Accelerator::Impl::setCUDARunQueueRuntime(&global_cuda_runtime);
1110 initializeCudaMemoryAllocators();
1115 _setAllocator(&unified_memory_cuda_memory_allocator);
1116 _setAllocator(&host_pinned_cuda_memory_allocator);
1117 _setAllocator(&device_cuda_memory_allocator);
1118 mrm->
setCopier(&global_cuda_memory_copier);
1119 global_cuda_runtime.fillDevices(init_info.isVerbose());
#define ARCCORE_FATAL(...)
Macro envoyant une exception FatalErrorException.
#define ARCCORE_CHECK_POINTER(ptr)
Macro retournant le pointeur ptr s'il est non nul ou lançant une exception s'il est nul.
Classe de base d'un allocateur spécifique pour accélérateur.
eMemoryResource memoryResource() const final
Ressource mémoire fournie par l'allocateur.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialisation pour la mémoire Device.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialisation pour la mémoire HostPinned.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialisation pour la mémoire UVM.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copie les données de from vers to avec la queue queue.
void barrier() override
Bloque jusqu'à ce que toutes les actions associées à cette file soient terminées.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification avant le lancement de la commande.
bool _barrierNoException() override
Barrière sans exception. Retourne true en cas d'erreur.
Impl::NativeStream nativeStream() override
Pointeur sur la structure interne dépendante de l'implémentation.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Effectue un pré-chargement d'une zone mémoire.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification de fin de lancement de la commande.
void copyMemory(const MemoryCopyArgs &args) override
Effectue une copie entre deux zones mémoire.
Classe singleton pour gérer CUPTI.
Map contenant l'occupation idéale pour un kernel donné.
void freeMemory(void *ptr, size_t size) final
Libère le bloc situé à l'adresse address contenant size octets.
void * allocateMemory(size_t size) final
Alloue un bloc pour size octets.
bool m_use_hint_as_mainly_device
Si vrai, par défaut on considère toutes les allocations comme eMemoryLocationHint::MainlyDevice.
Allocateur pour la mémoire unifiée.
void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
Notifie du changement des arguments spécifiques à l'instance.
Identifiant d'un composant du système.
bool isHost() const
Indique si l'instance est associée à l'hôte.
Int32 asInt32() const
Valeur numérique du device.
bool isAccelerator() const
Indique si l'instance est associée à un accélérateur.
Information sur un accélérateur.
Information mémoire d'un accélérateur.
Interface d'une liste de devices.
Interface d'une liste de devices.
Interface de l'implémentation d'un évènement.
Interface d'un flux d'exécution pour une RunQueue.
Interface du runtime associé à un accélérateur.
Arguments pour lancer un kernel.
bool isCooperative() const
Indique si on lance en mode coopératif (i.e. cudaLaunchCooperativeKernel)
Int32 nbBlockPerGrid() const
Nombre de blocs de la grille.
void setNbThreadPerBlock(Int32 v)
Nombre de threads par bloc.
void setNbBlockPerGrid(Int32 v)
Nombre de blocs de la grille.
Int32 nbThreadPerBlock() const
Nombre de threads par bloc.
Int32 sharedMemorySize() const
Mémoire partagée à allouer pour le noyau.
Type opaque pour encapsuler une 'stream' native.
Implémentation d'une commande pour accélérateur.
Arguments pour la copie mémoire.
Arguments pour le préfetching mémoire.
Informations sur une adresse mémoire.
Informations pour créer une RunQueue.
bool isDefault() const
Indique si l'instance a uniquement les valeurs par défaut.
File d'exécution pour un accélérateur.
bool isAsync() const
Indique si la file d'exécution est asynchrone.
void copyMemory(const MemoryCopyArgs &args) const
Copie des informations entre deux zones mémoire.
Informations sur une zone mémoire allouée.
Vue constante sur une zone mémoire contigue contenant des éléments de taille fixe.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr const std::byte * data() const
Pointeur sur la zone mémoire.
Classe template pour convertir un type.
Interface pour les copies mémoire avec support des accélérateurs.
Partie interne à Arcane de 'IMemoryRessourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Positionne l'allocateur pour la ressource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Positionne le pool mémoire pour la ressource r.
virtual void setIsAccelerator(bool v)=0
Indique si un accélérateur est disponible.
virtual void setCopier(IMemoryCopier *copier)=0
Positionne l'instance gérant les copies.
virtual IMemoryResourceMngInternal * _internal()=0
Interface interne.
Interface du gestionnaire de traces.
Classe contenant des informations pour spécialiser les allocations.
Vue modifiable sur une zone mémoire contigue contenant des éléments de taille fixe.
constexpr std::byte * data() const
Pointeur sur la zone mémoire.
constexpr SpanType bytes() const
Vue sous forme d'octets.
constexpr __host__ __device__ pointer data() const noexcept
Pointeur sur le début de la vue.
constexpr __host__ __device__ SizeType size() const noexcept
Retourne la taille du tableau.
Chaîne de caractères unicode.
const char * localstr() const
Retourne la conversion de l'instance dans l'encodage UTF-8.
eMemoryAdvice
Conseils pour la gestion mémoire.
@ AccessedByHost
Indique que la zone mémoire est accédée par l'hôte.
@ PreferredLocationDevice
Privilégie le positionnement de la mémoire sur l'accélérateur.
@ MostlyRead
Indique que la zone mémoire est principalement en lecture seule.
@ PreferredLocationHost
Privilégie le positionnement de la mémoire sur l'hôte.
@ AccessedByDevice
Indique que la zone mémoire est accédée par l'accélérateur.
ePointerMemoryType
Type de mémoire pour un pointeur.
eExecutionPolicy
Politique d'exécution pour un Runner.
@ CUDA
Politique d'exécution utilisant l'environnement CUDA.
ARCCORE_COMMON_EXPORT IMemoryRessourceMng * getDataMemoryResourceMng()
Gestionnaire de ressource mémoire pour les données.
ARCCORE_COMMON_EXPORT IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Positionne l'allocateur spécifique pour les accélérateurs.
ARCCORE_COMMON_EXPORT void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Positionne la ressource mémoire utilisée pour l'allocateur mémoire des données.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
std::int64_t Int64
Type entier signé sur 64 bits.
eMemoryLocationHint
Indices sur la localisation mémoire attendue.
@ MainlyHost
Indique que la donnée sera plutôt utilisée sur CPU.
@ HostAndDeviceMostlyRead
Indique que la donnée sera utilisée à la fois sur accélérateur et sur CPU et qu'elle ne sera pas souvent modifiée.
@ MainlyDevice
Indique que la donnée sera plutôt utilisée sur accélérateur.
double Real
Type représentant un réel.
eMemoryResource
Liste des ressources mémoire disponibles.
@ HostPinned
Alloue sur l'hôte.
@ UnifiedMemory
Alloue en utilisant la mémoire unifiée.
@ Device
Alloue sur le device.
std::int32_t Int32
Type entier signé sur 32 bits.