14#include "arccore/accelerator_native/CudaAccelerator.h"
16#include "arccore/base/CheckedConvert.h"
17#include "arccore/base/FatalErrorException.h"
19#include "arccore/common/internal/MemoryUtilsInternal.h"
20#include "arccore/common/internal/IMemoryResourceMngInternal.h"
22#include "arccore/common/accelerator/RunQueueBuildInfo.h"
23#include "arccore/common/accelerator/Memory.h"
24#include "arccore/common/accelerator/DeviceInfoList.h"
25#include "arccore/common/accelerator/KernelLaunchArgs.h"
26#include "arccore/common/accelerator/RunQueue.h"
27#include "arccore/common/accelerator/DeviceMemoryInfo.h"
28#include "arccore/common/accelerator/NativeStream.h"
29#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
30#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
31#include "arccore/common/accelerator/internal/RunCommandImpl.h"
32#include "arccore/common/accelerator/internal/IRunQueueStream.h"
33#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
34#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"
36#include "arccore/accelerator_native/runtime/Cupti.h"
39#include <unordered_map>
48#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
49#include <nvtx3/nvToolsExt.h>
52namespace Arcane::Accelerator::Cuda
54using Impl::KernelLaunchArgs;
58 Int32 global_cupti_flush = 0;
67#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
69_getMemoryLocation(
int device_id)
71 cudaMemLocation mem_location;
72 mem_location.type = cudaMemLocationTypeDevice;
73 mem_location.id = device_id;
74 if (device_id == cudaCpuDeviceId)
75 mem_location.type = cudaMemLocationTypeHost;
77 mem_location.type = cudaMemLocationTypeDevice;
78 mem_location.id = device_id;
84_getMemoryLocation(
int device_id)
101 virtual cudaError_t _allocate(
void** ptr,
size_t new_size) = 0;
102 virtual cudaError_t _deallocate(
void* ptr) = 0;
108template <
typename ConcreteAllocatorType>
109class UnderlyingAllocator
114 UnderlyingAllocator() =
default;
121 ARCCORE_CHECK_CUDA(m_concrete_allocator._allocate(&out, size));
126 ARCCORE_CHECK_CUDA_NOTHROW(m_concrete_allocator._deallocate(ptr));
129 void doMemoryCopy(
void* destination,
const void* source,
Int64 size)
final
131 ARCCORE_CHECK_CUDA(cudaMemcpy(destination, source, size, cudaMemcpyDefault));
136 return m_concrete_allocator.memoryResource();
141 ConcreteAllocatorType m_concrete_allocator;
147class UnifiedMemoryConcreteAllocator
152 UnifiedMemoryConcreteAllocator()
155 m_use_ats = v.value();
160 cudaError_t _deallocate(
void* ptr)
final
167 return ::cudaFree(ptr);
170 cudaError_t _allocate(
void** ptr,
size_t new_size)
final
173 *ptr = ::aligned_alloc(128, new_size);
176 auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
181 if (r != cudaSuccess)
195 cudaGetDevice(&device_id);
196 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
197 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
208 bool m_use_ats =
false;
224class UnifiedMemoryCudaMemoryAllocator
225:
public AcceleratorMemoryAllocatorBase
230 UnifiedMemoryCudaMemoryAllocator()
234 _setTraceLevel(v.value());
247 void* p = ptr.baseAddress();
248 Int64 s = ptr.capacity();
250 _applyHint(ptr.baseAddress(), ptr.size(), new_args);
263 cudaGetDevice(&device_id);
265 auto device_memory_location = _getMemoryLocation(device_id);
266 auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);
270 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));
271 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));
274 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));
278 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));
281 void _removeHint(
void* p,
size_t size, MemoryAllocationArgs args)
287 ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));
292 bool m_use_ats =
false;
303 cudaError_t _allocate(
void** ptr,
size_t new_size)
final
305 return ::cudaMallocHost(ptr, new_size);
307 cudaError_t _deallocate(
void* ptr)
final
309 return ::cudaFreeHost(ptr);
317class HostPinnedCudaMemoryAllocator
318:
public AcceleratorMemoryAllocatorBase
323 HostPinnedCudaMemoryAllocator()
339class DeviceConcreteAllocator
344 DeviceConcreteAllocator()
347 m_use_ats = v.value();
350 cudaError_t _allocate(
void** ptr,
size_t new_size)
final
354 *ptr = std::aligned_alloc(128, new_size);
357 return cudaErrorMemoryAllocation;
359 cudaError_t r = ::cudaMalloc(ptr, new_size);
363 cudaError_t _deallocate(
void* ptr)
final
370 return ::cudaFree(ptr);
377 bool m_use_ats =
false;
383class DeviceCudaMemoryAllocator
384:
public AcceleratorMemoryAllocatorBase
389 DeviceCudaMemoryAllocator()
415void initializeCudaMemoryAllocators()
417 unified_memory_cuda_memory_allocator.initialize();
418 device_cuda_memory_allocator.initialize();
419 host_pinned_cuda_memory_allocator.initialize();
422void finalizeCudaMemoryAllocators(
ITraceMng* tm)
424 unified_memory_cuda_memory_allocator.finalize(tm);
425 device_cuda_memory_allocator.finalize(tm);
426 host_pinned_cuda_memory_allocator.finalize(tm);
432void arcaneCheckCudaErrors(
const TraceInfo& ti, CUresult e)
434 if (e == CUDA_SUCCESS)
436 const char* error_name =
nullptr;
437 CUresult e2 = cuGetErrorName(e, &error_name);
438 if (e2 != CUDA_SUCCESS)
439 error_name =
"Unknown";
441 const char* error_message =
nullptr;
442 CUresult e3 = cuGetErrorString(e, &error_message);
443 if (e3 != CUDA_SUCCESS)
444 error_message =
"Unknown";
446 ARCCORE_FATAL(
"CUDA Error trace={0} e={1} name={2} message={3}",
447 ti, e, error_name, error_message);
465 Int32 getNbThreadPerBlock(
const void* kernel_ptr)
467 std::scoped_lock lock(m_mutex);
468 auto x = m_nb_thread_per_block_map.find(kernel_ptr);
469 if (x != m_nb_thread_per_block_map.end())
471 int min_grid_size = 0;
472 int computed_block_size = 0;
473 int wanted_shared_memory = 0;
474 cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
475 if (r != cudaSuccess)
476 computed_block_size = 0;
478 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
480 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
482 cudaFuncAttributes func_attr;
483 cudaFuncGetAttributes(&func_attr, kernel_ptr);
484 m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
485 std::cout <<
"ComputedBlockSize=" << computed_block_size <<
" n0=" << num_block_0 <<
" n1=" << num_block_1
486 <<
" min_grid_size=" << min_grid_size <<
" nb_reg=" << func_attr.numRegs;
488#if CUDART_VERSION >= 12040
490 const char* func_name =
nullptr;
491 cudaFuncGetName(&func_name, kernel_ptr);
492 std::cout <<
" name=" << func_name <<
"\n";
495 return computed_block_size;
500 std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
507class CudaRunQueueStream
516 ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
518 int priority = bi.priority();
519 ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
522 ~CudaRunQueueStream()
override
524 ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
531#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
532 auto kname = c.kernelName();
534 nvtxRangePush(c.traceInfo().name());
536 nvtxRangePush(kname.localstr());
538 return m_runtime->notifyBeginLaunchKernel();
542#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
545 return m_runtime->notifyEndLaunchKernel();
549 ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
550 if (global_cupti_flush > 0)
551 global_cupti_info.flush();
555 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
559 auto source_bytes = args.source().
bytes();
560 auto r = cudaMemcpyAsync(args.destination().
data(), source_bytes.data(),
561 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
562 ARCCORE_CHECK_CUDA(r);
568 auto src = args.source().
bytes();
572 int device = cudaCpuDeviceId;
577 auto mem_location = _getMemoryLocation(device);
578#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
579 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
581 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
583 ARCCORE_CHECK_CUDA(r);
594 cudaStream_t trueStream()
const
596 return m_cuda_stream;
602 cudaStream_t m_cuda_stream =
nullptr;
608class CudaRunQueueEvent
613 explicit CudaRunQueueEvent(
bool has_timer)
616 ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
618 ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
620 ~CudaRunQueueEvent()
override
622 ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
631 ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
636 ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
642 ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
645 Int64 elapsedTime(IRunQueueEventImpl* start_event)
final
649 auto* true_start_event =
static_cast<CudaRunQueueEvent*
>(start_event);
650 float time_in_ms = 0.0;
655 ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
656 double x = time_in_ms * 1.0e6;
661 bool hasPendingWork()
final
663 cudaError_t v = cudaEventQuery(m_cuda_event);
664 if (v == cudaErrorNotReady)
666 ARCCORE_CHECK_CUDA(v);
672 cudaEvent_t m_cuda_event;
687 void notifyBeginLaunchKernel()
override
689 ++m_nb_kernel_launched;
691 std::cout <<
"BEGIN CUDA KERNEL!\n";
693 void notifyEndLaunchKernel()
override
695 ARCCORE_CHECK_CUDA(cudaGetLastError());
697 std::cout <<
"END CUDA KERNEL!\n";
699 void barrier()
override
701 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
721 auto v = buffer.
bytes();
722 const void* ptr = v.
data();
723 size_t count = v.size();
724 int device = device_id.
asInt32();
725 cudaMemoryAdvise cuda_advise;
728 cuda_advise = cudaMemAdviseSetReadMostly;
730 cuda_advise = cudaMemAdviseSetPreferredLocation;
732 cuda_advise = cudaMemAdviseSetAccessedBy;
734 cuda_advise = cudaMemAdviseSetPreferredLocation;
735 device = cudaCpuDeviceId;
738 cuda_advise = cudaMemAdviseSetAccessedBy;
739 device = cudaCpuDeviceId;
744 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
748 auto v = buffer.
bytes();
749 const void* ptr = v.
data();
750 size_t count = v.size();
751 int device = device_id.
asInt32();
752 cudaMemoryAdvise cuda_advise;
755 cuda_advise = cudaMemAdviseUnsetReadMostly;
757 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
759 cuda_advise = cudaMemAdviseUnsetAccessedBy;
761 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
762 device = cudaCpuDeviceId;
765 cuda_advise = cudaMemAdviseUnsetAccessedBy;
766 device = cudaCpuDeviceId;
770 ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
773 void setCurrentDevice(
DeviceId device_id)
final
777 ARCCORE_FATAL(
"Device {0} is not an accelerator device",
id);
778 ARCCORE_CHECK_CUDA(cudaSetDevice(
id));
781 const IDeviceInfoList* deviceInfoList()
final {
return &m_device_info_list; }
783 void startProfiling()
override
785 global_cupti_info.start();
788 void stopProfiling()
override
790 global_cupti_info.stop();
793 bool isProfilingActive()
override
795 return global_cupti_info.isActive();
798 void getPointerAttribute(
PointerAttribute& attribute,
const void* ptr)
override
800 cudaPointerAttributes ca;
801 ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
805 _fillPointerAttribute(attribute, mem_type, ca.device,
806 ptr, ca.devicePointer, ca.hostPointer);
812 int wanted_d = device_id.
asInt32();
813 ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
815 ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
817 size_t total_mem = 0;
818 ARCCORE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
820 ARCCORE_CHECK_CUDA(cudaSetDevice(d));
822 dmi.setFreeMemory(free_mem);
823 dmi.setTotalMemory(total_mem);
827 void pushProfilerRange(
const String& name,
Int32 color_rgb)
override
829#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
830 if (color_rgb >= 0) {
833 nvtxEventAttributes_t eventAttrib;
834 std::memset(&eventAttrib, 0,
sizeof(nvtxEventAttributes_t));
835 eventAttrib.version = NVTX_VERSION;
836 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
837 eventAttrib.colorType = NVTX_COLOR_ARGB;
838 eventAttrib.color = color_rgb;
839 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
840 eventAttrib.message.ascii = name.
localstr();
841 nvtxRangePushEx(&eventAttrib);
847 void popProfilerRange()
override
849#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
856 finalizeCudaMemoryAllocators(tm);
860 const void* kernel_ptr,
861 Int64 total_loop_size)
override
869 int nb_block_per_sm = 0;
870 ARCCORE_CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));
872 int max_block =
static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
873 max_block = std::max(max_block, 1);
874 if (nb_block > max_block) {
877 return modified_args;
882 if (!m_use_computed_occupancy)
884 if (shared_memory < 0)
887 if (shared_memory != 0)
889 Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
890 if (computed_block_size == 0)
896 Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
897 int blocks_per_grid = CheckedConvert::toInt32(big_b);
900 return modified_args;
905 void fillDevices(
bool is_verbose);
909 m_use_computed_occupancy = v.value();
912 x = std::clamp(x, 10, 100);
913 m_cooperative_ratio = x / 100.0;
919 Int64 m_nb_kernel_launched = 0;
920 bool m_is_verbose =
false;
921 bool m_use_computed_occupancy =
false;
922 Int32 m_multi_processor_count = 0;
923 double m_cooperative_ratio = 1.0;
931void CudaRunnerRuntime::
932fillDevices(
bool is_verbose)
935 ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
936 std::ostream& omain = std::cout;
938 omain <<
"ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device <<
"\n";
939 for (
int i = 0; i < nb_device; ++i) {
941 cudaGetDeviceProperties(&dp, i);
942 int runtime_version = 0;
943 cudaRuntimeGetVersion(&runtime_version);
944 int driver_version = 0;
945 cudaDriverGetVersion(&driver_version);
946 std::ostringstream ostr;
947 std::ostream& o = ostr;
948 o <<
"Device " << i <<
" name=" << dp.name <<
"\n";
949 o <<
" Driver version = " << (driver_version / 1000) <<
"." << (driver_version % 1000) <<
"\n";
950 o <<
" Runtime version = " << (runtime_version / 1000) <<
"." << (runtime_version % 1000) <<
"\n";
951 o <<
" computeCapability = " << dp.major <<
"." << dp.minor <<
"\n";
952 o <<
" totalGlobalMem = " << dp.totalGlobalMem <<
"\n";
953 o <<
" sharedMemPerBlock = " << dp.sharedMemPerBlock <<
"\n";
954 o <<
" sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor <<
"\n";
955 o <<
" sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin <<
"\n";
956 o <<
" regsPerBlock = " << dp.regsPerBlock <<
"\n";
957 o <<
" warpSize = " << dp.warpSize <<
"\n";
958 o <<
" memPitch = " << dp.memPitch <<
"\n";
959 o <<
" maxThreadsPerBlock = " << dp.maxThreadsPerBlock <<
"\n";
960 o <<
" maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor <<
"\n";
961 o <<
" maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor <<
"\n";
962 o <<
" totalConstMem = " << dp.totalConstMem <<
"\n";
963 o <<
" cooperativeLaunch = " << dp.cooperativeLaunch <<
"\n";
964 o <<
" multiProcessorCount = " << dp.multiProcessorCount <<
"\n";
965 o <<
" integrated = " << dp.integrated <<
"\n";
966 o <<
" canMapHostMemory = " << dp.canMapHostMemory <<
"\n";
967 o <<
" directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost <<
"\n";
968 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
969 o <<
" pageableMemoryAccess = " << dp.pageableMemoryAccess <<
"\n";
970 o <<
" concurrentManagedAccess = " << dp.concurrentManagedAccess <<
"\n";
971 o <<
" pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables <<
"\n";
972 o <<
" hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported <<
"\n";
973 o <<
" maxThreadsDim = " << dp.maxThreadsDim[0] <<
" " << dp.maxThreadsDim[1]
974 <<
" " << dp.maxThreadsDim[2] <<
"\n";
975 o <<
" maxGridSize = " << dp.maxGridSize[0] <<
" " << dp.maxGridSize[1]
976 <<
" " << dp.maxGridSize[2] <<
"\n";
977 o <<
" pciInfo = " << dp.pciDomainID <<
" " << dp.pciBusID <<
" " << dp.pciDeviceID <<
"\n";
978 o <<
" memoryBusWitdh = " << dp.memoryBusWidth <<
" bits\n";
981 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, i));
982 o <<
" clockRate = " << (clock_rate / 1000) <<
" MHz\n";
984 int memory_clock_rate = 0;
985 ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&memory_clock_rate, cudaDevAttrMemoryClockRate, i));
986 o <<
" memoryClockRate = " << (memory_clock_rate / 1000) <<
" MHz\n";
988 Real memory_bandwith = ((dp.memoryBusWidth * memory_clock_rate * 2.0) / 8.0) / 1.0e6;
989 o <<
" MemoryBandwith = " << memory_bandwith <<
" GB/s\n";
991#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)
992 o <<
" deviceOverlap = " << dp.deviceOverlap <<
"\n";
993 o <<
" computeMode = " << dp.computeMode <<
"\n";
994 o <<
" kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled <<
"\n";
1000 m_multi_processor_count = dp.multiProcessorCount;
1004 int greatest_val = 0;
1005 ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
1006 o <<
" leastPriority = " << least_val <<
" greatestPriority = " << greatest_val <<
"\n";
1008 std::ostringstream device_uuid_ostr;
1011 ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));
1013 ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
1014 o <<
" deviceUuid=";
1015 Impl::printUUID(device_uuid_ostr, device_uuid.bytes);
1016 o << device_uuid_ostr.str();
1019 String description(ostr.str());
1021 omain << description;
1024 device_info.setDescription(description);
1025 device_info.setDeviceId(
DeviceId(i));
1026 device_info.setName(dp.name);
1027 device_info.setWarpSize(dp.warpSize);
1028 device_info.setUUIDAsString(device_uuid_ostr.str());
1029 device_info.setSharedMemoryPerBlock(
static_cast<Int32>(dp.sharedMemPerBlock));
1030 device_info.setSharedMemoryPerMultiprocessor(
static_cast<Int32>(dp.sharedMemPerMultiprocessor));
1031 device_info.setSharedMemoryPerBlockOptin(
static_cast<Int32>(dp.sharedMemPerBlockOptin));
1032 device_info.setTotalConstMemory(
static_cast<Int32>(dp.totalConstMem));
1033 device_info.setPCIDomainID(dp.pciDomainID);
1034 device_info.setPCIBusID(dp.pciBusID);
1035 device_info.setPCIDeviceID(dp.pciDeviceID);
1036 m_device_info_list.addDevice(device_info);
1039 Int32 global_cupti_level = 0;
1043 global_cupti_level = v.value();
1045 global_cupti_flush = v.value();
1046 bool do_print_cupti =
true;
1048 do_print_cupti = (v.value() != 0);
1050 if (global_cupti_level > 0) {
1051#ifndef ARCCORE_HAS_CUDA_CUPTI
1052 ARCCORE_FATAL(
"Trying to enable CUPTI but Arcane is not compiled with cupti support");
1054 global_cupti_info.init(global_cupti_level, do_print_cupti);
1055 global_cupti_info.start();
1076 ARCCORE_CHECK_CUDA(cudaMemcpy(to.
data(), from.
data(), from.
bytes().
size(), cudaMemcpyDefault));
1107extern "C" ARCCORE_EXPORT
void
1108arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
1110 using namespace Arcane::Accelerator::Cuda;
1111 global_cuda_runtime.build();
1112 Accelerator::Impl::setUsingCUDARuntime(
true);
1113 Accelerator::Impl::setCUDARunQueueRuntime(&global_cuda_runtime);
1114 initializeCudaMemoryAllocators();
1119 _setAllocator(&unified_memory_cuda_memory_allocator);
1120 _setAllocator(&host_pinned_cuda_memory_allocator);
1121 _setAllocator(&device_cuda_memory_allocator);
1122 mrm->
setCopier(&global_cuda_memory_copier);
1123 global_cuda_runtime.fillDevices(init_info.isVerbose());
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
Base class of a specific allocator for accelerator.
eMemoryResource memoryResource() const final
Memory resource provided by the allocator.
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialization for Device memory.
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialization for HostPinned memory.
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialization for UVM memory.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copies the data from from to to with the queue queue.
void barrier() override
Blocks until all actions associated with this queue are finished.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification before command launch.
bool _barrierNoException() override
Barrier without exception. Returns true in case of error.
Impl::NativeStream nativeStream() override
Pointer to the internal structure dependent on the implementation.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Performs a prefetch of a memory region.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification of command launch completion.
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory regions.
Singleton class to manage CUPTI.
Map containing the ideal occupancy for a given kernel.
void * allocateMemory(Int64 size) final
Allocates a block for size bytes.
void freeMemory(void *ptr, Int64 size) final
Frees the block located at address address containing size bytes.
bool m_use_hint_as_mainly_device
Allocator for unified memory.
void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
Notifies of a change in instance-specific arguments.
Identifier of a system component.
bool isHost() const
Indicates if the instance is associated with the host.
Int32 asInt32() const
Numerical value of the device.
bool isAccelerator() const
Indicates if the instance is associated with an accelerator.
Information about an accelerator.
Accelerator memory information.
Interface of a list of devices.
Interface for a list of devices.
Interface for event implementation.
Interface of an execution stream for a RunQueue.
Interface of the runtime associated with an accelerator.
Arguments for launching a kernel.
bool isCooperative() const
Indicates if running in cooperative mode (i.e. cudaLaunchCooperativeKernel).
Int32 nbBlockPerGrid() const
Number of grid blocks.
void setNbThreadPerBlock(Int32 v)
Number of threads per block.
void setNbBlockPerGrid(Int32 v)
Number of grid blocks.
Int32 nbThreadPerBlock() const
Number of threads per block.
Int32 sharedMemorySize() const
Shared memory to allocate for the kernel.
Opaque type to encapsulate a native 'stream'.
Implementation of a command for accelerator.
Memory prefetching arguments.
Information about a memory address.
Information to create a RunQueue.
bool isDefault() const
Indicates if the instance only has default values.
Execution queue for an accelerator.
bool isAsync() const
Indicates if the execution queue is asynchronous.
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory regions.
Information about an allocated memory region.
Constant view on a contiguous memory region containing fixed-size elements.
constexpr SpanType bytes() const
View in byte form.
constexpr const std::byte * data() const
Pointer to the memory region.
Template class for converting a type.
Interface for memory copies with accelerator support.
Internal part of Arcane's 'IMemoryResourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for resource r.
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Sets the memory pool for resource r.
virtual void setIsAccelerator(bool v)=0
Indicates if an accelerator is available.
virtual void setCopier(IMemoryCopier *copier)=0
Sets the copying instance.
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.
Class containing information to specialize allocations.
Mutable view on a contiguous memory region containing fixed-size elements.
constexpr std::byte * data() const
Pointer to the memory region.
constexpr SpanType bytes() const
View in byte form.
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Unicode character string.
const char * localstr() const
Returns the conversion of the instance into UTF-8 encoding.
eMemoryAdvice
Memory management advice.
@ AccessedByHost
Indicates that the memory region is accessed by the host.
@ PreferredLocationDevice
Prefers memory placement on the accelerator.
@ MostlyRead
Indicates that the memory region is primarily read-only.
@ PreferredLocationHost
Prefers memory placement on the host.
@ AccessedByDevice
Indicates that the memory region is accessed by the device.
ePointerMemoryType
Memory type for a pointer.
eExecutionPolicy
Execution policy for a Runner.
@ CUDA
Execution policy using the CUDA environment.
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the specific allocator for accelerators.
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
eMemoryLocationHint
Indices for expected memory location.
@ MainlyHost
Indicates that the data will primarily be used on the CPU.
@ HostAndDeviceMostlyRead
Indicates that the data will be used both on the accelerator and on the CPU and will not be frequentl...
@ MainlyDevice
Indicates that the data will primarily be used on the accelerator.
double Real
Type representing a real number.
eMemoryResource
List of available memory resources.
@ HostPinned
Allocates on the host.
@ UnifiedMemory
Allocates using unified memory.
@ Device
Allocates on the device.
std::int32_t Int32
Signed integer type of 32 bits.