14#include "arccore/base/Profiling.h"
15#include "arccore/base/FixedArray.h"
16#include "arccore/base/FatalErrorException.h"
17#include "arccore/base/internal/ProfilingInternal.h"
19#include "arccore/accelerator_native/CudaAccelerator.h"
21#include "arccore/common/accelerator/internal/MemoryTracer.h"
22#include "arccore/accelerator_native/runtime/Cupti.h"
29namespace Arcane::Accelerator::Cuda
31using Arcane::Impl::AcceleratorStatInfoList;
34 bool global_do_print =
true;
40void arcaneCheckCudaErrors(
const TraceInfo& ti, CUptiResult e)
42 if (e == CUPTI_SUCCESS)
45 const char* error_message =
nullptr;
46 CUptiResult e3 = cuptiGetResultString(e, &error_message);
47 if (e3 != CUPTI_SUCCESS)
48 error_message =
"Unknown";
51 ti, e, error_message);
58getStallReasonString(CUpti_ActivityPCSamplingStallReason reason)
61 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID:
63 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE:
65 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH:
66 return "Instruction fetch";
67 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY:
68 return "Execution dependency";
69 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY:
70 return "Memory dependency";
71 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE:
73 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC:
75 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY:
76 return "Constant memory dependency";
77 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY:
79 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE:
80 return "Memory throttle";
81 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED:
82 return "Not selected";
83 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER:
85 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING:
98getUvmCounterKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)
101 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
102 return "BYTES_TRANSFER_HTOD";
103 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
104 return "BYTES_TRANSFER_DTOH";
105 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
106 return "CPU_PAGE_FAULT";
107 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
108 return "GPU_PAGE_FAULT";
118static uint64_t startTimestamp = 0;
122 CUpti_Activity* record,
bool do_print, std::ostream& ostr)
124 switch (record->kind) {
125 case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER: {
126 auto* uvm =
reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2*
>(record);
127 Int64 nb_byte = uvm->value;
129 void* address =
reinterpret_cast<void*
>(uvm->address);
130 std::pair<String, String> mem_info = impl::MemoryTracer::findMemory(address);
131 ostr <<
"UNIFIED_MEMORY_COUNTER [ " << (uvm->start - startTimestamp) <<
" " << (uvm->end - startTimestamp) <<
" ]"
132 <<
" address=" << address
133 <<
" kind=" << getUvmCounterKindString(uvm->counterKind)
134 <<
" value=" << nb_byte
135 <<
" flags=" << uvm->flags
136 <<
" source=" << uvm->srcId <<
" destination=" << uvm->dstId
137 <<
" name=" << mem_info.first
138 <<
" stack=" << mem_info.second
142 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD)
143 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::HostToDevice, nb_byte);
144 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH)
145 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::DeviceToHost, nb_byte);
146 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT)
148 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Cpu, 1);
149 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT) {
150 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Gpu, nb_byte);
155 case CUPTI_ACTIVITY_KIND_KERNEL:
156 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
157 const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ?
"KERNEL" :
"CONC KERNEL";
160 auto* kernel =
reinterpret_cast<CUpti_ActivityKernel5*
>(record);
162 ostr << kindString <<
" [ " << (kernel->start - startTimestamp) <<
" - " << (kernel->end - startTimestamp)
163 <<
" - " << (kernel->end - kernel->start) <<
" ]"
164 <<
" device=" << kernel->deviceId <<
" context=" << kernel->contextId
165 <<
" stream=" << kernel->streamId <<
" correlation=" << kernel->correlationId;
166 ostr <<
" grid=[" << kernel->gridX <<
"," << kernel->gridY <<
"," << kernel->gridZ <<
"]"
167 <<
" block=[" << kernel->blockX <<
"," << kernel->blockY <<
"," << kernel->blockZ <<
"]"
168 <<
" shared memory (static=" << kernel->staticSharedMemory <<
" dynamic=" << kernel->dynamicSharedMemory <<
")"
169 <<
" registers=" << kernel->registersPerThread
170 <<
" name=" <<
'"' << kernel->name <<
'"'
175 case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: {
176 auto* source_locator =
reinterpret_cast<CUpti_ActivitySourceLocator*
>(record);
178 ostr <<
"Source Locator Id " << source_locator->id
179 <<
" File " << source_locator->fileName
180 <<
" Line " << source_locator->lineNumber
185 case CUPTI_ACTIVITY_KIND_PC_SAMPLING: {
186 auto* ps_record =
reinterpret_cast<CUpti_ActivityPCSampling3*
>(record);
189 ostr <<
"source " << ps_record->sourceLocatorId <<
" functionId " << ps_record->functionId
190 <<
" pc " << ps_record->pcOffset <<
" correlation " << ps_record->correlationId
191 <<
" samples " << ps_record->samples
192 <<
" latency samples " << ps_record->latencySamples
193 <<
" stallreason " << getStallReasonString(ps_record->stallReason)
198 case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: {
199 auto* pcsri_result =
reinterpret_cast<CUpti_ActivityPCSamplingRecordInfo*
>(record);
202 ostr <<
"correlation " << pcsri_result->correlationId
203 <<
" totalSamples " << pcsri_result->totalSamples
204 <<
" droppedSamples " << pcsri_result->droppedSamples
205 <<
" samplingPeriodInCycles " << pcsri_result->samplingPeriodInCycles
210 case CUPTI_ACTIVITY_KIND_FUNCTION: {
211 auto* func_result =
reinterpret_cast<CUpti_ActivityFunction*
>(record);
214 ostr <<
"id " << func_result->id <<
" ctx " << func_result->contextId
215 <<
" moduleId " << func_result->moduleId
216 <<
" functionIndex " << func_result->functionIndex
217 <<
" name " << func_result->name
225 ostr <<
" <unknown>\n";
235arcaneCuptiBufferRequested(uint8_t** buffer,
size_t* size,
size_t* maxNumRecords)
237 const Int32 BUF_SIZE = 16 * 4096;
242 *buffer =
new (std::align_val_t{ 8 }) uint8_t[BUF_SIZE];
250arcaneCuptiBufferCompleted(CUcontext ctx, uint32_t stream_id, uint8_t* buffer,
251 [[maybe_unused]]
size_t size,
size_t validSize)
257 CUpti_Activity* record =
nullptr;
260 std::ostringstream ostr;
262 status = cuptiActivityGetNextRecord(buffer, validSize, &record);
263 if (status == CUPTI_SUCCESS) {
264 printActivity(stat_info, record, global_do_print, ostr);
266 else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
270 ARCCORE_CHECK_CUDA(status);
273 std::cout << ostr.str();
275 size_t nb_dropped = 0;
276 ARCCORE_CHECK_CUDA(cuptiActivityGetNumDroppedRecords(ctx, stream_id, &nb_dropped));
278 std::cout <<
"WARNING: Dropped " << nb_dropped <<
" activity records\n";
292 global_do_print = m_do_print;
295 cudaGetDevice(&device_id);
296 int level = m_profiling_level;
298 ARCCORE_CHECK_CUDA(cuptiActivityRegisterCallbacks(arcaneCuptiBufferRequested, arcaneCuptiBufferCompleted));
300 config[0].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
301 config[0].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
302 config[0].deviceId = device_id;
303 config[0].enable = 1;
305 config[1].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
306 config[1].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
307 config[1].deviceId = device_id;
308 config[1].enable = 1;
310 config[2].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
311 config[2].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT;
312 config[2].deviceId = device_id;
313 config[2].enable = 1;
315 config[3].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
316 config[3].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT;
317 config[3].deviceId = device_id;
318 config[3].enable = 1;
320 ARCCORE_CHECK_CUDA(cuptiActivityConfigureUnifiedMemoryCounter(config.data(), config.size()));
325 configPC.size =
sizeof(CUpti_ActivityPCSamplingConfig);
326 configPC.samplingPeriod = CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN;
327 configPC.samplingPeriod2 = 0;
329 cuCtxGetCurrent(&cuCtx);
330 ARCCORE_CHECK_CUDA(cuptiActivityConfigurePCSampling(cuCtx, &configPC));
337 ARCCORE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
339 ARCCORE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
341 ARCCORE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
343 ARCCORE_CHECK_CUDA(cuptiGetTimestamp(&startTimestamp));
358 int level = m_profiling_level;
361 ARCCORE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
363 ARCCORE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
365 ARCCORE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
367 ARCCORE_CHECK_CUDA(cuptiActivityFlushAll(0));
368 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
383 ARCCORE_CHECK_CUDA(cuptiActivityFlushAll(0));
390initCupti(
bool do_print)
392 global_do_print = do_print;
#define ARCCORE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Statistiques pour les accélérateurs.
static Impl::AcceleratorStatInfoList * _threadLocalAcceleratorInstance()
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.