14#include "arcane/utils/Profiling.h"
15#include "arcane/utils/FixedArray.h"
16#include "arcane/utils/internal/ProfilingInternal.h"
18#include "arcane/accelerator/cuda/CudaAccelerator.h"
20#include "arcane/accelerator/core/internal/MemoryTracer.h"
21#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"
28namespace Arcane::Accelerator::Cuda
33 bool global_do_print =
true;
39void arcaneCheckCudaErrors(
const TraceInfo& ti, CUptiResult e)
41 if (e == CUPTI_SUCCESS)
44 const char* error_message =
nullptr;
45 CUptiResult e3 = cuptiGetResultString(e, &error_message);
46 if (e3 != CUPTI_SUCCESS)
47 error_message =
"Unknown";
50 ti, e, error_message);
57getStallReasonString(CUpti_ActivityPCSamplingStallReason reason)
60 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID:
62 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE:
64 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH:
65 return "Instruction fetch";
66 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY:
67 return "Execution dependency";
68 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY:
69 return "Memory dependency";
70 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE:
72 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC:
74 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY:
75 return "Constant memory dependency";
76 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY:
78 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE:
79 return "Memory throttle";
80 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED:
81 return "Not selected";
82 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER:
84 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING:
97getUvmCounterKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)
100 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
101 return "BYTES_TRANSFER_HTOD";
102 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
103 return "BYTES_TRANSFER_DTOH";
104 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
105 return "CPU_PAGE_FAULT";
106 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
107 return "GPU_PAGE_FAULT";
117static uint64_t startTimestamp = 0;
120printActivity(AcceleratorStatInfoList* stat_info,
121 CUpti_Activity* record,
bool do_print, std::ostream& ostr)
123 switch (record->kind) {
124 case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER: {
125 auto* uvm =
reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2*
>(record);
126 Int64 nb_byte = uvm->value;
128 void* address =
reinterpret_cast<void*
>(uvm->address);
129 std::pair<String, String> mem_info = impl::MemoryTracer::findMemory(address);
130 ostr <<
"UNIFIED_MEMORY_COUNTER [ " << (uvm->start - startTimestamp) <<
" " << (uvm->end - startTimestamp) <<
" ]"
131 <<
" address=" << address
132 <<
" kind=" << getUvmCounterKindString(uvm->counterKind)
133 <<
" value=" << nb_byte
134 <<
" flags=" << uvm->flags
135 <<
" source=" << uvm->srcId <<
" destination=" << uvm->dstId
136 <<
" name=" << mem_info.first
137 <<
" stack=" << mem_info.second
141 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD)
142 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::HostToDevice, nb_byte);
143 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH)
144 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::DeviceToHost, nb_byte);
145 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT)
147 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Cpu, 1);
148 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT) {
149 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Gpu, nb_byte);
154 case CUPTI_ACTIVITY_KIND_KERNEL:
155 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
156 const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ?
"KERNEL" :
"CONC KERNEL";
159 auto* kernel =
reinterpret_cast<CUpti_ActivityKernel5*
>(record);
161 ostr << kindString <<
" [ " << (kernel->start - startTimestamp) <<
" - " << (kernel->end - startTimestamp)
162 <<
" - " << (kernel->end - kernel->start) <<
" ]"
163 <<
" device=" << kernel->deviceId <<
" context=" << kernel->contextId
164 <<
" stream=" << kernel->streamId <<
" correlation=" << kernel->correlationId;
165 ostr <<
" grid=[" << kernel->gridX <<
"," << kernel->gridY <<
"," << kernel->gridZ <<
"]"
166 <<
" block=[" << kernel->blockX <<
"," << kernel->blockY <<
"," << kernel->blockZ <<
"]"
167 <<
" shared memory (static=" << kernel->staticSharedMemory <<
" dynamic=" << kernel->dynamicSharedMemory <<
")"
168 <<
" registers=" << kernel->registersPerThread
169 <<
" name=" <<
'"' << kernel->name <<
'"'
174 case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: {
175 auto* source_locator =
reinterpret_cast<CUpti_ActivitySourceLocator*
>(record);
177 ostr <<
"Source Locator Id " << source_locator->id
178 <<
" File " << source_locator->fileName
179 <<
" Line " << source_locator->lineNumber
184 case CUPTI_ACTIVITY_KIND_PC_SAMPLING: {
185 auto* ps_record =
reinterpret_cast<CUpti_ActivityPCSampling3*
>(record);
188 ostr <<
"source " << ps_record->sourceLocatorId <<
" functionId " << ps_record->functionId
189 <<
" pc " << ps_record->pcOffset <<
" correlation " << ps_record->correlationId
190 <<
" samples " << ps_record->samples
191 <<
" latency samples " << ps_record->latencySamples
192 <<
" stallreason " << getStallReasonString(ps_record->stallReason)
197 case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: {
198 auto* pcsri_result =
reinterpret_cast<CUpti_ActivityPCSamplingRecordInfo*
>(record);
201 ostr <<
"correlation " << pcsri_result->correlationId
202 <<
" totalSamples " << pcsri_result->totalSamples
203 <<
" droppedSamples " << pcsri_result->droppedSamples
204 <<
" samplingPeriodInCycles " << pcsri_result->samplingPeriodInCycles
209 case CUPTI_ACTIVITY_KIND_FUNCTION: {
210 auto* func_result =
reinterpret_cast<CUpti_ActivityFunction*
>(record);
213 ostr <<
"id " << func_result->id <<
" ctx " << func_result->contextId
214 <<
" moduleId " << func_result->moduleId
215 <<
" functionIndex " << func_result->functionIndex
216 <<
" name " << func_result->name
224 ostr <<
" <unknown>\n";
234arcaneCuptiBufferRequested(uint8_t** buffer,
size_t* size,
size_t* maxNumRecords)
236 const Int32 BUF_SIZE = 16 * 4096;
241 *buffer =
new (std::align_val_t{ 8 }) uint8_t[BUF_SIZE];
249arcaneCuptiBufferCompleted(CUcontext ctx, uint32_t stream_id, uint8_t* buffer,
250 [[maybe_unused]]
size_t size,
size_t validSize)
256 CUpti_Activity* record =
nullptr;
259 std::ostringstream ostr;
261 status = cuptiActivityGetNextRecord(buffer, validSize, &record);
262 if (status == CUPTI_SUCCESS) {
263 printActivity(stat_info, record, global_do_print, ostr);
265 else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
269 ARCANE_CHECK_CUDA(status);
272 std::cout << ostr.str();
274 size_t nb_dropped = 0;
275 ARCANE_CHECK_CUDA(cuptiActivityGetNumDroppedRecords(ctx, stream_id, &nb_dropped));
277 std::cout <<
"WARNING: Dropped " << nb_dropped <<
" activity records\n";
291 global_do_print = m_do_print;
294 cudaGetDevice(&device_id);
295 int level = m_profiling_level;
297 ARCANE_CHECK_CUDA(cuptiActivityRegisterCallbacks(arcaneCuptiBufferRequested, arcaneCuptiBufferCompleted));
299 config[0].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
300 config[0].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
301 config[0].deviceId = device_id;
302 config[0].enable = 1;
304 config[1].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
305 config[1].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
306 config[1].deviceId = device_id;
307 config[1].enable = 1;
309 config[2].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
310 config[2].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT;
311 config[2].deviceId = device_id;
312 config[2].enable = 1;
314 config[3].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
315 config[3].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT;
316 config[3].deviceId = device_id;
317 config[3].enable = 1;
319 ARCANE_CHECK_CUDA(cuptiActivityConfigureUnifiedMemoryCounter(config.data(), config.size()));
324 configPC.size =
sizeof(CUpti_ActivityPCSamplingConfig);
325 configPC.samplingPeriod = CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN;
326 configPC.samplingPeriod2 = 0;
328 cuCtxGetCurrent(&cuCtx);
329 ARCANE_CHECK_CUDA(cuptiActivityConfigurePCSampling(cuCtx, &configPC));
336 ARCANE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
338 ARCANE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
340 ARCANE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
342 ARCANE_CHECK_CUDA(cuptiGetTimestamp(&startTimestamp));
357 int level = m_profiling_level;
360 ARCANE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
362 ARCANE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
364 ARCANE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
366 ARCANE_CHECK_CUDA(cuptiActivityFlushAll(0));
367 ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
382 ARCANE_CHECK_CUDA(cuptiActivityFlushAll(0));
389initCupti(
bool do_print)
391 global_do_print = do_print;
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Lecteur des fichiers de maillage via la bibliothèque LIMA.
static impl::AcceleratorStatInfoList * _threadLocalAcceleratorInstance()
Statistiques pour les accélérateurs.