Arcane  v4.1.1.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
Cupti.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Cupti.cc (C) 2000-2025 */
9/* */
10/* Intégration de CUPTI. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/base/Profiling.h"
15#include "arccore/base/FixedArray.h"
16#include "arccore/base/FatalErrorException.h"
17#include "arccore/base/internal/ProfilingInternal.h"
18
19#include "arccore/accelerator_native/CudaAccelerator.h"
20
21#include "arccore/common/accelerator/internal/MemoryTracer.h"
22#include "arccore/accelerator_native/runtime/Cupti.h"
23
24#include <iostream>
25
26/*---------------------------------------------------------------------------*/
27/*---------------------------------------------------------------------------*/
28
29namespace Arcane::Accelerator::Cuda
30{
31using Arcane::Impl::AcceleratorStatInfoList;
32namespace
33{
34 bool global_do_print = true;
35}
36
37/*---------------------------------------------------------------------------*/
38/*---------------------------------------------------------------------------*/
39
40void arcaneCheckCudaErrors(const TraceInfo& ti, CUptiResult e)
41{
42 if (e == CUPTI_SUCCESS)
43 return;
44
45 const char* error_message = nullptr;
46 CUptiResult e3 = cuptiGetResultString(e, &error_message);
47 if (e3 != CUPTI_SUCCESS)
48 error_message = "Unknown";
49
50 ARCCORE_FATAL("CUpti Error trace={0} e={1} message={2}",
51 ti, e, error_message);
52}
53
54/*---------------------------------------------------------------------------*/
55/*---------------------------------------------------------------------------*/
56
57static const char*
58getStallReasonString(CUpti_ActivityPCSamplingStallReason reason)
59{
60 switch (reason) {
61 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID:
62 return "Invalid";
63 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE:
64 return "Selected";
65 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH:
66 return "Instruction fetch";
67 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY:
68 return "Execution dependency";
69 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY:
70 return "Memory dependency";
71 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE:
72 return "Texture";
73 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC:
74 return "Sync";
75 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY:
76 return "Constant memory dependency";
77 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY:
78 return "Pipe busy";
79 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE:
80 return "Memory throttle";
81 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED:
82 return "Not selected";
83 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER:
84 return "Other";
85 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING:
86 return "Sleeping";
87 default:
88 break;
89 }
90
91 return "<unknown>";
92}
93
94/*---------------------------------------------------------------------------*/
95/*---------------------------------------------------------------------------*/
96
97static const char*
98getUvmCounterKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)
99{
100 switch (kind) {
101 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
102 return "BYTES_TRANSFER_HTOD";
103 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
104 return "BYTES_TRANSFER_DTOH";
105 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
106 return "CPU_PAGE_FAULT";
107 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
108 return "GPU_PAGE_FAULT";
109 default:
110 break;
111 }
112 return "<unknown>";
113}
114
115/*---------------------------------------------------------------------------*/
116/*---------------------------------------------------------------------------*/
117
118static uint64_t startTimestamp = 0;
119
120static void
121printActivity(AcceleratorStatInfoList* stat_info,
122 CUpti_Activity* record, bool do_print, std::ostream& ostr)
123{
124 switch (record->kind) {
125 case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER: {
126 auto* uvm = reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2*>(record);
127 Int64 nb_byte = uvm->value;
128 if (do_print) {
129 void* address = reinterpret_cast<void*>(uvm->address);
130 std::pair<String, String> mem_info = impl::MemoryTracer::findMemory(address);
131 ostr << "UNIFIED_MEMORY_COUNTER [ " << (uvm->start - startTimestamp) << " " << (uvm->end - startTimestamp) << " ]"
132 << " address=" << address
133 << " kind=" << getUvmCounterKindString(uvm->counterKind)
134 << " value=" << nb_byte
135 << " flags=" << uvm->flags
136 << " source=" << uvm->srcId << " destination=" << uvm->dstId
137 << " name=" << mem_info.first
138 << " stack=" << mem_info.second
139 << "\n";
140 }
141 if (stat_info) {
142 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD)
143 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::HostToDevice, nb_byte);
144 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH)
145 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::DeviceToHost, nb_byte);
146 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT)
147 // TODO: regarder à quoi correspond uvw->value pour cet évènement
148 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Cpu, 1);
149 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT) {
150 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Gpu, nb_byte);
151 }
152 }
153 break;
154 }
155 case CUPTI_ACTIVITY_KIND_KERNEL:
156 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
157 const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ? "KERNEL" : "CONC KERNEL";
158 // NOTE: 'CUpti_ActivityKernel5' est disponible à partir de CUDA 11.0 mais obsolète à partir de CUDA 11.2
159 // à partir de Cuda 12 on pourra utiliser 'CUpti_ActivityKernel9'.
160 auto* kernel = reinterpret_cast<CUpti_ActivityKernel5*>(record);
161 if (do_print) {
162 ostr << kindString << " [ " << (kernel->start - startTimestamp) << " - " << (kernel->end - startTimestamp)
163 << " - " << (kernel->end - kernel->start) << " ]"
164 << " device=" << kernel->deviceId << " context=" << kernel->contextId
165 << " stream=" << kernel->streamId << " correlation=" << kernel->correlationId;
166 ostr << " grid=[" << kernel->gridX << "," << kernel->gridY << "," << kernel->gridZ << "]"
167 << " block=[" << kernel->blockX << "," << kernel->blockY << "," << kernel->blockZ << "]"
168 << " shared memory (static=" << kernel->staticSharedMemory << " dynamic=" << kernel->dynamicSharedMemory << ")"
169 << " registers=" << kernel->registersPerThread
170 << " name=" << '"' << kernel->name << '"'
171 << "\n";
172 }
173 break;
174 }
175 case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: {
176 auto* source_locator = reinterpret_cast<CUpti_ActivitySourceLocator*>(record);
177 if (do_print) {
178 ostr << "Source Locator Id " << source_locator->id
179 << " File " << source_locator->fileName
180 << " Line " << source_locator->lineNumber
181 << "\n";
182 }
183 break;
184 }
185 case CUPTI_ACTIVITY_KIND_PC_SAMPLING: {
186 auto* ps_record = reinterpret_cast<CUpti_ActivityPCSampling3*>(record);
187
188 if (do_print) {
189 ostr << "source " << ps_record->sourceLocatorId << " functionId " << ps_record->functionId
190 << " pc " << ps_record->pcOffset << " correlation " << ps_record->correlationId
191 << " samples " << ps_record->samples
192 << " latency samples " << ps_record->latencySamples
193 << " stallreason " << getStallReasonString(ps_record->stallReason)
194 << "\n";
195 }
196 break;
197 }
198 case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: {
199 auto* pcsri_result = reinterpret_cast<CUpti_ActivityPCSamplingRecordInfo*>(record);
200
201 if (do_print) {
202 ostr << "correlation " << pcsri_result->correlationId
203 << " totalSamples " << pcsri_result->totalSamples
204 << " droppedSamples " << pcsri_result->droppedSamples
205 << " samplingPeriodInCycles " << pcsri_result->samplingPeriodInCycles
206 << "\n";
207 }
208 break;
209 }
210 case CUPTI_ACTIVITY_KIND_FUNCTION: {
211 auto* func_result = reinterpret_cast<CUpti_ActivityFunction*>(record);
212
213 if (do_print) {
214 ostr << "id " << func_result->id << " ctx " << func_result->contextId
215 << " moduleId " << func_result->moduleId
216 << " functionIndex " << func_result->functionIndex
217 << " name " << func_result->name
218 << "\n";
219 }
220 break;
221 }
222
223 default:
224 if (do_print) {
225 ostr << " <unknown>\n";
226 }
227 break;
228 }
229}
230
231/*---------------------------------------------------------------------------*/
232/*---------------------------------------------------------------------------*/
233
234static void CUPTIAPI
235arcaneCuptiBufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords)
236{
237 const Int32 BUF_SIZE = 16 * 4096;
238
239 // TODO: utiliser un ou plusieurs buffers pré-alloués pour éviter les
240 // successions d'allocations/désallocations.
241 *size = BUF_SIZE;
242 *buffer = new (std::align_val_t{ 8 }) uint8_t[BUF_SIZE];
243 *maxNumRecords = 0;
244}
245
246/*---------------------------------------------------------------------------*/
247/*---------------------------------------------------------------------------*/
248
249static void CUPTIAPI
250arcaneCuptiBufferCompleted(CUcontext ctx, uint32_t stream_id, uint8_t* buffer,
251 [[maybe_unused]] size_t size, size_t validSize)
252{
253 // NOTE: il semble que cette méthode soit toujours appelée depuis
254 // un thread spécifique créé par le runtime CUDA.
255
256 CUptiResult status;
257 CUpti_Activity* record = nullptr;
258
260 std::ostringstream ostr;
261 do {
262 status = cuptiActivityGetNextRecord(buffer, validSize, &record);
263 if (status == CUPTI_SUCCESS) {
264 printActivity(stat_info, record, global_do_print, ostr);
265 }
266 else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
267 break;
268 }
269 else {
270 ARCCORE_CHECK_CUDA(status);
271 }
272 } while (1);
273 std::cout << ostr.str();
274 // report any records dropped from the queue
275 size_t nb_dropped = 0;
276 ARCCORE_CHECK_CUDA(cuptiActivityGetNumDroppedRecords(ctx, stream_id, &nb_dropped));
277 if (nb_dropped != 0)
278 std::cout << "WARNING: Dropped " << nb_dropped << " activity records\n";
279
280 delete[] buffer;
281}
282
283/*---------------------------------------------------------------------------*/
284/*---------------------------------------------------------------------------*/
285
286void CuptiInfo::
287start()
288{
289 if (m_is_active)
290 return;
291
292 global_do_print = m_do_print;
293
294 int device_id = 0;
295 cudaGetDevice(&device_id);
296 int level = m_profiling_level;
297
298 ARCCORE_CHECK_CUDA(cuptiActivityRegisterCallbacks(arcaneCuptiBufferRequested, arcaneCuptiBufferCompleted));
299
300 config[0].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
301 config[0].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
302 config[0].deviceId = device_id;
303 config[0].enable = 1;
304
305 config[1].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
306 config[1].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
307 config[1].deviceId = device_id;
308 config[1].enable = 1;
309
310 config[2].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
311 config[2].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT;
312 config[2].deviceId = device_id;
313 config[2].enable = 1;
314
315 config[3].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
316 config[3].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT;
317 config[3].deviceId = device_id;
318 config[3].enable = 1;
319
320 ARCCORE_CHECK_CUDA(cuptiActivityConfigureUnifiedMemoryCounter(config.data(), config.size()));
321
322 // NOTE: un seul processus peut utiliser le sampling. Si on utilise MPI avec plusieurs
323 // rangs il ne faut pas activer le sampling
324 if (level >= 3) {
325 configPC.size = sizeof(CUpti_ActivityPCSamplingConfig);
326 configPC.samplingPeriod = CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN;
327 configPC.samplingPeriod2 = 0;
328 CUcontext cuCtx;
329 cuCtxGetCurrent(&cuCtx);
330 ARCCORE_CHECK_CUDA(cuptiActivityConfigurePCSampling(cuCtx, &configPC));
331 }
332
333 // Active les compteurs
334 // CONCURRENT_KERNEL et PC_SAMPLING ne sont pas compatibles
335 // Si on ajoute des compteurs ici il faut les désactiver dans stop()
336 if (level >= 1)
337 ARCCORE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
338 if (level == 2)
339 ARCCORE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
340 if (level >= 3)
341 ARCCORE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
342
343 ARCCORE_CHECK_CUDA(cuptiGetTimestamp(&startTimestamp));
344
345 // Mettre à la fin pour qu'en cas d'exception on considère l'initialisation
346 // non effectuée.
347 m_is_active = true;
348}
349
350/*---------------------------------------------------------------------------*/
351/*---------------------------------------------------------------------------*/
352
353void CuptiInfo::
354stop()
355{
356 if (!m_is_active)
357 return;
358 int level = m_profiling_level;
359
360 if (level >= 1)
361 ARCCORE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
362 if (level == 2)
363 ARCCORE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
364 if (level >= 3)
365 ARCCORE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
366
367 ARCCORE_CHECK_CUDA(cuptiActivityFlushAll(0));
368 ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
369
370 m_is_active = false;
371}
372
373/*---------------------------------------------------------------------------*/
374/*---------------------------------------------------------------------------*/
375
376void CuptiInfo::
377flush()
378{
379 // Il ne faut pas faire de flush si CUPTI n'a pas démarré car cela provoque
380 // une erreur.
381 if (!m_is_active)
382 return;
383 ARCCORE_CHECK_CUDA(cuptiActivityFlushAll(0));
384}
385
386/*---------------------------------------------------------------------------*/
387/*---------------------------------------------------------------------------*/
388
389extern "C++" void
390initCupti(bool do_print)
391{
392 global_do_print = do_print;
393}
394
395/*---------------------------------------------------------------------------*/
396/*---------------------------------------------------------------------------*/
397
398} // namespace Arcane::Accelerator::Cuda
399
400/*---------------------------------------------------------------------------*/
401/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Statistiques pour les accélérateurs.
static Impl::AcceleratorStatInfoList * _threadLocalAcceleratorInstance()
Definition Profiling.cc:203
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.