Arcane  v3.15.0.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
Cupti.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Cupti.cc (C) 2000-2024 */
9/* */
10/* Intégration de CUPTI. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arcane/utils/Profiling.h"
15#include "arcane/utils/FixedArray.h"
16#include "arcane/utils/internal/ProfilingInternal.h"
17
18#include "arcane/accelerator/cuda/CudaAccelerator.h"
19
20#include "arcane/accelerator/core/internal/MemoryTracer.h"
21#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"
22
23#include <iostream>
24
25/*---------------------------------------------------------------------------*/
26/*---------------------------------------------------------------------------*/
27
28namespace Arcane::Accelerator::Cuda
29{
31namespace
32{
33 bool global_do_print = true;
34}
35
36/*---------------------------------------------------------------------------*/
37/*---------------------------------------------------------------------------*/
38
39void arcaneCheckCudaErrors(const TraceInfo& ti, CUptiResult e)
40{
41 if (e == CUPTI_SUCCESS)
42 return;
43
44 const char* error_message = nullptr;
45 CUptiResult e3 = cuptiGetResultString(e, &error_message);
46 if (e3 != CUPTI_SUCCESS)
47 error_message = "Unknown";
48
49 ARCANE_FATAL("CUpti Error trace={0} e={1} message={2}",
50 ti, e, error_message);
51}
52
53/*---------------------------------------------------------------------------*/
54/*---------------------------------------------------------------------------*/
55
56static const char*
57getStallReasonString(CUpti_ActivityPCSamplingStallReason reason)
58{
59 switch (reason) {
60 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INVALID:
61 return "Invalid";
62 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NONE:
63 return "Selected";
64 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_INST_FETCH:
65 return "Instruction fetch";
66 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_EXEC_DEPENDENCY:
67 return "Execution dependency";
68 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_DEPENDENCY:
69 return "Memory dependency";
70 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_TEXTURE:
71 return "Texture";
72 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SYNC:
73 return "Sync";
74 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_CONSTANT_MEMORY_DEPENDENCY:
75 return "Constant memory dependency";
76 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_PIPE_BUSY:
77 return "Pipe busy";
78 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_MEMORY_THROTTLE:
79 return "Memory throttle";
80 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_NOT_SELECTED:
81 return "Not selected";
82 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_OTHER:
83 return "Other";
84 case CUPTI_ACTIVITY_PC_SAMPLING_STALL_SLEEPING:
85 return "Sleeping";
86 default:
87 break;
88 }
89
90 return "<unknown>";
91}
92
93/*---------------------------------------------------------------------------*/
94/*---------------------------------------------------------------------------*/
95
96static const char*
97getUvmCounterKindString(CUpti_ActivityUnifiedMemoryCounterKind kind)
98{
99 switch (kind) {
100 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD:
101 return "BYTES_TRANSFER_HTOD";
102 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH:
103 return "BYTES_TRANSFER_DTOH";
104 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT:
105 return "CPU_PAGE_FAULT";
106 case CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT:
107 return "GPU_PAGE_FAULT";
108 default:
109 break;
110 }
111 return "<unknown>";
112}
113
114/*---------------------------------------------------------------------------*/
115/*---------------------------------------------------------------------------*/
116
117static uint64_t startTimestamp = 0;
118
119static void
120printActivity(AcceleratorStatInfoList* stat_info,
121 CUpti_Activity* record, bool do_print, std::ostream& ostr)
122{
123 switch (record->kind) {
124 case CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER: {
125 auto* uvm = reinterpret_cast<CUpti_ActivityUnifiedMemoryCounter2*>(record);
126 Int64 nb_byte = uvm->value;
127 if (do_print) {
128 void* address = reinterpret_cast<void*>(uvm->address);
129 std::pair<String, String> mem_info = impl::MemoryTracer::findMemory(address);
130 ostr << "UNIFIED_MEMORY_COUNTER [ " << (uvm->start - startTimestamp) << " " << (uvm->end - startTimestamp) << " ]"
131 << " address=" << address
132 << " kind=" << getUvmCounterKindString(uvm->counterKind)
133 << " value=" << nb_byte
134 << " flags=" << uvm->flags
135 << " source=" << uvm->srcId << " destination=" << uvm->dstId
136 << " name=" << mem_info.first
137 << " stack=" << mem_info.second
138 << "\n";
139 }
140 if (stat_info) {
141 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD)
142 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::HostToDevice, nb_byte);
143 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH)
144 stat_info->addMemoryTransfer(AcceleratorStatInfoList::eMemoryTransferType::DeviceToHost, nb_byte);
145 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT)
146 // TODO: regarder à quoi correspond uvw->value pour cet évènement
147 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Cpu, 1);
148 if (uvm->counterKind == CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT) {
149 stat_info->addMemoryPageFault(AcceleratorStatInfoList::eMemoryPageFaultType::Gpu, nb_byte);
150 }
151 }
152 break;
153 }
154 case CUPTI_ACTIVITY_KIND_KERNEL:
155 case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: {
156 const char* kindString = (record->kind == CUPTI_ACTIVITY_KIND_KERNEL) ? "KERNEL" : "CONC KERNEL";
157 // NOTE: 'CUpti_ActivityKernel5' est disponible à partir de CUDA 11.0 mais obsolète à partir de CUDA 11.2
158 // à partir de Cuda 12 on pourra utiliser 'CUpti_ActivityKernel9'.
159 auto* kernel = reinterpret_cast<CUpti_ActivityKernel5*>(record);
160 if (do_print) {
161 ostr << kindString << " [ " << (kernel->start - startTimestamp) << " - " << (kernel->end - startTimestamp)
162 << " - " << (kernel->end - kernel->start) << " ]"
163 << " device=" << kernel->deviceId << " context=" << kernel->contextId
164 << " stream=" << kernel->streamId << " correlation=" << kernel->correlationId;
165 ostr << " grid=[" << kernel->gridX << "," << kernel->gridY << "," << kernel->gridZ << "]"
166 << " block=[" << kernel->blockX << "," << kernel->blockY << "," << kernel->blockZ << "]"
167 << " shared memory (static=" << kernel->staticSharedMemory << " dynamic=" << kernel->dynamicSharedMemory << ")"
168 << " registers=" << kernel->registersPerThread
169 << " name=" << '"' << kernel->name << '"'
170 << "\n";
171 }
172 break;
173 }
174 case CUPTI_ACTIVITY_KIND_SOURCE_LOCATOR: {
175 auto* source_locator = reinterpret_cast<CUpti_ActivitySourceLocator*>(record);
176 if (do_print) {
177 ostr << "Source Locator Id " << source_locator->id
178 << " File " << source_locator->fileName
179 << " Line " << source_locator->lineNumber
180 << "\n";
181 }
182 break;
183 }
184 case CUPTI_ACTIVITY_KIND_PC_SAMPLING: {
185 auto* ps_record = reinterpret_cast<CUpti_ActivityPCSampling3*>(record);
186
187 if (do_print) {
188 ostr << "source " << ps_record->sourceLocatorId << " functionId " << ps_record->functionId
189 << " pc " << ps_record->pcOffset << " correlation " << ps_record->correlationId
190 << " samples " << ps_record->samples
191 << " latency samples " << ps_record->latencySamples
192 << " stallreason " << getStallReasonString(ps_record->stallReason)
193 << "\n";
194 }
195 break;
196 }
197 case CUPTI_ACTIVITY_KIND_PC_SAMPLING_RECORD_INFO: {
198 auto* pcsri_result = reinterpret_cast<CUpti_ActivityPCSamplingRecordInfo*>(record);
199
200 if (do_print) {
201 ostr << "correlation " << pcsri_result->correlationId
202 << " totalSamples " << pcsri_result->totalSamples
203 << " droppedSamples " << pcsri_result->droppedSamples
204 << " samplingPeriodInCycles " << pcsri_result->samplingPeriodInCycles
205 << "\n";
206 }
207 break;
208 }
209 case CUPTI_ACTIVITY_KIND_FUNCTION: {
210 auto* func_result = reinterpret_cast<CUpti_ActivityFunction*>(record);
211
212 if (do_print) {
213 ostr << "id " << func_result->id << " ctx " << func_result->contextId
214 << " moduleId " << func_result->moduleId
215 << " functionIndex " << func_result->functionIndex
216 << " name " << func_result->name
217 << "\n";
218 }
219 break;
220 }
221
222 default:
223 if (do_print) {
224 ostr << " <unknown>\n";
225 }
226 break;
227 }
228}
229
230/*---------------------------------------------------------------------------*/
231/*---------------------------------------------------------------------------*/
232
233static void CUPTIAPI
234arcaneCuptiBufferRequested(uint8_t** buffer, size_t* size, size_t* maxNumRecords)
235{
236 const Int32 BUF_SIZE = 16 * 4096;
237
238 // TODO: utiliser un ou plusieurs buffers pré-alloués pour éviter les
239 // successions d'allocations/désallocations.
240 *size = BUF_SIZE;
241 *buffer = new (std::align_val_t{ 8 }) uint8_t[BUF_SIZE];
242 *maxNumRecords = 0;
243}
244
245/*---------------------------------------------------------------------------*/
246/*---------------------------------------------------------------------------*/
247
248static void CUPTIAPI
249arcaneCuptiBufferCompleted(CUcontext ctx, uint32_t stream_id, uint8_t* buffer,
250 [[maybe_unused]] size_t size, size_t validSize)
251{
252 // NOTE: il semble que cette méthode soit toujours appelée depuis
253 // un thread spécifique créé par le runtime CUDA.
254
255 CUptiResult status;
256 CUpti_Activity* record = nullptr;
257
258 AcceleratorStatInfoList* stat_info = ProfilingRegistry::_threadLocalAcceleratorInstance();
259 std::ostringstream ostr;
260 do {
261 status = cuptiActivityGetNextRecord(buffer, validSize, &record);
262 if (status == CUPTI_SUCCESS) {
263 printActivity(stat_info, record, global_do_print, ostr);
264 }
265 else if (status == CUPTI_ERROR_MAX_LIMIT_REACHED) {
266 break;
267 }
268 else {
269 ARCANE_CHECK_CUDA(status);
270 }
271 } while (1);
272 std::cout << ostr.str();
273 // report any records dropped from the queue
274 size_t nb_dropped = 0;
275 ARCANE_CHECK_CUDA(cuptiActivityGetNumDroppedRecords(ctx, stream_id, &nb_dropped));
276 if (nb_dropped != 0)
277 std::cout << "WARNING: Dropped " << nb_dropped << " activity records\n";
278
279 delete[] buffer;
280}
281
282/*---------------------------------------------------------------------------*/
283/*---------------------------------------------------------------------------*/
284
285void CuptiInfo::
286start()
287{
288 if (m_is_active)
289 return;
290
291 global_do_print = m_do_print;
292
293 int device_id = 0;
294 cudaGetDevice(&device_id);
295 int level = m_profiling_level;
296
297 ARCANE_CHECK_CUDA(cuptiActivityRegisterCallbacks(arcaneCuptiBufferRequested, arcaneCuptiBufferCompleted));
298
299 config[0].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
300 config[0].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_HTOD;
301 config[0].deviceId = device_id;
302 config[0].enable = 1;
303
304 config[1].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
305 config[1].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_BYTES_TRANSFER_DTOH;
306 config[1].deviceId = device_id;
307 config[1].enable = 1;
308
309 config[2].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
310 config[2].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_CPU_PAGE_FAULT_COUNT;
311 config[2].deviceId = device_id;
312 config[2].enable = 1;
313
314 config[3].scope = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_SCOPE_PROCESS_SINGLE_DEVICE;
315 config[3].kind = CUPTI_ACTIVITY_UNIFIED_MEMORY_COUNTER_KIND_GPU_PAGE_FAULT;
316 config[3].deviceId = device_id;
317 config[3].enable = 1;
318
319 ARCANE_CHECK_CUDA(cuptiActivityConfigureUnifiedMemoryCounter(config.data(), config.size()));
320
321 // NOTE: un seul processus peut utiliser le sampling. Si on utilise MPI avec plusieurs
322 // rangs il ne faut pas activer le sampling
323 if (level >= 3) {
324 configPC.size = sizeof(CUpti_ActivityPCSamplingConfig);
325 configPC.samplingPeriod = CUPTI_ACTIVITY_PC_SAMPLING_PERIOD_MIN;
326 configPC.samplingPeriod2 = 0;
327 CUcontext cuCtx;
328 cuCtxGetCurrent(&cuCtx);
329 ARCANE_CHECK_CUDA(cuptiActivityConfigurePCSampling(cuCtx, &configPC));
330 }
331
332 // Active les compteurs
333 // CONCURRENT_KERNEL et PC_SAMPLING ne sont pas compatibles
334 // Si on ajoute des compteurs ici il faut les désactiver dans stop()
335 if (level >= 1)
336 ARCANE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
337 if (level == 2)
338 ARCANE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
339 if (level >= 3)
340 ARCANE_CHECK_CUDA(cuptiActivityEnable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
341
342 ARCANE_CHECK_CUDA(cuptiGetTimestamp(&startTimestamp));
343
344 // Mettre à la fin pour qu'en cas d'exception on considère l'initialisation
345 // non effectuée.
346 m_is_active = true;
347}
348
349/*---------------------------------------------------------------------------*/
350/*---------------------------------------------------------------------------*/
351
352void CuptiInfo::
353stop()
354{
355 if (!m_is_active)
356 return;
357 int level = m_profiling_level;
358
359 if (level >= 1)
360 ARCANE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_UNIFIED_MEMORY_COUNTER));
361 if (level == 2)
362 ARCANE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL));
363 if (level >= 3)
364 ARCANE_CHECK_CUDA(cuptiActivityDisable(CUPTI_ACTIVITY_KIND_PC_SAMPLING));
365
366 ARCANE_CHECK_CUDA(cuptiActivityFlushAll(0));
367 ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
368
369 m_is_active = false;
370}
371
372/*---------------------------------------------------------------------------*/
373/*---------------------------------------------------------------------------*/
374
375void CuptiInfo::
376flush()
377{
378 // Il ne faut pas faire de flush si CUPTI n'a pas démarré car cela provoque
379 // une erreur.
380 if (!m_is_active)
381 return;
382 ARCANE_CHECK_CUDA(cuptiActivityFlushAll(0));
383}
384
385/*---------------------------------------------------------------------------*/
386/*---------------------------------------------------------------------------*/
387
388extern "C++" void
389initCupti(bool do_print)
390{
391 global_do_print = do_print;
392}
393
394/*---------------------------------------------------------------------------*/
395/*---------------------------------------------------------------------------*/
396
397} // namespace Arcane::Accelerator::Cuda
398
399/*---------------------------------------------------------------------------*/
400/*---------------------------------------------------------------------------*/
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Lecteur des fichiers de maillage via la bibliothèque LIMA.
Definition Lima.cc:149
static impl::AcceleratorStatInfoList * _threadLocalAcceleratorInstance()
Definition Profiling.cc:203
Statistiques pour les accélérateurs.