Arcane v4.1.0.0
Developer documentation
CudaAcceleratorRuntime.cc
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CudaAcceleratorRuntime.cc                                   (C) 2000-2025 */
/*                                                                           */
/* Runtime for 'Cuda'.                                                       */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arccore/base/MemoryView.h"
#include "arccore/base/PlatformUtils.h"
#include "arccore/base/TraceInfo.h"
#include "arccore/base/NotSupportedException.h"
#include "arccore/base/FatalErrorException.h"
#include "arccore/base/NotImplementedException.h"

#include "arccore/common/IMemoryResourceMng.h"
#include "arccore/common/internal/IMemoryResourceMngInternal.h"

#include "arcane/utils/Array.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/internal/MemoryUtilsInternal.h"

#include "arcane/accelerator/core/RunQueueBuildInfo.h"
#include "arcane/accelerator/core/Memory.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
#include "arcane/accelerator/core/internal/RunCommandImpl.h"
#include "arcane/accelerator/core/internal/IRunQueueStream.h"
#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/accelerator/core/DeviceMemoryInfo.h"
#include "arcane/accelerator/core/NativeStream.h"

#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"

#include <iostream>
#include <unordered_map>
#include <mutex>
#include <cstring> // std::memset (used in pushProfilerRange())

#include <cuda.h>

#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif

using namespace Arccore;

namespace Arcane::Accelerator::Cuda
{
using impl::KernelLaunchArgs;

namespace
{
  Int32 global_cupti_flush = 0;
  CuptiInfo global_cupti_info;
} // namespace
70
71/*---------------------------------------------------------------------------*/
72/*---------------------------------------------------------------------------*/
73
74void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
75{
76 if (e == CUDA_SUCCESS)
77 return;
78 const char* error_name = nullptr;
79 CUresult e2 = cuGetErrorName(e, &error_name);
80 if (e2 != CUDA_SUCCESS)
81 error_name = "Unknown";
82
83 const char* error_message = nullptr;
84 CUresult e3 = cuGetErrorString(e, &error_message);
85 if (e3 != CUDA_SUCCESS)
86 error_message = "Unknown";
87
88 ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
89 ti, e, error_name, error_message);
90}
91
92/*---------------------------------------------------------------------------*/
93/*---------------------------------------------------------------------------*/

/*!
 * \brief Map containing the ideal occupancy for a given kernel.
 */
class OccupancyMap
{
 public:

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;
    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);

    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
              << " name=" << func_name << "\n";
    return computed_block_size;
  }

 private:

  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
  std::mutex m_mutex;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

 public:

  void notifyBeginLaunchKernel([[maybe_unused]] impl::RunCommandImpl& c) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.empty())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel([[maybe_unused]] impl::RunCommandImpl&) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    DeviceId d = args.deviceId();
    int device = cudaCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
    //          << " size=" << args.source().size() << " data=" << src.data() << "\n";
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  impl::NativeStream nativeStream() override
  {
    return impl::NativeStream(&m_cuda_stream);
  }

 public:

  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:

  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

 public:

  // Record the event within a RunQueue
  void recordQueue(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  }

  void wait() final
  {
    ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  }

  void waitForEvent(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  }

  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    // NOTE: both events must have been created with timing enabled
    ARCANE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;

    // TODO: check whether this is necessary
    // ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));

    ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }

  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCANE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  ~CudaRunnerRuntime() override = default;

 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCANE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  eExecutionPolicy executionPolicy() const override
  {
    return eExecutionPolicy::CUDA;
  }
  impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
  {
    return new CudaRunQueueStream(this, bi);
  }
  impl::IRunQueueEventImpl* createEventImpl() override
  {
    return new CudaRunQueueEvent(false);
  }
  impl::IRunQueueEventImpl* createEventImplWithTimer() override
  {
    return new CudaRunQueueEvent(true);
  }
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCANE_FATAL("Device {0} is not an accelerator device", id);
    ARCANE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }

  void stopProfiling() override
  {
    global_cupti_info.stop();
  }

  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }

  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
    // as the corresponding CUDA type, so a plain cast is enough.
    auto mem_type = static_cast<ePointerMemoryType>(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCANE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }

  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // NOTE: 'nvtxEventAttributes_t eventAttrib = { 0 };' would be enough,
      // but it triggers lots of 'missing initializer for member' warnings.
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }

  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
                                           const void* kernel_ptr,
                                           Int64 total_loop_size,
                                           Int32 wanted_shared_memory) override
  {
    if (!m_use_computed_occupancy)
      return orig_args;
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // For now, no computation is done when shared memory is requested.
    if (wanted_shared_memory != 0)
      return orig_args;
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return orig_args;
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size };
  }
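
  // Illustrative example (not in the original source): with total_loop_size =
  // 1'000'000 and a computed block size of 256 threads,
  // big_b = (1'000'000 + 255) / 256 = 3907, so the kernel is launched with
  // { 3907, 256 } instead of the caller-provided arguments.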

 public:

  void fillDevices(bool is_verbose);
  void build()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
      m_use_computed_occupancy = v.value();
  }

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  impl::DeviceInfoList m_device_info_list;
  OccupancyMap m_occupancy_map;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    OStringStream ostr;
    std::ostream& o = ostr.stream();
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif

    {
      int least_val = 0;
      int greatest_val = 0;
      ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
      o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
    }
    {
      CUdevice device;
      ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
      CUuuid device_uuid;
      ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
      o << " deviceUuid=";
      impl::printUUID(o, device_uuid.bytes);
      o << "\n";
    }
    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    m_device_info_list.addDevice(device_info);
  }

  Int32 global_cupti_level = 0;

  // Check whether CUPTI should be enabled
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
    global_cupti_level = v.value();
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCANE_HAS_CUDA_CUPTI
    ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}
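
// Usage note (illustrative, not in the original source): CUPTI activity
// tracing can be enabled at run time through the environment variables read
// above, assuming Arcane was built with CUPTI support, e.g.:
//   ARCANE_CUPTI_LEVEL=1   enable CUPTI tracing (any level > 0)
//   ARCANE_CUPTI_FLUSH=1   flush CUPTI buffers at each stream barrier
//   ARCANE_CUPTI_PRINT=0   disable printing of the collected records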

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaMemoryCopier
: public IMemoryCopier
{
  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
            MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem,
            const RunQueue* queue) override
  {
    if (queue) {
      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
      return;
    }
    // 'cudaMemcpyDefault' automatically figures out what to do based solely on
    // the pointer values. It would be worth checking whether using \a from_mem
    // and \a to_mem could improve performance.
    ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::Cuda

namespace
{
Arcane::Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
Arcane::Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This function is the entry point used when this library is loaded
// dynamically.
extern "C" ARCANE_EXPORT void
arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane;
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  MemoryUtils::setDefaultDataMemoryResource(eMemoryRessource::UnifiedMemory);
  MemoryUtils::setAcceleratorHostMemoryAllocator(getCudaUnifiedMemoryAllocator());
  IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
  mrm->setIsAccelerator(true);
  mrm->setAllocator(eMemoryRessource::UnifiedMemory, getCudaUnifiedMemoryAllocator());
  mrm->setAllocator(eMemoryRessource::HostPinned, getCudaHostPinnedMemoryAllocator());
  mrm->setAllocator(eMemoryRessource::Device, getCudaDeviceMemoryAllocator());
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
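
Illustrative usage sketch (not part of CudaAcceleratorRuntime.cc): once this
runtime is registered, application code reaches it through Arcane's public
accelerator API. The headers and calls below (Runner, makeQueue, RunQueue) are
assumptions based on that API and may differ slightly between versions.

#include "arcane/accelerator/core/Runner.h"
#include "arcane/accelerator/core/RunQueue.h"

using namespace Arcane::Accelerator;

void sampleCudaRuntimeUse()
{
  // Selecting eExecutionPolicy::CUDA routes execution to CudaRunnerRuntime.
  Runner runner(eExecutionPolicy::CUDA);
  // makeQueue() ends up in CudaRunnerRuntime::createStream(), which wraps a
  // cudaStream_t in a CudaRunQueueStream.
  RunQueue queue = makeQueue(runner);
  // barrier() maps to cudaStreamSynchronize() on that stream.
  queue.barrier();
}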