Arcane v4.1.0.0
Developer documentation
CudaAcceleratorRuntime.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaAcceleratorRuntime.cc (C) 2000-2025 */
9/* */
10/* Runtime for 'Cuda'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arcane/accelerator/cuda/CudaAccelerator.h"
15
16#include "arccore/base/MemoryView.h"
17#include "arccore/base/PlatformUtils.h"
18#include "arccore/base/TraceInfo.h"
19#include "arccore/base/NotSupportedException.h"
20#include "arccore/base/FatalErrorException.h"
21#include "arccore/base/NotImplementedException.h"
22
23#include "arccore/common/IMemoryResourceMng.h"
24#include "arccore/common/internal/IMemoryResourceMngInternal.h"
25
26#include "arcane/utils/Array.h"
27#include "arcane/utils/OStringStream.h"
28#include "arcane/utils/ValueConvert.h"
29#include "arcane/utils/CheckedConvert.h"
30#include "arcane/utils/internal/MemoryUtilsInternal.h"
31
32#include "arcane/accelerator/core/RunQueueBuildInfo.h"
33#include "arcane/accelerator/core/Memory.h"
34#include "arcane/accelerator/core/DeviceInfoList.h"
35#include "arcane/accelerator/core/KernelLaunchArgs.h"
36
37#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
38#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
39#include "arcane/accelerator/core/internal/RunCommandImpl.h"
40#include "arcane/accelerator/core/internal/IRunQueueStream.h"
41#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
42#include "arcane/accelerator/core/PointerAttribute.h"
43#include "arcane/accelerator/core/RunQueue.h"
44#include "arcane/accelerator/core/DeviceMemoryInfo.h"
45#include "arcane/accelerator/core/NativeStream.h"
46
47#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"
48
49#include <iostream>
50#include <unordered_map>
51#include <mutex>
52
53#include <cuda.h>
54
55#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
56#include <nvtx3/nvToolsExt.h>
57#endif
58
59using namespace Arccore;
60
61namespace Arcane::Accelerator::Cuda
62{
63using Impl::KernelLaunchArgs;
64
65namespace
66{
67 Int32 global_cupti_flush = 0;
68 CuptiInfo global_cupti_info;
69} // namespace
70
71/*---------------------------------------------------------------------------*/
72/*---------------------------------------------------------------------------*/
73
74void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
75{
76 if (e == CUDA_SUCCESS)
77 return;
78 const char* error_name = nullptr;
79 CUresult e2 = cuGetErrorName(e, &error_name);
80 if (e2 != CUDA_SUCCESS)
81 error_name = "Unknown";
82
83 const char* error_message = nullptr;
84 CUresult e3 = cuGetErrorString(e, &error_message);
85 if (e3 != CUDA_SUCCESS)
86 error_message = "Unknown";
87
88 ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
89 ti, e, error_name, error_message);
90}
91
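The check above covers the CUDA driver API (CUresult). The ARCANE_CHECK_CUDA macro used throughout this file is defined in CudaAccelerator.h and is not reproduced here; as a minimal sketch only, a standalone runtime-API (cudaError_t) analogue of the same pattern (hypothetical helper name, not part of Arcane) could be:

// Illustration only: runtime-API analogue of the driver-API check above.
// 'checkCudaRuntimeCall' is a hypothetical name, not an Arcane symbol.
#include <cuda_runtime.h>
#include <cstdio>
#include <cstdlib>

inline void checkCudaRuntimeCall(cudaError_t e, const char* where)
{
  if (e == cudaSuccess)
    return;
  // cudaGetErrorName()/cudaGetErrorString() are the runtime-API counterparts
  // of cuGetErrorName()/cuGetErrorString() used in arcaneCheckCudaErrors().
  std::fprintf(stderr, "CUDA error at %s: name=%s message=%s\n",
               where, cudaGetErrorName(e), cudaGetErrorString(e));
  std::abort();
}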
92/*---------------------------------------------------------------------------*/
93/*---------------------------------------------------------------------------*/
102class OccupancyMap
103{
104 public:
105
106 Int32 getNbThreadPerBlock(const void* kernel_ptr)
107 {
108 std::scoped_lock lock(m_mutex);
109 auto x = m_nb_thread_per_block_map.find(kernel_ptr);
110 if (x != m_nb_thread_per_block_map.end())
111 return x->second;
112 int min_grid_size = 0;
113 int computed_block_size = 0;
114 int wanted_shared_memory = 0;
115 cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
116 if (r != cudaSuccess)
117 computed_block_size = 0;
118 int num_block_0 = 0;
119 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
120 int num_block_1 = 0;
121 cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);
122
123 cudaFuncAttributes func_attr;
124 cudaFuncGetAttributes(&func_attr, kernel_ptr);
125 const char* func_name = nullptr;
126 cudaFuncGetName(&func_name, kernel_ptr);
127 m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
128 std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
129 << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
130 << " name=" << func_name << "\n";
131 return computed_block_size;
132 }
133
134 private:
135
136 std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
137 std::mutex m_mutex;
138};
139
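As a minimal standalone sketch of the occupancy query this class caches (the kernel and the loop size are illustrative, not taken from Arcane), the same CUDA calls can be exercised as follows; the grid-size arithmetic mirrors computeKernalLaunchArgs() later in this file:

#include <cuda_runtime.h>
#include <cstdio>

__global__ void dummy_kernel(int* data, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    data[i] = i;
}

int main()
{
  int min_grid_size = 0;
  int block_size = 0;
  // 0 bytes of dynamic shared memory, no upper limit on the block size.
  cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, dummy_kernel, 0, 0);
  const long long total_loop_size = 1000000;
  // Same rounding as computeKernalLaunchArgs(): one thread per loop iteration.
  const int blocks_per_grid = static_cast<int>((total_loop_size + block_size - 1) / block_size);
  std::printf("block_size=%d blocks_per_grid=%d min_grid_size=%d\n",
              block_size, blocks_per_grid, min_grid_size);
  return 0;
}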
140/*---------------------------------------------------------------------------*/
141/*---------------------------------------------------------------------------*/
142
143class CudaRunQueueStream
144: public impl::IRunQueueStream
145{
146 public:
147
148 CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
149 : m_runtime(runtime)
150 {
151 if (bi.isDefault())
152 ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
153 else {
154 int priority = bi.priority();
155 ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
156 }
157 }
158 ~CudaRunQueueStream() override
159 {
160 ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
161 }
162
163 public:
164
165 void notifyBeginLaunchKernel([[maybe_unused]] impl::RunCommandImpl& c) override
166 {
167#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
168 auto kname = c.kernelName();
169 if (kname.empty())
170 nvtxRangePush(c.traceInfo().name());
171 else
172 nvtxRangePush(kname.localstr());
173#endif
174 return m_runtime->notifyBeginLaunchKernel();
175 }
176 void notifyEndLaunchKernel(impl::RunCommandImpl&) override
177 {
178#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
179 nvtxRangePop();
180#endif
181 return m_runtime->notifyEndLaunchKernel();
182 }
183 void barrier() override
184 {
185 ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
186 if (global_cupti_flush > 0)
187 global_cupti_info.flush();
188 }
189 bool _barrierNoException() override
190 {
191 return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
192 }
193 void copyMemory(const MemoryCopyArgs& args) override
194 {
195 auto source_bytes = args.source().bytes();
196 auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
197 source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
198 ARCANE_CHECK_CUDA(r);
199 if (!args.isAsync())
200 barrier();
201 }
202 void prefetchMemory(const MemoryPrefetchArgs& args) override
203 {
204 auto src = args.source().bytes();
205 if (src.size() == 0)
206 return;
207 DeviceId d = args.deviceId();
208 int device = cudaCpuDeviceId;
209 if (!d.isHost())
210 device = d.asInt32();
211 //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
212 // << " size=" << args.source().size() << " data=" << src.data() << "\n";
213 auto mem_location = _getMemoryLocation(device);
214#if defined(ARCANE_USING_CUDA13_OR_GREATER)
215 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
216#else
217 auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
218#endif
219 ARCANE_CHECK_CUDA(r);
220 if (!args.isAsync())
221 barrier();
222 }
223 Impl::NativeStream nativeStream() override
224 {
225 return Impl::NativeStream(&m_cuda_stream);
226 }
227
228 public:
229
230 cudaStream_t trueStream() const
231 {
232 return m_cuda_stream;
233 }
234
235 private:
236
237 impl::IRunnerRuntime* m_runtime = nullptr;
238 cudaStream_t m_cuda_stream = nullptr;
239};
240
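Stripped of the Arcane wrappers, the stream handling above reduces to plain CUDA calls; a minimal sketch (illustrative helper, not part of this file) combining a prioritized stream with the synchronous-copy pattern of copyMemory():

#include <cuda_runtime.h>

// Create a prioritized stream, copy asynchronously, then synchronize,
// as copyMemory() does when MemoryCopyArgs::isAsync() is false.
void copy_on_priority_stream(void* dst, const void* src, size_t nb_byte)
{
  int least_priority = 0;
  int greatest_priority = 0;
  cudaDeviceGetStreamPriorityRange(&least_priority, &greatest_priority);

  cudaStream_t stream = nullptr;
  cudaStreamCreateWithPriority(&stream, cudaStreamDefault, greatest_priority);

  cudaMemcpyAsync(dst, src, nb_byte, cudaMemcpyDefault, stream);
  cudaStreamSynchronize(stream);

  cudaStreamDestroy(stream);
}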
241/*---------------------------------------------------------------------------*/
242/*---------------------------------------------------------------------------*/
243
244class CudaRunQueueEvent
245: public impl::IRunQueueEventImpl
246{
247 public:
248
249 explicit CudaRunQueueEvent(bool has_timer)
250 {
251 if (has_timer)
252 ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
253 else
254 ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
255 }
256 ~CudaRunQueueEvent() override
257 {
258 ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
259 }
260
261 public:
262
263 // Records the event on a RunQueue
264 void recordQueue(impl::IRunQueueStream* stream) final
265 {
266 auto* rq = static_cast<CudaRunQueueStream*>(stream);
267 ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
268 }
269
270 void wait() final
271 {
272 ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
273 }
274
275 void waitForEvent(impl::IRunQueueStream* stream) final
276 {
277 auto* rq = static_cast<CudaRunQueueStream*>(stream);
278 ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
279 }
280
281 Int64 elapsedTime(IRunQueueEventImpl* start_event) final
282 {
283 // NOTE: the events must have been created with the timer enabled
284 ARCANE_CHECK_POINTER(start_event);
285 auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
286 float time_in_ms = 0.0;
287
288 // TODO: check whether this is really necessary
289 // ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
290
291 ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
292 double x = time_in_ms * 1.0e6;
293 Int64 nano_time = static_cast<Int64>(x);
294 return nano_time;
295 }
296
297 bool hasPendingWork() final
298 {
299 cudaError_t v = cudaEventQuery(m_cuda_event);
300 if (v == cudaErrorNotReady)
301 return true;
302 ARCANE_CHECK_CUDA(v);
303 return false;
304 }
305
306 private:
307
308 cudaEvent_t m_cuda_event;
309};
310
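The timing path of this class is the classic CUDA event pattern; a self-contained sketch (hypothetical helper, error checks omitted) showing the same milliseconds-to-nanoseconds conversion as elapsedTime():

#include <cuda_runtime.h>

// Returns the elapsed time between two recorded events, converted to
// nanoseconds exactly as elapsedTime() does above.
long long elapsed_ns_between_events(cudaStream_t stream)
{
  cudaEvent_t start_event = nullptr;
  cudaEvent_t stop_event = nullptr;
  // Timing requires events created without the cudaEventDisableTiming flag.
  cudaEventCreate(&start_event);
  cudaEventCreate(&stop_event);

  cudaEventRecord(start_event, stream);
  // ... enqueue kernels or copies on 'stream' here ...
  cudaEventRecord(stop_event, stream);
  cudaEventSynchronize(stop_event);

  float time_in_ms = 0.0f;
  cudaEventElapsedTime(&time_in_ms, start_event, stop_event);

  cudaEventDestroy(start_event);
  cudaEventDestroy(stop_event);
  return static_cast<long long>(time_in_ms * 1.0e6); // ms -> ns
}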
311/*---------------------------------------------------------------------------*/
312/*---------------------------------------------------------------------------*/
313
314class CudaRunnerRuntime
315: public impl::IRunnerRuntime
316{
317 public:
318
319 ~CudaRunnerRuntime() override = default;
320
321 public:
322
323 void notifyBeginLaunchKernel() override
324 {
325 ++m_nb_kernel_launched;
326 if (m_is_verbose)
327 std::cout << "BEGIN CUDA KERNEL!\n";
328 }
329 void notifyEndLaunchKernel() override
330 {
331 ARCANE_CHECK_CUDA(cudaGetLastError());
332 if (m_is_verbose)
333 std::cout << "END CUDA KERNEL!\n";
334 }
335 void barrier() override
336 {
337 ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
338 }
339 eExecutionPolicy executionPolicy() const override
340 {
341 return eExecutionPolicy::CUDA;
342 }
343 impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
344 {
345 return new CudaRunQueueStream(this, bi);
346 }
347 impl::IRunQueueEventImpl* createEventImpl() override
348 {
349 return new CudaRunQueueEvent(false);
350 }
351 impl::IRunQueueEventImpl* createEventImplWithTimer() override
352 {
353 return new CudaRunQueueEvent(true);
354 }
355 void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
356 {
357 auto v = buffer.bytes();
358 const void* ptr = v.data();
359 size_t count = v.size();
360 int device = device_id.asInt32();
361 cudaMemoryAdvise cuda_advise;
362
363 if (advice == eMemoryAdvice::MostlyRead)
364 cuda_advise = cudaMemAdviseSetReadMostly;
365 else if (advice == eMemoryAdvice::PreferredLocationDevice)
366 cuda_advise = cudaMemAdviseSetPreferredLocation;
367 else if (advice == eMemoryAdvice::AccessedByDevice)
368 cuda_advise = cudaMemAdviseSetAccessedBy;
369 else if (advice == eMemoryAdvice::PreferredLocationHost) {
370 cuda_advise = cudaMemAdviseSetPreferredLocation;
371 device = cudaCpuDeviceId;
372 }
373 else if (advice == eMemoryAdvice::AccessedByHost) {
374 cuda_advise = cudaMemAdviseSetAccessedBy;
375 device = cudaCpuDeviceId;
376 }
377 else
378 return;
379 //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
380 ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
381 }
382 void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
383 {
384 auto v = buffer.bytes();
385 const void* ptr = v.data();
386 size_t count = v.size();
387 int device = device_id.asInt32();
388 cudaMemoryAdvise cuda_advise;
389
390 if (advice == eMemoryAdvice::MostlyRead)
391 cuda_advise = cudaMemAdviseUnsetReadMostly;
392 else if (advice == eMemoryAdvice::PreferredLocationDevice)
393 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
394 else if (advice == eMemoryAdvice::AccessedByDevice)
395 cuda_advise = cudaMemAdviseUnsetAccessedBy;
396 else if (advice == eMemoryAdvice::PreferredLocationHost) {
397 cuda_advise = cudaMemAdviseUnsetPreferredLocation;
398 device = cudaCpuDeviceId;
399 }
400 else if (advice == eMemoryAdvice::AccessedByHost) {
401 cuda_advise = cudaMemAdviseUnsetAccessedBy;
402 device = cudaCpuDeviceId;
403 }
404 else
405 return;
406 ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
407 }
408
409 void setCurrentDevice(DeviceId device_id) final
410 {
411 Int32 id = device_id.asInt32();
412 if (!device_id.isAccelerator())
413 ARCANE_FATAL("Device {0} is not an accelerator device", id);
414 ARCANE_CHECK_CUDA(cudaSetDevice(id));
415 }
416
417 const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }
418
419 void startProfiling() override
420 {
421 global_cupti_info.start();
422 }
423
424 void stopProfiling() override
425 {
426 global_cupti_info.stop();
427 }
428
429 bool isProfilingActive() override
430 {
431 return global_cupti_info.isActive();
432 }
433
434 void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
435 {
436 cudaPointerAttributes ca;
437 ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
438 // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
439 // as the corresponding CUDA type, so a plain cast is sufficient.
440 auto mem_type = static_cast<ePointerMemoryType>(ca.type);
441 _fillPointerAttribute(attribute, mem_type, ca.device,
442 ptr, ca.devicePointer, ca.hostPointer);
443 }
444
445 DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
446 {
447 int d = 0;
448 int wanted_d = device_id.asInt32();
449 ARCANE_CHECK_CUDA(cudaGetDevice(&d));
450 if (d != wanted_d)
451 ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
452 size_t free_mem = 0;
453 size_t total_mem = 0;
454 ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
455 if (d != wanted_d)
456 ARCANE_CHECK_CUDA(cudaSetDevice(d));
457 DeviceMemoryInfo dmi;
458 dmi.setFreeMemory(free_mem);
459 dmi.setTotalMemory(total_mem);
460 return dmi;
461 }
462
463 void pushProfilerRange(const String& name, Int32 color_rgb) override
464 {
465#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
466 if (color_rgb >= 0) {
467 // NOTE: ideally this would be: nvtxEventAttributes_t eventAttrib = { 0 };
468 // but that triggers many 'missing initializer for member' warnings
469 nvtxEventAttributes_t eventAttrib;
470 std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
471 eventAttrib.version = NVTX_VERSION;
472 eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
473 eventAttrib.colorType = NVTX_COLOR_ARGB;
474 eventAttrib.color = color_rgb;
475 eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
476 eventAttrib.message.ascii = name.localstr();
477 nvtxRangePushEx(&eventAttrib);
478 }
479 else
480 nvtxRangePush(name.localstr());
481#endif
482 }
483 void popProfilerRange() override
484 {
485#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
486 nvtxRangePop();
487#endif
488 }
489
490 void finalize(ITraceMng* tm) override
491 {
492 finalizeCudaMemoryAllocators(tm);
493 }
494
495 KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
496 const void* kernel_ptr,
497 Int64 total_loop_size,
498 Int32 wanted_shared_memory) override
499 {
500 if (!m_use_computed_occupancy)
501 return orig_args;
502 if (wanted_shared_memory < 0)
503 wanted_shared_memory = 0;
504 // For now, no computation is done when the requested shared memory is non-zero.
505 if (wanted_shared_memory != 0)
506 return orig_args;
507 Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
508 if (computed_block_size == 0)
509 return orig_args;
510 Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
511 int blocks_per_grid = CheckedConvert::toInt32(big_b);
512 return { blocks_per_grid, computed_block_size };
513 }
514
515 public:
516
517 void fillDevices(bool is_verbose);
518 void build()
519 {
520 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
521 m_use_computed_occupancy = v.value();
522 }
523
524 private:
525
526 Int64 m_nb_kernel_launched = 0;
527 bool m_is_verbose = false;
528 bool m_use_computed_occupancy = false;
529 impl::DeviceInfoList m_device_info_list;
530 OccupancyMap m_occupancy_map;
531};
532
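The advice and prefetch members above ultimately call cudaMemAdvise() and cudaMemPrefetchAsync(); a minimal sketch using the pre-CUDA 13 signatures (a plain device ordinal rather than the cudaMemLocation handled by _getMemoryLocation()), with an illustrative helper name:

#include <cuda_runtime.h>

// Managed-memory advice plus prefetch towards a device (use cudaCpuDeviceId
// for the host), mirroring setMemoryAdvice()/prefetchMemory().
void advise_and_prefetch(void* ptr, size_t nb_byte, int device, cudaStream_t stream)
{
  // Equivalent of eMemoryAdvice::MostlyRead in setMemoryAdvice() above.
  cudaMemAdvise(ptr, nb_byte, cudaMemAdviseSetReadMostly, device);
  cudaMemPrefetchAsync(ptr, nb_byte, device, stream);
  cudaStreamSynchronize(stream);
}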
533/*---------------------------------------------------------------------------*/
534/*---------------------------------------------------------------------------*/
535
536void CudaRunnerRuntime::
537fillDevices(bool is_verbose)
538{
539 int nb_device = 0;
540 ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
541 std::ostream& omain = std::cout;
542 if (is_verbose)
543 omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
544 for (int i = 0; i < nb_device; ++i) {
545 cudaDeviceProp dp;
546 cudaGetDeviceProperties(&dp, i);
547 int runtime_version = 0;
548 cudaRuntimeGetVersion(&runtime_version);
549 int driver_version = 0;
550 cudaDriverGetVersion(&driver_version);
551 OStringStream ostr;
552 std::ostream& o = ostr.stream();
553 o << "Device " << i << " name=" << dp.name << "\n";
554 o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
555 o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
556 o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
557 o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
558 o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
559 o << " regsPerBlock = " << dp.regsPerBlock << "\n";
560 o << " warpSize = " << dp.warpSize << "\n";
561 o << " memPitch = " << dp.memPitch << "\n";
562 o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
563 o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
564 o << " totalConstMem = " << dp.totalConstMem << "\n";
565 o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
566 o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
567 o << " integrated = " << dp.integrated << "\n";
568 o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
569 o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
570 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
571 o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
572 o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
573 o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
574 o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
575 o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
576 << " " << dp.maxThreadsDim[2] << "\n";
577 o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
578 << " " << dp.maxGridSize[2] << "\n";
579#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
580 o << " clockRate = " << dp.clockRate << "\n";
581 o << " deviceOverlap = " << dp.deviceOverlap << "\n";
582 o << " computeMode = " << dp.computeMode << "\n";
583 o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
584#endif
585
586 {
587 int least_val = 0;
588 int greatest_val = 0;
589 ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
590 o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
591 }
592 {
593 CUdevice device;
594 ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
595 CUuuid device_uuid;
596 ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
597 o << " deviceUuid=";
598 impl::printUUID(o, device_uuid.bytes);
599 o << "\n";
600 }
601 String description(ostr.str());
602 if (is_verbose)
603 omain << description;
604
605 DeviceInfo device_info;
606 device_info.setDescription(description);
607 device_info.setDeviceId(DeviceId(i));
608 device_info.setName(dp.name);
609 device_info.setWarpSize(dp.warpSize);
610 m_device_info_list.addDevice(device_info);
611 }
612
613 Int32 global_cupti_level = 0;
614
615 // Check whether CUPTI should be enabled
616 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
617 global_cupti_level = v.value();
618 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
619 global_cupti_flush = v.value();
620 bool do_print_cupti = true;
621 if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
622 do_print_cupti = (v.value() != 0);
623
624 if (global_cupti_level > 0) {
625#ifndef ARCANE_HAS_CUDA_CUPTI
626 ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
627#endif
628 global_cupti_info.init(global_cupti_level, do_print_cupti);
629 global_cupti_info.start();
630 }
631}
632
633/*---------------------------------------------------------------------------*/
634/*---------------------------------------------------------------------------*/
635
636class CudaMemoryCopier
637: public IMemoryCopier
638{
639 void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
640 MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem,
641 const RunQueue* queue) override
642 {
643 if (queue) {
644 queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
645 return;
646 }
647 // 'cudaMemcpyDefault' automatically figures out what to do based solely
648 // on the pointer values. It would be worth checking whether using
649 // \a from_mem and \a to_mem could improve performance.
650 ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
651 }
652};
653
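When no RunQueue is supplied, the copier relies on cudaMemcpyDefault, which deduces the copy direction from the pointer values alone; the equivalent standalone call is simply:

#include <cuda_runtime.h>

void copy_any_to_any(void* to, const void* from, size_t nb_byte)
{
  // Direction (host<->device, device<->device, ...) is deduced from the
  // virtual addresses; this requires unified virtual addressing.
  cudaMemcpy(to, from, nb_byte, cudaMemcpyDefault);
}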
654/*---------------------------------------------------------------------------*/
655/*---------------------------------------------------------------------------*/
656
657} // End namespace Arcane::Accelerator::Cuda
658
659namespace
660{
661Arcane::Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
662Arcane::Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
663} // namespace
664
665/*---------------------------------------------------------------------------*/
666/*---------------------------------------------------------------------------*/
667
668// This function is the entry point used when this library
669// is loaded dynamically
670extern "C" ARCANE_EXPORT void
671arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
672{
673 using namespace Arcane;
674 using namespace Arcane::Accelerator::Cuda;
675 global_cuda_runtime.build();
676 Arcane::Accelerator::impl::setUsingCUDARuntime(true);
677 Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
678 initializeCudaMemoryAllocators();
679 MemoryUtils::setDefaultDataMemoryResource(eMemoryRessource::UnifiedMemory);
680 MemoryUtils::setAcceleratorHostMemoryAllocator(getCudaUnifiedMemoryAllocator());
681 IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
682 mrm->setIsAccelerator(true);
683 mrm->setAllocator(eMemoryRessource::UnifiedMemory, getCudaUnifiedMemoryAllocator());
684 mrm->setAllocator(eMemoryRessource::HostPinned, getCudaHostPinnedMemoryAllocator());
685 mrm->setAllocator(eMemoryRessource::Device, getCudaDeviceMemoryAllocator());
686 mrm->setCopier(&global_cuda_memory_copier);
687 global_cuda_runtime.fillDevices(init_info.isVerbose());
688}
689
690/*---------------------------------------------------------------------------*/
691/*---------------------------------------------------------------------------*/
#define ARCANE_CHECK_POINTER(ptr)
Macro returning the pointer ptr if it is non-null, or throwing an exception if it is null.
#define ARCANE_FATAL(...)
Macro throwing a FatalErrorException.
void copy(ConstMemoryView from, eMemoryRessource from_mem, MutableMemoryView to, eMemoryRessource to_mem, const RunQueue *queue) override
Copies the data from 'from' to 'to' using the queue 'queue'.
void notifyBeginLaunchKernel(impl::RunCommandImpl &c) override
Notification before the command is launched.
void barrier() override
Blocks until all actions associated with this queue have completed.
bool _barrierNoException() override
Barrier without exceptions. Returns true on error.
Impl::NativeStream nativeStream() override
Pointer to the implementation-dependent internal structure.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Prefetches a memory area.
void notifyEndLaunchKernel(impl::RunCommandImpl &) override
Notification after the command has been launched.
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory areas.
Singleton class managing CUPTI.
Definition Cupti.h:38
Map containing the ideal occupancy for a given kernel.
Identifier of a system component.
Definition DeviceId.h:33
bool isHost() const
Indicates whether the instance is associated with the host.
Definition DeviceId.h:60
Int32 asInt32() const
Numeric value of the device.
Definition DeviceId.h:69
bool isAccelerator() const
Indicates whether the instance is associated with an accelerator.
Definition DeviceId.h:66
Information about a device.
Definition DeviceInfo.h:32
Memory information for an accelerator.
Interface of a list of devices.
Arguments for launching a kernel.
Opaque type encapsulating a native 'stream'.
Arguments for a memory copy.
Definition Memory.h:61
Arguments for memory prefetching.
Definition Memory.h:123
Information about a memory address.
Information for initializing the accelerator runtime.
Information for creating a RunQueue.
bool isDefault() const
Indicates whether the instance only holds the default values.
Execution queue for an accelerator.
bool isAsync() const
Indicates whether the execution queue is asynchronous.
Definition RunQueue.cc:320
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory areas.
Definition RunQueue.cc:237
Interface of a list of devices.
Interface of an event implementation.
Interface of an execution stream for a RunQueue.
Interface of the runtime associated with an accelerator.
Implementation of a command for an accelerator.
Constant view of a contiguous memory area containing fixed-size elements.
constexpr SpanType bytes() const
View as bytes.
constexpr const std::byte * data() const
Pointer to the memory area.
static ARCCORE_BASE_EXPORT std::optional< Int32 > tryParseFromEnvironment(StringView s, bool throw_if_invalid)
Template class for converting a type.
Interface for memory copies with accelerator support.
Arcane-internal part of 'IMemoryRessourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for the resource r.
virtual void setIsAccelerator(bool v)=0
Indicates whether an accelerator is available.
virtual void setCopier(IMemoryCopier *copier)=0
Sets the instance managing copies.
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.
Interface of the trace manager.
Mutable view of a contiguous memory area containing fixed-size elements.
constexpr std::byte * data() const
Pointer to the memory area.
constexpr SpanType bytes() const
View as bytes.
Output stream tied to a String.
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the beginning of the view.
Definition Span.h:516
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:304
Unicode character string.
const char * localstr() const
Returns the instance converted to UTF-8 encoding.
Definition String.cc:228
eMemoryAdvice
Advice for memory management.
Definition Memory.h:34
@ AccessedByHost
Indicates that the memory area is accessed by the host.
Definition Memory.h:46
@ PreferredLocationDevice
Prefer placing the memory on the accelerator.
Definition Memory.h:40
@ MostlyRead
Indicates that the memory area is mostly read-only.
Definition Memory.h:38
@ PreferredLocationHost
Prefer placing the memory on the host.
Definition Memory.h:42
@ AccessedByDevice
Indicates that the memory area is accessed by the accelerator.
Definition Memory.h:44
ePointerMemoryType
Memory type for a pointer.
eExecutionPolicy
Execution policy for a Runner.
@ CUDA
Execution policy using the CUDA environment.
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the allocator specific to accelerators.
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
std::int64_t Int64
Signed 64-bit integer type.
Arcane::eMemoryResource eMemoryRessource
Typedef for the historical Arcane spelling (with two 's').
@ HostPinned
Allocates on the host.
@ UnifiedMemory
Allocates using unified memory.
@ Device
Allocates on the device.
std::int32_t Int32
Signed 32-bit integer type.
Namespace of Arccore.