Arcane v3.16.10.0
Developer documentation
CudaAcceleratorRuntime.cc
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CudaAcceleratorRuntime.cc                                   (C) 2000-2025 */
/*                                                                           */
/* Runtime for 'Cuda'.                                                       */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arcane/utils/PlatformUtils.h"
#include "arcane/utils/Array.h"
#include "arcane/utils/TraceInfo.h"
#include "arcane/utils/NotSupportedException.h"
#include "arcane/utils/FatalErrorException.h"
#include "arcane/utils/NotImplementedException.h"
#include "arcane/utils/IMemoryRessourceMng.h"
#include "arcane/utils/MemoryView.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/internal/MemoryUtilsInternal.h"
#include "arcane/utils/internal/IMemoryRessourceMngInternal.h"

#include "arcane/accelerator/core/RunQueueBuildInfo.h"
#include "arcane/accelerator/core/Memory.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
#include "arcane/accelerator/core/internal/RunCommandImpl.h"
#include "arcane/accelerator/core/internal/IRunQueueStream.h"
#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/accelerator/core/DeviceMemoryInfo.h"
#include "arcane/accelerator/core/NativeStream.h"

#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"

#include <iostream>
#include <unordered_map>
#include <mutex>
#include <cstring> // for std::memset() used on the NVTX event attributes below

#include <cuda.h>

#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif
using namespace Arccore;

namespace Arcane::Accelerator::Cuda
{
using impl::KernelLaunchArgs;

namespace
{
  Int32 global_cupti_flush = 0;
  CuptiInfo global_cupti_info;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;
  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";

  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";

  ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
               ti, e, error_name, error_message);
}
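// Added note: the ARCANE_CHECK_CUDA / ARCANE_CHECK_CUDA_NOTHROW macros used
// throughout this file (presumably defined in CudaAccelerator.h, included
// above) forward the returned error code to a checker such as the function
// above, e.g. ARCANE_CHECK_CUDA(cudaStreamCreate(&my_stream));. The checking
// path ends in ARCANE_FATAL on failure; the NOTHROW variant is used in
// destructors, where throwing would be unsafe.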

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
 * \brief Map containing the ideal occupancy for a given kernel.
 */
class OccupancyMap
{
 public:

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;
    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);

    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
              << " name=" << func_name << "\n";
    return computed_block_size;
  }

 private:

  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
  std::mutex m_mutex;
};
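// Added note: the occupancy query above runs only once per kernel. Results
// are cached in m_nb_thread_per_block_map, keyed by the kernel pointer and
// guarded by m_mutex, so concurrent callers share the map safely. The cached
// block size is consumed by CudaRunnerRuntime::computeKernalLaunchArgs()
// further below; a value of 0 means "keep the caller's original launch
// arguments".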

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

 public:

  void notifyBeginLaunchKernel([[maybe_unused]] impl::RunCommandImpl& c) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.empty())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    DeviceId d = args.deviceId();
    int device = cudaCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
    //          << " size=" << args.source().size() << " data=" << src.data() << "\n";
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  impl::NativeStream nativeStream() override
  {
    return impl::NativeStream(&m_cuda_stream);
  }

 public:

  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:

  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};
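// Added note: each CudaRunQueueStream owns one cudaStream_t. A default
// RunQueueBuildInfo maps to cudaStreamCreate(); otherwise the requested
// priority is forwarded to cudaStreamCreateWithPriority(). In CUDA, lower
// numerical values mean higher priority; the valid range for the current
// device is printed by fillDevices() below (leastPriority / greatestPriority).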

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

 public:

  // Record the event within a RunQueue
  void recordQueue(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  }

  void wait() final
  {
    ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  }

  void waitForEvent(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  }

  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    // NOTE: the events must have been created with the timer enabled
    ARCANE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;

    // TODO: check whether this is necessary
    // ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));

    ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }

  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCANE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};
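// Added note: elapsedTime() converts the float milliseconds returned by
// cudaEventElapsedTime() into an Int64 nanosecond count. This only works for
// events created through createEventImplWithTimer() (i.e. has_timer==true):
// cudaEventElapsedTime() fails on events created with cudaEventDisableTiming.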

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  ~CudaRunnerRuntime() override = default;

 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCANE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  eExecutionPolicy executionPolicy() const override
  {
    return eExecutionPolicy::CUDA;
  }
  impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
  {
    return new CudaRunQueueStream(this, bi);
  }
  impl::IRunQueueEventImpl* createEventImpl() override
  {
    return new CudaRunQueueEvent(false);
  }
  impl::IRunQueueEventImpl* createEventImplWithTimer() override
  {
    return new CudaRunQueueEvent(true);
  }
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCANE_FATAL("Device {0} is not an accelerator device", id);
    ARCANE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }

  void stopProfiling() override
  {
    global_cupti_info.stop();
  }

  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }

  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
    // as the corresponding CUDA type, so a simple cast is enough.
    auto mem_type = static_cast<ePointerMemoryType>(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCANE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }

  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // NOTE: Ideally we would write: nvtxEventAttributes_t eventAttrib = { 0 };
      // but that triggers many 'missing initializer for member' warnings
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }

  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
                                           const void* kernel_ptr,
                                           Int64 total_loop_size,
                                           Int32 wanted_shared_memory) override
  {
    if (!m_use_computed_occupancy)
      return orig_args;
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // For now, do not compute anything if the shared memory is non-zero.
    if (wanted_shared_memory != 0)
      return orig_args;
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return orig_args;
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size };
  }
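  // Added note: example of the grid sizing above, assuming the occupancy map
  // returned computed_block_size = 256 for this kernel:
  //   total_loop_size = 1'000'000
  //   blocks_per_grid = (1'000'000 + 256 - 1) / 256 = 3907
  // so the command is launched as { 3907, 256 } instead of orig_args.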

 public:

  void fillDevices(bool is_verbose);
  void build()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
      m_use_computed_occupancy = v.value();
  }
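  // Added note: occupancy-based launch sizing is opt-in. It is enabled by
  // setting the environment variable ARCANE_USE_COMPUTED_OCCUPANCY to a
  // non-zero value before the runtime is registered, for example:
  //   ARCANE_USE_COMPUTED_OCCUPANCY=1 ./my_arcane_app
  // where 'my_arcane_app' is a placeholder for the actual executable name.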

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  impl::DeviceInfoList m_device_info_list;
  OccupancyMap m_occupancy_map;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    OStringStream ostr;
    std::ostream& o = ostr.stream();
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif

    {
      int least_val = 0;
      int greatest_val = 0;
      ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
      o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
    }
    {
      CUdevice device;
      ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
      CUuuid device_uuid;
      ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
      o << " deviceUuid=";
      impl::printUUID(o, device_uuid.bytes);
      o << "\n";
    }
    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    device_info.setWarpSize(dp.warpSize);
    m_device_info_list.addDevice(device_info);
  }

  Int32 global_cupti_level = 0;

  // Check whether CUPTI should be enabled
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
    global_cupti_level = v.value();
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
    do_print_cupti = (v.value() != 0);

  if (global_cupti_level > 0) {
#ifndef ARCANE_HAS_CUDA_CUPTI
    ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}
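// Added note: summary of the environment variables read above:
//   ARCANE_CUPTI_LEVEL - a value greater than 0 enables CUPTI activity tracing
//                        (requires Arcane to be built with CUPTI support).
//   ARCANE_CUPTI_FLUSH - a value greater than 0 flushes CUPTI records at each
//                        stream barrier() (see CudaRunQueueStream::barrier()).
//   ARCANE_CUPTI_PRINT - 0 disables printing of the collected records
//                        (printing is enabled by default).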

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaMemoryCopier
: public IMemoryCopier
{
  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
            MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem,
            const RunQueue* queue) override
  {
    if (queue) {
      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
      return;
    }
    // 'cudaMemcpyDefault' automatically figures out what to do based only on
    // the pointer values. It would be worth checking whether using \a from_mem
    // and \a to_mem could improve performance.
    ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }
};
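// Added note: when no RunQueue is provided, the copy falls back to a
// synchronous cudaMemcpy() with cudaMemcpyDefault, which lets the CUDA runtime
// infer the transfer direction from the pointer values (cudaMemcpyDefault
// requires unified virtual addressing).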

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::Cuda

namespace
{
Arcane::Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
Arcane::Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This function is the entry point used when this library is loaded
// dynamically.
extern "C" ARCANE_EXPORT void
arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane;
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  MemoryUtils::setAcceleratorHostMemoryAllocator(getCudaUnifiedMemoryAllocator());
  IMemoryRessourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();
  mrm->setIsAccelerator(true);
  mrm->setAllocator(eMemoryRessource::UnifiedMemory, getCudaUnifiedMemoryAllocator());
  mrm->setAllocator(eMemoryRessource::HostPinned, getCudaHostPinnedMemoryAllocator());
  mrm->setAllocator(eMemoryRessource::Device, getCudaDeviceMemoryAllocator());
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/