Arcane v3.16.8.0
Developer documentation
CudaAcceleratorRuntime.cc
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CudaAcceleratorRuntime.cc                                   (C) 2000-2025 */
/*                                                                           */
/* Runtime for 'Cuda'.                                                       */
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/accelerator/cuda/CudaAccelerator.h"

#include "arcane/utils/PlatformUtils.h"
#include "arcane/utils/Array.h"
#include "arcane/utils/TraceInfo.h"
#include "arcane/utils/NotSupportedException.h"
#include "arcane/utils/FatalErrorException.h"
#include "arcane/utils/NotImplementedException.h"
#include "arcane/utils/IMemoryRessourceMng.h"
#include "arcane/utils/MemoryView.h"
#include "arcane/utils/OStringStream.h"
#include "arcane/utils/ValueConvert.h"
#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/internal/IMemoryRessourceMngInternal.h"

#include "arcane/accelerator/core/RunQueueBuildInfo.h"
#include "arcane/accelerator/core/Memory.h"
#include "arcane/accelerator/core/DeviceInfoList.h"
#include "arcane/accelerator/core/KernelLaunchArgs.h"

#include "arcane/accelerator/core/internal/IRunnerRuntime.h"
#include "arcane/accelerator/core/internal/RegisterRuntimeInfo.h"
#include "arcane/accelerator/core/internal/RunCommandImpl.h"
#include "arcane/accelerator/core/internal/IRunQueueStream.h"
#include "arcane/accelerator/core/internal/IRunQueueEventImpl.h"
#include "arcane/accelerator/core/PointerAttribute.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/accelerator/core/DeviceMemoryInfo.h"
#include "arcane/accelerator/core/NativeStream.h"

#include "arcane/accelerator/cuda/runtime/internal/Cupti.h"

#include <iostream>
#include <unordered_map>
#include <mutex>

#include <cuda.h>

#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
#include <nvtx3/nvToolsExt.h>
#endif
using namespace Arccore;

namespace Arcane::Accelerator::Cuda
{
using impl::KernelLaunchArgs;

namespace
{
  Int32 global_cupti_flush = 0;
  CuptiInfo global_cupti_info;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)
{
  if (e == CUDA_SUCCESS)
    return;
  const char* error_name = nullptr;
  CUresult e2 = cuGetErrorName(e, &error_name);
  if (e2 != CUDA_SUCCESS)
    error_name = "Unknown";

  const char* error_message = nullptr;
  CUresult e3 = cuGetErrorString(e, &error_message);
  if (e3 != CUDA_SUCCESS)
    error_message = "Unknown";

  ARCANE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",
               ti, e, error_name, error_message);
}
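
// NOTE: this is the overload for driver-API (CUresult) return codes. The
// runtime-API (cudaError_t) results are checked through the ARCANE_CHECK_CUDA /
// ARCANE_CHECK_CUDA_NOTHROW macros used throughout this file, whose definitions
// are assumed to live in CudaAccelerator.h included above.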

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
 * \brief Map containing the ideal occupancy for a given kernel.
 *
 * Computed block sizes are cached per kernel pointer, protected by a mutex.
 */
class OccupancyMap
{
 public:

  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;
    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess)
      computed_block_size = 0;
    int num_block_0 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory);
    int num_block_1 = 0;
    cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory);

    cudaFuncAttributes func_attr;
    cudaFuncGetAttributes(&func_attr, kernel_ptr);
    const char* func_name = nullptr;
    cudaFuncGetName(&func_name, kernel_ptr);
    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs
              << " name=" << func_name << "\n";
    return computed_block_size;
  }

 private:

  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
  std::mutex m_mutex;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueStream
: public impl::IRunQueueStream
{
 public:

  CudaRunQueueStream(impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)
  : m_runtime(runtime)
  {
    if (bi.isDefault())
      ARCANE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));
    else {
      int priority = bi.priority();
      ARCANE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));
    }
  }
  ~CudaRunQueueStream() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));
  }

 public:

  void notifyBeginLaunchKernel([[maybe_unused]] impl::RunCommandImpl& c) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    auto kname = c.kernelName();
    if (kname.empty())
      nvtxRangePush(c.traceInfo().name());
    else
      nvtxRangePush(kname.localstr());
#endif
    return m_runtime->notifyBeginLaunchKernel();
  }
  void notifyEndLaunchKernel(impl::RunCommandImpl&) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
    return m_runtime->notifyEndLaunchKernel();
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));
    if (global_cupti_flush > 0)
      global_cupti_info.flush();
  }
  bool _barrierNoException() override
  {
    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);
  }
  void copyMemory(const MemoryCopyArgs& args) override
  {
    auto source_bytes = args.source().bytes();
    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),
                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  void prefetchMemory(const MemoryPrefetchArgs& args) override
  {
    auto src = args.source().bytes();
    if (src.size() == 0)
      return;
    DeviceId d = args.deviceId();
    int device = cudaCpuDeviceId;
    if (!d.isHost())
      device = d.asInt32();
    //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId
    //          << " size=" << args.source().size() << " data=" << src.data() << "\n";
    auto mem_location = _getMemoryLocation(device);
#if defined(ARCANE_USING_CUDA13_OR_GREATER)
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);
#else
    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);
#endif
    ARCANE_CHECK_CUDA(r);
    if (!args.isAsync())
      barrier();
  }
  impl::NativeStream nativeStream() override
  {
    return impl::NativeStream(&m_cuda_stream);
  }

 public:

  cudaStream_t trueStream() const
  {
    return m_cuda_stream;
  }

 private:

  impl::IRunnerRuntime* m_runtime = nullptr;
  cudaStream_t m_cuda_stream = nullptr;
};
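
// NOTE: for non-default queues the RunQueueBuildInfo priority is forwarded as-is
// to cudaStreamCreateWithPriority(). In CUDA, numerically lower values mean
// higher priority; the valid range for the current device is reported by
// cudaDeviceGetStreamPriorityRange() (printed per device by fillDevices() below).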

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunQueueEvent
: public impl::IRunQueueEventImpl
{
 public:

  explicit CudaRunQueueEvent(bool has_timer)
  {
    if (has_timer)
      ARCANE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));
    else
      ARCANE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));
  }
  ~CudaRunQueueEvent() override
  {
    ARCANE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));
  }

 public:

  // Records the event within a RunQueue
  void recordQueue(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));
  }

  void wait() final
  {
    ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));
  }

  void waitForEvent(impl::IRunQueueStream* stream) final
  {
    auto* rq = static_cast<CudaRunQueueStream*>(stream);
    ARCANE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));
  }

  Int64 elapsedTime(IRunQueueEventImpl* start_event) final
  {
    // NOTE: both events must have been created with the timer enabled
    ARCANE_CHECK_POINTER(start_event);
    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);
    float time_in_ms = 0.0;

    // TODO: check whether this is really necessary
    // ARCANE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));

    ARCANE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));
    double x = time_in_ms * 1.0e6;
    Int64 nano_time = static_cast<Int64>(x);
    return nano_time;
  }

  bool hasPendingWork() final
  {
    cudaError_t v = cudaEventQuery(m_cuda_event);
    if (v == cudaErrorNotReady)
      return true;
    ARCANE_CHECK_CUDA(v);
    return false;
  }

 private:

  cudaEvent_t m_cuda_event;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaRunnerRuntime
: public impl::IRunnerRuntime
{
 public:

  ~CudaRunnerRuntime() override = default;

 public:

  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }
  void notifyEndLaunchKernel() override
  {
    ARCANE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }
  void barrier() override
  {
    ARCANE_CHECK_CUDA(cudaDeviceSynchronize());
  }
  eExecutionPolicy executionPolicy() const override
  {
    return eExecutionPolicy::CUDA;
  }
  impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
  {
    return new CudaRunQueueStream(this, bi);
  }
  impl::IRunQueueEventImpl* createEventImpl() override
  {
    return new CudaRunQueueEvent(false);
  }
  impl::IRunQueueEventImpl* createEventImplWithTimer() override
  {
    return new CudaRunQueueEvent(true);
  }
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCANE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCANE_FATAL("Device {0} is not an accelerator device", id);
    ARCANE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }

  void stopProfiling() override
  {
    global_cupti_info.stop();
  }

  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }

  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCANE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
    // as the corresponding CUDA type, so a plain cast is sufficient.
    auto mem_type = static_cast<ePointerMemoryType>(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCANE_CHECK_CUDA(cudaGetDevice(&d));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(wanted_d));
    size_t free_mem = 0;
    size_t total_mem = 0;
    ARCANE_CHECK_CUDA(cudaMemGetInfo(&free_mem, &total_mem));
    if (d != wanted_d)
      ARCANE_CHECK_CUDA(cudaSetDevice(d));
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }

  void pushProfilerRange(const String& name, Int32 color_rgb) override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // NOTE: ideally we would write: nvtxEventAttributes_t eventAttrib = { 0 };
      // but that triggers many 'missing initializer for member' warnings
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }
  void popProfilerRange() override
  {
#ifdef ARCANE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }

  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
                                           const void* kernel_ptr,
                                           Int64 total_loop_size,
                                           Int32 wanted_shared_memory) override
  {
    if (!m_use_computed_occupancy)
      return orig_args;
    if (wanted_shared_memory < 0)
      wanted_shared_memory = 0;
    // For now, do not compute anything if the requested shared memory is non-zero.
    if (wanted_shared_memory != 0)
      return orig_args;
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return orig_args;
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    return { blocks_per_grid, computed_block_size };
  }
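
  // Worked example of the launch-configuration arithmetic above (values are
  // illustrative only): for total_loop_size = 1'000'000 and a computed block
  // size of 256 threads, blocks_per_grid = (1'000'000 + 255) / 256 = 3907,
  // i.e. the grid size is rounded up so that every loop index is covered.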

 public:

  void fillDevices(bool is_verbose);
  void build()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
      m_use_computed_occupancy = v.value();
  }

 private:

  Int64 m_nb_kernel_launched = 0;
  bool m_is_verbose = false;
  bool m_use_computed_occupancy = false;
  impl::DeviceInfoList m_device_info_list;
  OccupancyMap m_occupancy_map;
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

void CudaRunnerRuntime::
fillDevices(bool is_verbose)
{
  int nb_device = 0;
  ARCANE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));
  std::ostream& omain = std::cout;
  if (is_verbose)
    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";
  for (int i = 0; i < nb_device; ++i) {
    cudaDeviceProp dp;
    cudaGetDeviceProperties(&dp, i);
    int runtime_version = 0;
    cudaRuntimeGetVersion(&runtime_version);
    int driver_version = 0;
    cudaDriverGetVersion(&driver_version);
    OStringStream ostr;
    std::ostream& o = ostr.stream();
    o << "Device " << i << " name=" << dp.name << "\n";
    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";
    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";
    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";
    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";
    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";
    o << " regsPerBlock = " << dp.regsPerBlock << "\n";
    o << " warpSize = " << dp.warpSize << "\n";
    o << " memPitch = " << dp.memPitch << "\n";
    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";
    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";
    o << " totalConstMem = " << dp.totalConstMem << "\n";
    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";
    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";
    o << " integrated = " << dp.integrated << "\n";
    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";
    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";
    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";
    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";
    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";
    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]
      << " " << dp.maxThreadsDim[2] << "\n";
    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]
      << " " << dp.maxGridSize[2] << "\n";
#if !defined(ARCANE_USING_CUDA13_OR_GREATER)
    o << " clockRate = " << dp.clockRate << "\n";
    o << " deviceOverlap = " << dp.deviceOverlap << "\n";
    o << " computeMode = " << dp.computeMode << "\n";
    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";
#endif

    {
      int least_val = 0;
      int greatest_val = 0;
      ARCANE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));
      o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";
    }
    {
      CUdevice device;
      ARCANE_CHECK_CUDA(cuDeviceGet(&device, i));
      CUuuid device_uuid;
      ARCANE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));
      o << " deviceUuid=";
      impl::printUUID(o, device_uuid.bytes);
      o << "\n";
    }
    String description(ostr.str());
    if (is_verbose)
      omain << description;

    DeviceInfo device_info;
    device_info.setDescription(description);
    device_info.setDeviceId(DeviceId(i));
    device_info.setName(dp.name);
    m_device_info_list.addDevice(device_info);
  }

  Int32 global_cupti_level = 0;

  // Check whether CUPTI should be enabled
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))
    global_cupti_level = v.value();
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))
    global_cupti_flush = v.value();
  bool do_print_cupti = true;
  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))
    do_print_cupti = (v.value() != 0);

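  // NOTE: CUPTI profiling is therefore driven by environment variables:
  // ARCANE_CUPTI_LEVEL (>0 enables CUPTI activity collection at startup),
  // ARCANE_CUPTI_FLUSH (>0 flushes CUPTI records at each stream barrier, see
  // CudaRunQueueStream::barrier()) and ARCANE_CUPTI_PRINT (0 disables printing).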
  if (global_cupti_level > 0) {
#ifndef ARCANE_HAS_CUDA_CUPTI
    ARCANE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");
#endif
    global_cupti_info.init(global_cupti_level, do_print_cupti);
    global_cupti_info.start();
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

class CudaMemoryCopier
: public IMemoryCopier
{
  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryRessource from_mem,
            MutableMemoryView to, [[maybe_unused]] eMemoryRessource to_mem,
            const RunQueue* queue) override
  {
    if (queue) {
      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));
      return;
    }
    // 'cudaMemcpyDefault' automatically figures out what to do based solely on
    // the pointer values. It would be worth checking whether using \a from_mem
    // and \a to_mem could improve performance.
    ARCANE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::Cuda

namespace
{
Arcane::Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
Arcane::Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;
} // namespace

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This function is the entry point used when this library is loaded dynamically
extern "C" ARCANE_EXPORT void
arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)
{
  using namespace Arcane;
  using namespace Arcane::Accelerator::Cuda;
  global_cuda_runtime.build();
  Arcane::Accelerator::impl::setUsingCUDARuntime(true);
  Arcane::Accelerator::impl::setCUDARunQueueRuntime(&global_cuda_runtime);
  initializeCudaMemoryAllocators();
  IMemoryRessourceMngInternal* mrm = platform::getDataMemoryRessourceMng()->_internal();
  mrm->setIsAccelerator(true);
  mrm->setAllocator(eMemoryRessource::UnifiedMemory, getCudaUnifiedMemoryAllocator());
  mrm->setAllocator(eMemoryRessource::HostPinned, getCudaHostPinnedMemoryAllocator());
  mrm->setAllocator(eMemoryRessource::Device, getCudaDeviceMemoryAllocator());
  mrm->setCopier(&global_cuda_memory_copier);
  global_cuda_runtime.fillDevices(init_info.isVerbose());
}
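
/*
 * A minimal usage sketch (not part of this file; the names below assume the
 * public Arcane accelerator API): once this library has been loaded and the
 * function above has registered the CUDA runtime, application code can create
 * a runner backed by it, e.g.:
 *
 *   Arcane::Accelerator::Runner runner(Arcane::Accelerator::eExecutionPolicy::CUDA);
 *   auto queue = makeQueue(runner);
 *
 * RunQueue/RunCommand operations issued on such a queue are dispatched to the
 * CudaRunQueueStream / CudaRunnerRuntime implementations defined above.
 */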

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/