Arcane v4.1.0.0
User documentation
KernelLauncher.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* KernelLauncher.h                                            (C) 2000-2025 */
/*                                                                           */
/* Management of launching compute kernels on accelerators.                 */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}

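// Hedged usage sketch (not part of the original source): privatize() wraps
// the user lambda in a Privatizer holding a by-value copy, so each kernel
// invocation works on its own copy of the captured state. With a
// hypothetical lambda 'f':
// \code
// auto f = [](Int32 i) { return i * 2; };
// auto p = privatize(f);         // copies 'f' into the Privatizer
// auto& body = p.privateCopy();  // reference to the private copy
// body(0);                       // all calls go through the copy
// \endcode
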
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Class applying the finalization for the additional arguments.
 */
class KernelRemainingArgsHelper
{
 public:

  //! Apply the functors of the additional arguments at the start of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyRemainingArgsAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtBegin(index), ...);
  }

  //! Apply the functors of the additional arguments at the end of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyRemainingArgsAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtEnd(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Apply the functors of the additional arguments at the start of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgsAtBegin(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtBegin(x), ...);
  }

  //! Apply the functors of the additional arguments at the end of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgsAtEnd(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtEnd(x), ...);
  }
#endif
};
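
// Note (added explanation): the calls above rely on a C++17 comma fold
// expression. For two hypothetical reducer arguments r1 and r2,
// \code
// applyRemainingArgsAtBegin(index, r1, r2);
// // expands to:
// (r1._internalExecWorkItemAtBegin(index), r2._internalExecWorkItemAtBegin(index));
// \endcode
// i.e. each additional argument is visited exactly once, left to right.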

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    //if (i<10)
    //printf("CUDA %d lid=%d\n",i,lid.localId());
    body(BuilderType::create(i, lid));
  }
}

template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    //if (i<10)
    //printf("CUDA %d lid=%d\n",i,lid.localId());
    body(i);
  }
}

template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}

template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  // TODO: remove when the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;

  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove when the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;

  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove when the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;

  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP
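
// Note (added explanation): the launch configuration rounds the thread count
// up to a whole number of blocks, which is why every kernel above guards its
// body with a bound check. For example, with vsize = 1000 and 256 threads per
// block, 4 blocks (1024 threads) are launched and the 24 threads with
// i >= 1000 simply do nothing.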

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_SYCL)

//! N-dimensional loop without indirection
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};

//! 1D loop with indirection
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif
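
// Note (added explanation): each functor above provides two call operators.
// The sycl::nd_item<1> overload is used when additional arguments
// (reductions) are present, since they need work-item/work-group information
// at the begin and end of the work item; the sycl::id<1> overload covers the
// plain loop without additional arguments. The dispatch between the two is
// done in _applyKernelSYCL() below.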

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template<typename Lambda>
void doDirectThreadLambda(Integer begin,Integer size,Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for( Int32 i=0; i<size; ++i ){
    body(begin+i);
  }
}
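
// A minimal, hypothetical usage sketch of the sequential fallback above,
// applying the lambda to the indices [0, 100):
// \code
// doDirectThreadLambda(0, 100, [&](Integer i) { values[i] = 2 * i; });
// \endcode
// where 'values' is a hypothetical host array of at least 100 elements.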

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Empty function simulating an invalid kernel that was not compiled with
// the proper compiler. It should normally never be called.
template<typename Lambda,typename... LambdaArgs>
inline void invalidKernel(Lambda&,const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template<typename Lambda,typename... LambdaArgs>
class InvalidKernelClass
{
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi,
                         cudaStream_t& s, Int32 shared_memory,
                         const void* kernel_ptr, KernelArgs... args)
{
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
#endif
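
// Note (added explanation): cudaLaunchKernel() and
// cudaLaunchCooperativeKernel() take an array of pointers to the kernel
// arguments, in the same order as the kernel's parameter list. For the
// doDirectGPULambda2(Int32, Lambda, RemainingArgs...) kernel, the caller
// therefore passes (vsize, func, reducers...) and 'all_args' ends up holding
// { &vsize, &func, &reducer0, ... }.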

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a CUDA kernel.
 *
 * \param kernel CUDA kernel
 * \param func function to be executed by the kernel
 * \param args arguments of the lambda function
 *
 * TODO: Check that Lambda really is a function; the SFINAE form is hard to read:
 * typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
 * let's wait for C++20 concepts (requires)
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else {
    // TODO: use cudaLaunchKernel() instead.
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
  }
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
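
// A hedged sketch (not the actual Arcane call site, which lives in the
// RUNCOMMAND machinery) of how the launcher above is meant to be driven:
// the kernel is selected through ARCANE_KERNEL_CUDA_FUNC so that builds
// without CUDA fall back to invalidKernel().
// \code
// auto k = ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambdaArrayBounds)<LoopBoundType, Lambda>;
// _applyKernelCUDA(launch_info, k, func, bounds);
// \endcode
// 'launch_info', 'func' and 'bounds' are hypothetical variables of the types
// expected by the template parameters.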

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a HIP kernel.
 *
 * \param kernel HIP kernel
 * \param func function to be executed by the kernel
 * \param args arguments of the lambda function
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a SYCL kernel.
 *
 * \param kernel SYCL kernel
 * \param func function to be executed by the kernel
 * \param args arguments of the lambda function
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size = launch_info.totalLoopSize();
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}
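
// Note (added explanation): in the branch with additional arguments above,
// the global size of the sycl::nd_range is nbBlockPerGrid() *
// nbThreadPerBlock() and the work-group size is nbThreadPerBlock(). For
// example, 4 blocks of 256 threads give sycl::nd_range<1>(1024, 256):
// 1024 work-items split into work-groups of 256.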

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#define ARCANE_MACRO_PARENS ()

// The following three macros recursively generate a set of parameters.
// To support more parameters, additional calls to the next macro can be
// added inside each macro.
// More info here: https://stackoverflow.com/questions/70238923/how-to-expand-a-recursive-macro-via-va-opt-in-a-nested-context
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

/*
 * \brief Macro to generate the lambda arguments.
 *
 * This macro is internal to Arcane and must not be used outside of Arcane.
 *
 * For each argument \a arg, this macro generates a value `decltype(arg)& arg`.
 *
 * For example:
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(value1,value2)
 * // This generates the following code:
 * , decltype(value1)& value1 , decltype(value2)& value2
 * \endcode
 */
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
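
// Worked expansion trace (added explanation) for two hypothetical arguments:
// \code
// ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(a, b)
// // -> ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a, b))
// // -> , decltype(a)& a  ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ()(b)
// // -> , decltype(a)& a  ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(b)
// // -> , decltype(a)& a  , decltype(b)& b
// \endcode
// Each rescan performed by ARCANE_MACRO_EXPAND unfolds one level of the
// recursion, which bounds the number of supported arguments.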


/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif