Arcane  v4.1.0.0
Developer documentation
KernelLauncher.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* KernelLauncher.h                                            (C) 2000-2025 */
/*                                                                           */
/* Management of compute-kernel launching on accelerators.                   */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif

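// Illustrative sketch: these dispatch macros let launch code name a kernel
// unconditionally. Under the matching compiler the macro yields the kernel name
// itself; otherwise it yields the invalidKernel / InvalidKernelClass fallbacks
// defined later in this file, which abort if they are ever reached. The kernel
// name below is just an example taken from this file.
//
//   ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda)  // -> doDirectGPULambda with a CUDA
//                                               //    compiler, invalidKernel otherwise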
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}

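// Illustrative sketch of the pattern used by the kernels below: privatize() hands
// each work-item its own copy of the user lambda before invoking it.
//
//   auto privatizer = privatize(func);       // copy the lambda
//   auto& body = privatizer.privateCopy();   // reference to the private copy
//   body(i);                                 // run the work-item on the copy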
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Class used to apply the finalization of the additional arguments.
 */
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyRemainingArgsAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtBegin(index), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyRemainingArgsAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtEnd(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgsAtBegin(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtBegin(x), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgsAtEnd(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItemAtEnd(x), ...);
  }
#endif
};

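// Illustrative note: the fold expressions above expand over the parameter pack and
// evaluate left to right, so with two additional arguments r1 and r2 the call
//   applyRemainingArgsAtBegin(index, r1, r2);
// is equivalent to
//   r1._internalExecWorkItemAtBegin(index), r2._internalExecWorkItemAtBegin(index);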
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    //if (i<10)
    //printf("CUDA %d lid=%d\n",i,lid.localId());
    body(BuilderType::create(i, lid));
  }
}

template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    //if (i<10)
    //printf("CUDA %d lid=%d\n",i,lid.localId());
    body(i);
  }
}

template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}

template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP

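// Illustrative, self-contained CUDA sketch (hypothetical 'saxpy' kernel, not taken
// from Arcane) of the launch pattern the wrappers above rely on: one thread per
// element, with a bounds guard because the last block is usually only partially filled.
//
//   __global__ void saxpy(int n, float a, const float* x, float* y)
//   {
//     int i = blockDim.x * blockIdx.x + threadIdx.x; // global work-item index
//     if (i < n)                                     // guard against extra threads
//       y[i] = a * x[i] + y[i];
//   }
//
//   // Launch with enough blocks to cover all 'n' elements:
//   // int threads_per_block = 256;
//   // int blocks_per_grid = (n + threads_per_block - 1) / threads_per_block;
//   // saxpy<<<blocks_per_grid, threads_per_block>>>(n, a, x, y);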
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_SYCL)

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};

template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Empty function used to simulate an invalid kernel that was not compiled with
// the appropriate compiler. It should normally never be called.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi,
                         cudaStream_t& s, Int32 shared_memory,
                         const void* kernel_ptr, KernelArgs... args)
{
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
#endif

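// Illustrative note on the helper above: cudaLaunchKernel() and
// cudaLaunchCooperativeKernel() take a 'void**' in which each entry points to one
// kernel argument, which is why '&args...' is packed into 'all_args'. A hypothetical
// stand-alone equivalent for a kernel 'void my_kernel(int, float*)':
//
//   int n = 100;
//   float* data = nullptr; // assumed to be a valid device pointer
//   void* kernel_args[] = { &n, &data };
//   cudaLaunchKernel(reinterpret_cast<const void*>(my_kernel),
//                    blocks_per_grid, threads_per_block, kernel_args, 0, stream);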
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function used to execute a CUDA kernel.
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else {
    // TODO: use cudaLaunchKernel() instead.
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
  }
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function used to execute a HIP kernel.
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function used to execute a SYCL kernel.
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size = launch_info.totalLoopSize();
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#define ARCANE_MACRO_PARENS ()

// The three following macros are used to recursively generate a set of
// parameters. To support more parameters, additional calls to the next macro
// can be added in each macro.
// More information here: https://stackoverflow.com/questions/70238923/how-to-expand-a-recursive-macro-via-va-opt-in-a-nested-context
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

/*
 * \brief Macro used to generate the arguments of the lambda.
 *
 * This macro is internal to Arcane and must not be used outside of Arcane.
 *
 * For each argument \a arg, this macro generates a value `decltype(arg)& arg`.
 *
 * For example:
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(value1,value2)
 * // This generates the following code:
 * , decltype(value1)& value1 , decltype(value2)& value2
 * \endcode
 */
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))

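// Illustrative expansion (hypothetical reducer names): each argument becomes one
// 'decltype' reference parameter; ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN re-emits
// the helper and ARCANE_MACRO_EXPAND rescans it until every argument is consumed.
//
//   ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2, r3)
//   // expands to:
//   , decltype(r1)& r1 , decltype(r2)& r2 , decltype(r3)& r3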

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif