Arcane v4.1.1.0
Developer documentation
KernelLauncher.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* KernelLauncher.h                                            (C) 2000-2025 */
/*                                                                           */
/* Management of compute kernel launches on accelerators.                    */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::Impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::Impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::Impl::InvalidKernelClass
#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::Impl
{

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Class to apply an operation on the additional arguments at the
 * beginning and at the end of a CUDA or HIP kernel.
 */
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    (_doOneAtBegin(index, remaining_args), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    (_doOneAtEnd(index, remaining_args), ...);
  }

 private:

  template <typename OneArg> static inline ARCCORE_DEVICE void
  _doOneAtBegin(Int32 index, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    HandlerType::execWorkItemAtBeginForCudaHip(one_arg, index);
  }
  template <typename OneArg> static inline ARCCORE_DEVICE void
  _doOneAtEnd(Int32 index, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    HandlerType::execWorkItemAtEndForCudaHip(one_arg, index);
  }
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Class to apply an operation on the additional arguments at the
 * beginning and at the end of a SYCL kernel.
 */
class SyclKernelRemainingArgsHelper
{
 public:

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments at the beginning of the kernel
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyAtBegin(sycl::nd_item<1> x, SmallSpan<std::byte> shm_view,
               RemainingArgs&... remaining_args)
  {
    (_doOneAtBegin(x, shm_view, remaining_args), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel
  template <typename... RemainingArgs> static inline void
  applyAtEnd(sycl::nd_item<1> x, SmallSpan<std::byte> shm_view,
             RemainingArgs&... remaining_args)
  {
    (_doOneAtEnd(x, shm_view, remaining_args), ...);
  }

 private:

  template <typename OneArg> static void
  _doOneAtBegin(sycl::nd_item<1> x, SmallSpan<std::byte> shm_memory, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    if constexpr (requires { HandlerType::execWorkItemAtBeginForSycl(one_arg, x, shm_memory); })
      HandlerType::execWorkItemAtBeginForSycl(one_arg, x, shm_memory);
    else
      HandlerType::execWorkItemAtBeginForSycl(one_arg, x);
  }
  template <typename OneArg> static void
  _doOneAtEnd(sycl::nd_item<1> x, SmallSpan<std::byte> shm_memory, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    if constexpr (requires { HandlerType::execWorkItemAtEndForSycl(one_arg, x, shm_memory); })
      HandlerType::execWorkItemAtEndForSycl(one_arg, x, shm_memory);
    else
      HandlerType::execWorkItemAtEndForSycl(one_arg, x);
  }

#endif
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o)
  : m_private_copy{ o }
  {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{ item };
}
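
/*
 * Usage sketch (illustrative only, not part of this header's API): privatize()
 * gives each thread or work-item its own copy of the user lambda, so state
 * captured by value (for instance the old-style reduction objects mentioned in
 * the TODOs below) is not shared between threads. The kernels below follow
 * exactly this pattern.
 *
 * \code
 * auto user_lambda = [counter = 0](Int32 i) mutable { counter += i; };
 * auto privatizer = privatize(user_lambda); // copies the lambda
 * auto& body = privatizer.privateCopy();    // reference to the private copy
 * body(42);                                 // mutates only this copy
 * \endcode
 */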

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA_OR_HIP)

template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;

  KernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;

  KernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif // ARCANE_COMPILING_CUDA_OR_HIP

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_SYCL)

template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
                  SmallSpan<const Int32> ids, Lambda func,
                  RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
  }
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Empty function simulating an invalid kernel that was not compiled with the
// proper compiler. It should normally never be called.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

// Counterpart of invalidKernel for backends where the kernel is a class
// (used by ARCANE_KERNEL_SYCL_FUNC when SYCL is not compiled).
template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
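
/*
 * Illustrative sketch (not code from this header): the ARCANE_KERNEL_*_FUNC
 * macros defined at the top of this file always expand to a valid symbol, so
 * generic launch code compiles for every backend; the fallbacks above are only
 * reached if a backend that was not compiled is requested at run time.
 *
 * \code
 * // When ARCANE_COMPILING_CUDA is defined:
 * //   ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2)  ->  doDirectGPULambda2
 * // Otherwise:
 * //   ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2)
 * //     ->  Arcane::Accelerator::Impl::invalidKernel  (raises "Invalid kernel" if called)
 * \endcode
 */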

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi,
                         cudaStream_t& s, Int32 shared_memory,
                         const void* kernel_ptr, KernelArgs... args)
{
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a CUDA kernel.
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else {
    // TODO: use cudaLaunchKernel() instead.
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
  }
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a HIP kernel.
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a SYCL kernel.
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (IsAlwaysUseSyclNdItem<LambdaArgs>::value || sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    Int32 wanted_shared_memory = launch_info._sharedMemorySize();
    // TODO: check whether always using 'sycl::local_accessor' has a cost
    // even when no shared memory is needed.
    event = s.submit([&](sycl::handler& cgh) {
      sycl::local_accessor<std::byte> shm_acc(sycl::range<1>(wanted_shared_memory), cgh);
      cgh.parallel_for(loop_size, [=](sycl::nd_item<1> i) {
        std::byte* shm_ptr = shm_acc.get_multi_ptr<sycl::access::decorated::no>().get();
        kernel(i, SmallSpan<std::byte>(shm_ptr, wanted_shared_memory), args, func, remaining_args...);
      });
    });
    //event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size = launch_info.totalLoopSize();
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // namespace Arcane::Accelerator::Impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#define ARCANE_MACRO_PARENS ()

// The following three macros recursively generate a set of parameters.
// To support more parameters, add calls to the next macro inside each macro.
// More information here: https://stackoverflow.com/questions/70238923/how-to-expand-a-recursive-macro-via-va-opt-in-a-nested-context
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

/*
 * \brief Macro to generate the lambda arguments.
 *
 * This macro is internal to Arcane and must not be used outside of Arcane.
 *
 * For each argument \a arg, this macro generates `decltype(arg)& arg`.
 *
 * For example:
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(value1,value2)
 * // This generates the following code:
 * , decltype(value1)& value1 , decltype(value2)& value2
 * \endcode
 */
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
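
/*
 * Expansion trace (illustrative, with made-up argument names r1 and r2): the
 * recursion works because ARCANE_MACRO_PARENS delays the call to the ..._AGAIN
 * macro until the extra rescans forced by ARCANE_MACRO_EXPAND.
 *
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
 *   // first HELPER pass:
 *   -> , decltype(r1)& r1 ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN () (r2)
 *   // rescan: AGAIN() -> HELPER, second HELPER pass:
 *   -> , decltype(r1)& r1 , decltype(r2)& r2
 * \endcode
 */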

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif