#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
namespace Arcane::Accelerator::impl
{

//! Holds a private, per-work-item copy of the user functor.
template <typename T>
class Privatizer
{
 public:

  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}
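// Usage sketch (illustrative only; 'f' is a placeholder lambda): every kernel in
// this file makes its own copy of the user lambda before invoking it, so state
// captured by value stays private to the work-item.
//
//   auto f = [=] ARCCORE_HOST_DEVICE (Int32 i) { /* user code */ };
//   auto privatizer = privatize(f);        // copies 'f'
//   auto& body = privatizer.privateCopy(); // reference to the private copy
//   body(0);                               // call the private copy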
//! Applies the begin/end hooks of the additional kernel arguments (reducers, ...).
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs>
  static inline ARCCORE_DEVICE void applyRemainingArgsAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtBegin(index), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs>
  static inline ARCCORE_DEVICE void applyRemainingArgsAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtEnd(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments at the beginning of the kernel (SYCL).
  template <typename... RemainingArgs>
  static inline ARCCORE_HOST_DEVICE void applyRemainingArgsAtBegin(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtBegin(x), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel (SYCL).
  template <typename... RemainingArgs>
  static inline ARCCORE_HOST_DEVICE void applyRemainingArgsAtEnd(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtEnd(x), ...);
  }
#endif
};
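// The calls above are C++17 fold expressions over the comma operator. For two
// additional arguments r1 and r2 (two reducers, for instance),
//   applyRemainingArgsAtBegin(i, r1, r2);
// expands to
//   (r1._internalExecWorkItemAtBegin(i), r2._internalExecWorkItemAtBegin(i));
// i.e. each extra argument gets its per-work-item hook called, in order.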
#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

//! Indirect 1D kernel: iterates over a list of local ids.
template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}
//! Direct 1D kernel over [0, vsize).
template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}
//! Direct kernel over a multi-dimensional loop range.
template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}
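// Launch-geometry sketch (illustrative numbers, not taken from this file): the
// caller sizes the 1D grid so that nb_block * nb_thread >= nb_element, which is
// why every kernel guards with 'i < size' before calling the body.
//
//   Int32 nb_element = 1000;
//   Int32 nb_thread_per_block = 256;
//   Int32 nb_block = (nb_element + nb_thread_per_block - 1) / nb_thread_per_block; // 4
//   // 4 * 256 = 1024 threads are launched; threads 1000..1023 do nothing.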
//! Indirect 1D kernel with additional arguments (reducers, ...).
template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}
//! Direct 1D kernel over [0, vsize) with additional arguments.
template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}
//! Direct kernel over a multi-dimensional loop range with additional arguments.
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP
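// Note: in the '...2' variants the applyRemainingArgsAtBegin()/AtEnd() hooks are
// deliberately called outside the 'i < size' guard, so even threads that have no
// work item still participate. This is typically what block-wide operations
// carried by the additional arguments (reductions, for example) require.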
#if defined(ARCANE_COMPILING_SYCL)

//! SYCL functor for a direct loop over a multi-dimensional range.
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  //! Variant with additional arguments: needs a sycl::nd_item to access group information.
  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }

  //! Simple variant without additional arguments.
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};
//! SYCL functor for an indirect loop over a list of local ids.
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  //! Variant with additional arguments: needs a sycl::nd_item to access group information.
  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }

  //! Simple variant without additional arguments.
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif // ARCANE_COMPILING_SYCL
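// The two operator() overloads of the SYCL functors mirror the two launch forms
// used by the SYCL launcher below: the sycl::nd_item<1> overload is selected when
// additional arguments are present (it exposes the work-group needed by their
// begin/end hooks), while the sycl::id<1> overload serves the plain range form.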
//! Sequential host fallback: applies the lambda on [begin, begin+size).
template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
//! Fallback kernel used when the corresponding backend is not compiled: always fatal.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

//! Fallback kernel class used when SYCL is not compiled.
template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
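// How the fallbacks are reached (see the ARCANE_KERNEL_*_FUNC macros at the top of
// this file): when, say, CUDA is not being compiled, ARCANE_KERNEL_CUDA_FUNC(some_kernel)
// expands to impl::invalidKernel, so requesting the CUDA backend at run time raises a
// fatal error instead of silently doing nothing.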
#if defined(ARCANE_COMPILING_CUDA)
//! Launches a CUDA kernel through the runtime entry points instead of the <<<...>>> syntax.
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi, cudaStream_t& s,
                         Int32 shared_memory, const void* kernel_ptr, KernelArgs... args)
{
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
#endif
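// cudaLaunchKernel()/cudaLaunchCooperativeKernel() expect the kernel parameters as an
// array of pointers, one per parameter; the pack expansion above builds exactly that
// from the by-value copies held in 'args'. Illustration with placeholder names
// (n, func, reducer):
//   _applyKernelCUDAVariadic(is_coop, tbi, s, shm, kernel_ptr, n, func, reducer);
//   // inside, 'all_args' is equivalent to: void* all_args[] = { &n, &func, &reducer };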
//! Generic function to launch a CUDA kernel.
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
//! Generic function to launch a HIP kernel.
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
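// hipLaunchKernelGGL is HIP's portable replacement for the CUDA <<<...>>> launch
// syntax: the first five arguments are the kernel, grid size, block size, dynamic
// shared-memory size and stream, followed by the kernel's own arguments.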
//! Generic function to launch a SYCL kernel.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, const SyclKernel& kernel, const Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size(launch_info.totalLoopSize());
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}
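// These _applyKernel* wrappers are not meant to be called directly: the RunCommand
// loop machinery (e.g. RUNCOMMAND_LOOP / RUNCOMMAND_ENUMERATE) selects one of them
// according to the execution policy, passing the loop bounds or item list as 'args',
// the user lambda as 'func' and the optional extra arguments (reducers, ...) as the
// trailing pack. For SYCL, the returned sycl::event is registered with the
// RunCommandLaunchInfo so the command can later be synchronized.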
} // namespace Arcane::Accelerator::impl

#define ARCANE_MACRO_PARENS ()

// Nested re-scan macros used to force repeated expansion of the FOR_EACH helper.
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER
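// Expansion sketch: ARCANE_RUNCOMMAND_REDUCER_FOR_EACH (defined just below) applies
// the helper once per variadic argument. The helper emits its per-argument expansion
// for 'a1' and then, through the deferred ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN
// call that is re-scanned by ARCANE_MACRO_EXPAND, re-invokes itself on the remaining
// arguments; ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2) therefore processes r1, then r2.
// The nesting depth of the ARCANE_MACRO_EXPAND* macros bounds how many arguments can
// be handled.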
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))

#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H