#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
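// When the translation unit is not compiled with the matching device
// compiler, these macros swap the real kernel for invalidKernel /
// InvalidKernelClass (defined below) so that the code still compiles and
// fails loudly at runtime instead. A sketch of the dispatch (the lambda
// type 'MyLambda' is hypothetical):
//
//   auto k = ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2)<Int32, MyLambda>;
//   // -> doDirectGPULambda2<Int32, MyLambda> when compiled by the CUDA
//   //    compiler,
//   // -> invalidKernel (which raises a fatal error if called) otherwise.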
namespace Arcane::Accelerator::impl
{

//! Per-thread private copy of an object (typically the user lambda).
template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

//! Makes a private copy of \a item for the current thread / work-item.
template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}
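// Example: privatize() is applied to the user lambda at the top of each
// kernel below, so that every thread works on its own copy rather than on
// the shared kernel parameter:
//
//   auto privatizer = privatize(func);
//   auto& body = privatizer.privateCopy();
//   body(i); // call the per-thread copy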
//! Applies begin/end processing of the additional kernel arguments.
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at kernel begin.
  template <typename... RemainingArgs>
  static inline ARCCORE_DEVICE void
  applyRemainingArgsAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtBegin(index), ...);
  }

  //! Applies the functors of the additional arguments at kernel end.
  template <typename... RemainingArgs>
  static inline ARCCORE_DEVICE void
  applyRemainingArgsAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtEnd(index), ...);
  }
#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments at kernel begin (SYCL).
  template <typename... RemainingArgs>
  static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgsAtBegin(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtBegin(x), ...);
  }

  //! Applies the functors of the additional arguments at kernel end (SYCL).
  template <typename... RemainingArgs>
  static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgsAtEnd(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItemAtEnd(x), ...);
  }
#endif
};
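// The RemainingArgs types are expected to provide the two hooks used above.
// A minimal sketch of that implicit interface (the class name 'MyReducer'
// is hypothetical):
//
//   class MyReducer
//   {
//    public:
//     ARCCORE_HOST_DEVICE void _internalExecWorkItemAtBegin(Int32 index);
//     ARCCORE_HOST_DEVICE void _internalExecWorkItemAtEnd(Int32 index);
//     // plus sycl::nd_item<1> overloads when compiled for SYCL
//   };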
#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}
template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}
template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}
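// Example (sketch): with a 2D LoopBoundType of extents (4,3), nbElement()
// is 12 and getIndices(i) turns the flat thread index back into a
// (row, column)-style index tuple (row-major in this sketch), so body()
// receives one multi-dimensional index per element.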
template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}
template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  KernelRemainingArgsHelper::applyRemainingArgsAtBegin(i, remaining_args...);
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgsAtEnd(i, remaining_args...);
}
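// Note: applyRemainingArgsAtBegin()/applyRemainingArgsAtEnd() are invoked by
// every thread, including the out-of-range ones, presumably because the
// additional arguments (block-wide reductions, ...) may require all threads
// of a block to take part in synchronization.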
#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP

#if defined(ARCANE_COMPILING_SYCL)

//! 1D loop over array bounds (SYCL).
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }

  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};
//! 1D loop with indirection through a list of local ids (SYCL).
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    KernelRemainingArgsHelper::applyRemainingArgsAtBegin(x, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgsAtEnd(x, remaining_args...);
  }

  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};
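// Unlike CUDA/HIP, SYCL has no __global__ entry points: a kernel is any
// callable submitted to a queue. That is why the SYCL variants above are
// functor classes whose operator() receives either a sycl::nd_item<1>
// (when the additional arguments need work-group information) or a plain
// sycl::id<1>.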
#endif // ARCANE_COMPILING_SYCL

//! Sequential host fallback: applies \a func on the range [begin, begin+size).
template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
//! Empty function simulating a kernel that is invalid because it was not
//! compiled with the proper compiler. Should never be called.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

//! Class simulating a kernel that is invalid because it was not compiled
//! with the proper compiler.
template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
#if defined(ARCANE_COMPILING_CUDA)
//! Launches a CUDA kernel through the driver entry points instead of <<<...>>>.
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi, cudaStream_t& s,
                         Int32 shared_memory, const void* kernel_ptr, KernelArgs... args)
{
  // cudaLaunchKernel() expects an array of pointers, one per kernel argument.
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
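// cudaLaunchKernel()/cudaLaunchCooperativeKernel() receive the kernel
// arguments as an array of pointers to each formal parameter. The launch
// above is therefore equivalent in spirit to (sketch, with two arguments
// 'a' and 'b'):
//
//   void* p[] = { &a, &b };
//   cudaLaunchKernel(kernel_ptr, grid_dim, block_dim, p, shared_memory, s);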
#endif

//! Generic function to execute a CUDA kernel.
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
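// A sketch of a call site, as generated by the RUNCOMMAND machinery (the
// variable names are illustrative):
//
//   _applyKernelCUDA(launch_info,
//                    ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambdaArrayBounds2)<LoopBoundType, Lambda>,
//                    func, bounds);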
//! Generic function to execute a HIP kernel.
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
//! Generic function to execute a SYCL kernel.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size(launch_info.totalLoopSize());
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

} // namespace Arcane::Accelerator::impl
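// Design note: SYCL submissions are asynchronous; _applyKernelSYCL registers
// the sycl::event returned by parallel_for() with the RunCommandLaunchInfo,
// presumably so that the RunCommand can later wait on or query the kernel's
// completion.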
// Macros to apply another macro recursively over a variadic argument list
// (the classic preprocessor "FOR_EACH" technique).
#define ARCANE_MACRO_PARENS ()
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

//! Internal macro for ARCANE_RUNCOMMAND_REDUCER_FOR_EACH.
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

//! Expands to ", decltype(v)& v" for each argument, to forward the
//! additional arguments of a RUNCOMMAND to the generated lambda.
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
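// Worked example (sketch): with two reducers 'r1' and 'r2',
//
//   ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
//
// expands, after the ARCANE_MACRO_EXPAND re-scans, to:
//
//   , decltype(r1)& r1 , decltype(r2)& r2
//
// which is spliced into the parameter list of the lambda generated for the
// RUNCOMMAND.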
#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H