#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::Impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::Impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::Impl::InvalidKernelClass
#endif
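
// These selection macros let backend-agnostic launch code name a kernel
// unconditionally: when the corresponding backend is being compiled the macro
// yields the kernel itself, otherwise it maps to the invalidKernel() /
// InvalidKernelClass fallbacks defined below, which must never be reached at
// runtime. Hypothetical illustration:
//
//   ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2)
//   // -> doDirectGPULambda2 when compiling for CUDA, invalidKernel otherwise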
namespace Arcane::Accelerator::Impl
{

/*!
 * \brief Class applying an operation for the additional arguments at the
 * beginning and at the end of a CUDA/HIP kernel.
 */
class CudaHipKernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    (_doOneAtBegin(index, remaining_args), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    (_doOneAtEnd(index, remaining_args), ...);
  }

 private:

  template <typename OneArg> static inline ARCCORE_DEVICE void
  _doOneAtBegin(Int32 index, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    HandlerType::execWorkItemAtBeginForCudaHip(one_arg, index);
  }

  template <typename OneArg> static inline ARCCORE_DEVICE void
  _doOneAtEnd(Int32 index, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    HandlerType::execWorkItemAtEndForCudaHip(one_arg, index);
  }
};
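
// Each element of RemainingArgs (typically a reducer or another accumulator
// passed along with the loop body) is expected to expose a
// RemainingArgHandlerType; the fold expressions above dispatch the
// begin-of-kernel and end-of-kernel hooks to every such handler for the
// current work-item index.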
#if defined(ARCANE_COMPILING_SYCL)

/*!
 * \brief Class applying an operation for the additional arguments at the
 * beginning and at the end of a SYCL kernel.
 */
class SyclKernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyAtBegin(sycl::nd_item<1> x, SmallSpan<std::byte> shm_view,
               RemainingArgs&... remaining_args)
  {
    (_doOneAtBegin(x, shm_view, remaining_args), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs> static inline void
  applyAtEnd(sycl::nd_item<1> x, SmallSpan<std::byte> shm_view,
             RemainingArgs&... remaining_args)
  {
    (_doOneAtEnd(x, shm_view, remaining_args), ...);
  }

 private:

  template <typename OneArg> static void
  _doOneAtBegin(sycl::nd_item<1> x, SmallSpan<std::byte> shm_memory, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    if constexpr (requires { HandlerType::execWorkItemAtBeginForSycl(one_arg, x, shm_memory); })
      HandlerType::execWorkItemAtBeginForSycl(one_arg, x, shm_memory);
    else
      HandlerType::execWorkItemAtBeginForSycl(one_arg, x);
  }

  template <typename OneArg> static void
  _doOneAtEnd(sycl::nd_item<1> x, SmallSpan<std::byte> shm_memory, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    if constexpr (requires { HandlerType::execWorkItemAtEndForSycl(one_arg, x, shm_memory); })
      HandlerType::execWorkItemAtEndForSycl(one_arg, x, shm_memory);
    else
      HandlerType::execWorkItemAtEndForSycl(one_arg, x);
  }
};

#endif
template <typename T>
class Privatizer
{
 public:
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o)
  : m_private_copy{ o }
  {}

  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{ item };
}
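
// The kernels below call privatize(func) so that each work-item (or host
// thread) works on its own copy of the user lambda rather than on the shared
// launch argument. A minimal, hypothetical illustration of the idiom
// (my_lambda is a placeholder):
//
//   auto privatizer = privatize(my_lambda);  // copy-constructs the lambda
//   auto& body = privatizer.privateCopy();   // reference to the private copy
//   body(0);                                 // invoke as usual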
#if defined(ARCANE_COMPILING_CUDA_OR_HIP)

template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  CudaHipKernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  CudaHipKernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  CudaHipKernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  CudaHipKernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

#endif // ARCANE_COMPILING_CUDA_OR_HIP
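
// doIndirectGPULambda2 walks an explicit list of local ids (indirect indexing,
// as used for item groups), while doDirectGPULambda2 covers the contiguous
// range [0, vsize). A minimal, hypothetical direct launch with no additional
// arguments (nb_block, nb_thread, stream, n and my_lambda are placeholders):
//
//   doDirectGPULambda2<Int32, decltype(my_lambda)>
//       <<<nb_block, nb_thread, 0, stream>>>(n, my_lambda);
//
// In this header the launch is normally performed by _applyKernelCUDA() or
// _applyKernelHIP() below, which derive the block/thread configuration from
// RunCommandLaunchInfo.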
#if defined(ARCANE_COMPILING_SYCL)

//! Indirect loop functor for the SYCL backend.
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
                  SmallSpan<const Int32> ids, Lambda func,
                  RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
  }

  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif
template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
//! Fallbacks used when the requested backend is not compiled in.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
#if defined(ARCANE_COMPILING_CUDA)
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi, cudaStream_t& s,
                         Int32 shared_memory, const void* kernel_ptr, KernelArgs... args)
{
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
#endif
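
// cudaLaunchKernel() and cudaLaunchCooperativeKernel() expect the kernel
// arguments as an array holding one void* per formal parameter, which is why
// the variadic pack is expanded into 'all_args' here instead of being passed
// through the <<<...>>> launch syntax used in _applyKernelCUDA() below.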
/*!
 * \brief Generic function launching a CUDA kernel.
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
/*!
 * \brief Generic function launching a HIP kernel.
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
/*!
 * \brief Generic function launching a SYCL kernel.
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    Int32 wanted_shared_memory = launch_info._sharedMemorySize();
    event = s.submit([&](sycl::handler& cgh) {
      sycl::local_accessor<std::byte> shm_acc(sycl::range<1>(wanted_shared_memory), cgh);
      cgh.parallel_for(loop_size, [=](sycl::nd_item<1> i) {
        std::byte* shm_ptr = shm_acc.get_multi_ptr<sycl::access::decorated::no>().get();
        kernel(i, SmallSpan<std::byte>(shm_ptr, wanted_shared_memory), args, func, remaining_args...);
      });
    });
  }
  else {
    sycl::range<1> loop_size = launch_info.totalLoopSize();
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}
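
// The nd_range branch is taken as soon as additional arguments are present:
// SyclKernelRemainingArgsHelper receives the sycl::nd_item (required for
// group-level operations) together with a view of work-group local memory
// obtained from the local_accessor, neither of which is available in the
// simpler sycl::range / sycl::id form used when there are no such arguments.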
} // namespace Arcane::Accelerator::Impl

#define ARCANE_MACRO_PARENS ()

#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER
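
// The HELPER/AGAIN pair combined with the bounded recursion of
// ARCANE_MACRO_EXPAND implements a preprocessor "for each" over __VA_ARGS__.
// Hypothetical expansion, assuming the per-argument payload
// ", decltype(aN)& aN" shown above:
//
//   ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
//
// (defined just below) expands to ", decltype(r1)& r1 , decltype(r2)& r2",
// i.e. the extra reducer references appended by the RUNCOMMAND_* loop macros
// to the lambda parameter list.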
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))

#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H