#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::Impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::Impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::Impl::InvalidKernelClass
#endif
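
// Usage sketch (illustrative, not part of the original header): these macros
// let the launcher name a backend kernel unconditionally. When the matching
// backend compiler is not active, the name resolves to the fallbacks
// Impl::invalidKernel (functions) or Impl::InvalidKernelClass (classes)
// defined later in this file, e.g.:
//
//   auto kernel = ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2) <ItemType, Lambda>;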
namespace Arcane::Accelerator::Impl
{
/*!
 * \brief Applies an operation on the additional kernel arguments at the
 * beginning and at the end of a CUDA/HIP kernel.
 */
class CudaHipKernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyAtBegin(Int32 index, RemainingArgs&... remaining_args)
  {
    (_doOneAtBegin(index, remaining_args), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyAtEnd(Int32 index, RemainingArgs&... remaining_args)
  {
    (_doOneAtEnd(index, remaining_args), ...);
  }

 private:

  template <typename OneArg> static inline ARCCORE_DEVICE void
  _doOneAtBegin(Int32 index, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    HandlerType::execWorkItemAtBeginForCudaHip(one_arg, index);
  }

  template <typename OneArg> static inline ARCCORE_DEVICE void
  _doOneAtEnd(Int32 index, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    HandlerType::execWorkItemAtEndForCudaHip(one_arg, index);
  }
};
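
// Contract sketch (illustrative, hypothetical types): each additional kernel
// argument is expected to expose a RemainingArgHandlerType providing the
// begin/end hooks called above, for example:
//
//   struct MyExtraArg;
//   struct MyHandler
//   {
//     static ARCCORE_DEVICE void execWorkItemAtBeginForCudaHip(MyExtraArg& a, Int32 index);
//     static ARCCORE_DEVICE void execWorkItemAtEndForCudaHip(MyExtraArg& a, Int32 index);
//   };
//   struct MyExtraArg
//   {
//     using RemainingArgHandlerType = MyHandler;
//   };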

#if defined(ARCANE_COMPILING_SYCL)
/*!
 * \brief Applies an operation on the additional kernel arguments at the
 * beginning and at the end of a SYCL kernel.
 */
class SyclKernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments at the beginning of the kernel.
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyAtBegin(sycl::nd_item<1> x, SmallSpan<std::byte> shm_view, RemainingArgs&... remaining_args)
  {
    (_doOneAtBegin(x, shm_view, remaining_args), ...);
  }

  //! Applies the functors of the additional arguments at the end of the kernel.
  template <typename... RemainingArgs> static inline void
  applyAtEnd(sycl::nd_item<1> x, SmallSpan<std::byte> shm_view, RemainingArgs&... remaining_args)
  {
    (_doOneAtEnd(x, shm_view, remaining_args), ...);
  }

 private:

  template <typename OneArg> static void
  _doOneAtBegin(sycl::nd_item<1> x, SmallSpan<std::byte> shm_memory, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    // Pass the shared-memory view only if the handler has an overload for it.
    if constexpr (requires { HandlerType::execWorkItemAtBeginForSycl(one_arg, x, shm_memory); })
      HandlerType::execWorkItemAtBeginForSycl(one_arg, x, shm_memory);
    else
      HandlerType::execWorkItemAtBeginForSycl(one_arg, x);
  }

  template <typename OneArg> static void
  _doOneAtEnd(sycl::nd_item<1> x, SmallSpan<std::byte> shm_memory, OneArg& one_arg)
  {
    using HandlerType = OneArg::RemainingArgHandlerType;
    if constexpr (requires { HandlerType::execWorkItemAtEndForSycl(one_arg, x, shm_memory); })
      HandlerType::execWorkItemAtEndForSycl(one_arg, x, shm_memory);
    else
      HandlerType::execWorkItemAtEndForSycl(one_arg, x);
  }
};
#endif

/*!
 * \brief Holds a private copy of an object, typically the user lambda,
 * so that each work-item mutates its own copy.
 */
template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o)
  : m_private_copy{ o }
  {}

  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T> ARCCORE_HOST_DEVICE
auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{ item };
}
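
// Usage sketch (this is exactly the pattern used by the kernels below): a
// per-work-item copy of the user lambda is made before invoking it, so any
// state it captures by value stays private to the work-item:
//
//   auto privatizer = privatize(func);
//   auto& body = privatizer.privateCopy();
//   body(i);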

#if defined(ARCANE_COMPILING_CUDA_OR_HIP)

template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  CudaHipKernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  CudaHipKernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  CudaHipKernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < vsize) {
    body(i, remaining_args...);
  }
  CudaHipKernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

#endif // ARCANE_COMPILING_CUDA_OR_HIP
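
// Index-mapping note: with t threads per block, the launcher is expected to
// round the grid up to b = (vsize + t - 1) / t blocks, so the kernels above
// see b*t >= vsize global indexes and the `i < vsize` (or `i < ids.size()`)
// guard discards the overhang. For example vsize = 1000 and t = 256 give
// b = 4 blocks with 24 inactive threads in the last block.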

#if defined(ARCANE_COMPILING_SYCL)

//! Functor implementing an indirect SYCL kernel.
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
                  SmallSpan<const Int32> ids, Lambda func,
                  RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
  }

  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif

template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}

//! Fallback function used when the kernel is not compiled for the target backend.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

//! Fallback class used when the kernel is not compiled for the target backend.
template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};

#if defined(ARCANE_COMPILING_CUDA)
template <typename... KernelArgs> inline void
_applyKernelCUDAVariadic(bool is_cooperative, const KernelLaunchArgs& tbi,
                         cudaStream_t& s, Int32 shared_memory,
                         const void* kernel_ptr, KernelArgs... args)
{
  void* all_args[] = { (reinterpret_cast<void*>(&args))... };
  if (is_cooperative)
    cudaLaunchCooperativeKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
  else
    cudaLaunchKernel(kernel_ptr, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), all_args, shared_memory, s);
}
#endif
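
// Note: both launch APIs take the kernel parameters as an array of pointers
// (all_args above); the pointed-to values must stay alive until the launch
// call returns, which holds here because `args` are by-value parameters of
// the enclosing function. cudaLaunchCooperativeKernel() is required when the
// kernel performs grid-wide synchronization and additionally requires the
// whole grid to be simultaneously resident on the device.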

//! Generic function to execute a CUDA kernel.
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 shared_memory = launch_info._sharedMemorySize();
  const void* kernel_ptr = reinterpret_cast<const void*>(kernel);
  auto tbi = launch_info._threadBlockInfo(kernel_ptr, shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  bool is_cooperative = launch_info._isUseCooperativeLaunch();
  bool use_cuda_launch = launch_info._isUseCudaLaunchKernel();
  if (use_cuda_launch || is_cooperative)
    _applyKernelCUDAVariadic(is_cooperative, tbi, s, shared_memory, kernel_ptr, args, func, other_args...);
  else
    kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), shared_memory, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}

//! Generic function to execute a HIP kernel.
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = launch_info._sharedMemorySize();
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}

//! Generic function to execute a SYCL kernel.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelSYCL(RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    Int32 wanted_shared_memory = launch_info._sharedMemorySize();
    event = s.submit([&](sycl::handler& cgh) {
      sycl::local_accessor<std::byte> shm_acc(sycl::range<1>(wanted_shared_memory), cgh);
      cgh.parallel_for(loop_size, [=](sycl::nd_item<1> i) {
        std::byte* shm_ptr = shm_acc.get_multi_ptr<sycl::access::decorated::no>().get();
        SmallSpan<std::byte> shm_view(shm_ptr, wanted_shared_memory);
        kernel(i, shm_view, args, func, remaining_args...);
      });
    });
  }
  else {
    sycl::range<1> loop_size(launch_info.totalLoopSize());
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

} // namespace Arcane::Accelerator::Impl

/*
 * Macros to apply another macro recursively to each argument of a variadic
 * macro (recursion through rescanning, as described at
 * https://www.scs.stanford.edu/~dm/blog/va-opt.html).
 */
#define ARCANE_MACRO_PARENS ()

#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
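
// Example expansion (illustrative): with two extra arguments,
//   ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
// expands, after the rescanning passes performed by ARCANE_MACRO_EXPAND, to
//   , decltype(r1)& r1 , decltype(r2)& r2
// i.e. one `, decltype(x)& x` fragment per argument, suitable for appending
// extra reducer parameters to the parameter list of a generated lambda.

#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H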