#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"

// Select the real kernel wrapper when the matching backend is being compiled,
// otherwise fall back to the invalid-kernel stubs defined in this file.
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
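/*
 * Illustrative note (not additional API): with ARCANE_COMPILING_CUDA defined,
 * ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda) names the __global__ wrapper
 * declared below; without it, the same call site resolves to
 * Arcane::Accelerator::impl::invalidKernel, which raises a fatal error if it
 * is ever invoked. This lets call sites mention every backend unconditionally.
 */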
namespace Arcane::Accelerator::impl
{

//! Holds a private copy of a copyable object (typically the user lambda).
template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{ o } {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{ item };
}
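/*
 * Usage sketch: this is the pattern every kernel wrapper below follows to get
 * a per-work-item copy of the user lambda before invoking it.
 *
 *   auto privatizer = privatize(func);      // copy the lambda
 *   auto& body = privatizer.privateCopy();  // reference to the private copy
 *   body(work_item_index);                  // run the user code
 */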
//! Helper applying the finalization step of the additional kernel arguments.
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments.
  template <typename... RemainingArgs>
  static inline ARCCORE_DEVICE void
  applyRemainingArgs(Int32 index, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItem(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments.
  template <typename... RemainingArgs>
  static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgs(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItem(x), ...);
  }
#endif
};
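/*
 * Illustrative sketch (hypothetical type, not part of this header): anything
 * passed through RemainingArgs... only needs to expose _internalExecWorkItem()
 * for the index kinds used above, for instance:
 *
 *   class MyReducerHelper
 *   {
 *    public:
 *     ARCCORE_DEVICE void _internalExecWorkItem(Int32 index)
 *     {
 *       // finalize the contribution of this work-item (e.g. a reduction step)
 *     }
 *   };
 */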
#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}
template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}
template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}
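/*
 * Illustrative sketch (hypothetical type): LoopBoundType only needs to provide
 * nbElement() and getIndices(Int32); the loop-range types from
 * "arcane/utils/LoopRanges.h" are what is normally passed here. A minimal 1D
 * stand-in could look like:
 *
 *   struct MyBounds1D
 *   {
 *     Int32 m_size = 0;
 *     ARCCORE_HOST_DEVICE Int32 nbElement() const { return m_size; }
 *     ARCCORE_HOST_DEVICE Int32 getIndices(Int32 i) const { return i; }
 *   };
 */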
template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}
template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP
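/*
 * Illustrative launch sketch (hypothetical surrounding names, simplified from
 * the RUNCOMMAND_* machinery): once a block/thread configuration is known, a
 * direct kernel is launched with the user lambda as argument, e.g.
 *
 *   auto func = [=] ARCCORE_HOST_DEVICE(Int32 i) { out[i] = 2.0 * in[i]; };
 *   doDirectGPULambda<Int32, decltype(func)><<<nb_block, nb_thread>>>(n, func);
 *
 * In practice the launch goes through _applyKernelCUDA()/_applyKernelHIP()
 * defined below, which compute the configuration from RunCommandLaunchInfo.
 */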
#if defined(ARCANE_COMPILING_SYCL)

//! N-dimensional loop kernel for the SYCL backend.
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }

  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};
//! Kernel applying a lambda through an indirection list for the SYCL backend.
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }

  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif // ARCANE_COMPILING_SYCL
//! Sequential host fallback: applies the lambda directly on the current thread.
template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
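/*
 * Illustrative only: on the host path the lambda is simply applied in order,
 * e.g. doDirectThreadLambda(0, n, [&](Int32 i) { out[i] = in[i]; });
 */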
/*
 * Stub used when the corresponding backend is not compiled.
 * It should never be called; if it is, a fatal error is raised.
 */
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
/*!
 * \brief Generic function to launch a CUDA kernel.
 *
 * \param launch_info launch configuration
 * \param kernel CUDA kernel wrapper
 * \param func lambda to execute inside the kernel
 * \param args arguments passed to the kernel before the lambda
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, const Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 wanted_shared_memory = 0;
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
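/*
 * Call-site sketch (illustrative, heavily simplified from the RUNCOMMAND_*
 * machinery; the surrounding variable names are hypothetical):
 *
 *   auto func = [=] ARCCORE_HOST_DEVICE(Int32 i) { out[i] = 2.0 * in[i]; };
 *   _applyKernelCUDA(launch_info,
 *                    ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2) < Int32, decltype(func) >,
 *                    func, n);
 *
 * When CUDA is not being compiled, the macro substitutes invalidKernel and the
 * #else branch above reports the configuration error.
 */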
/*!
 * \brief Generic function to launch a HIP kernel.
 *
 * \param launch_info launch configuration
 * \param kernel HIP kernel wrapper
 * \param func lambda to execute inside the kernel
 * \param args arguments passed to the kernel before the lambda
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = 0;
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
/*!
 * \brief Generic function to launch a SYCL kernel.
 *
 * \param launch_info launch configuration
 * \param kernel SYCL kernel functor
 * \param func lambda to execute inside the kernel
 * \param args arguments passed to the kernel before the lambda
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, const Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::event event;
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size(launch_info.totalLoopSize());
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

} // namespace Arcane::Accelerator::impl
// The following macros apply a helper macro to every argument of a variadic
// macro, using __VA_OPT__-based recursion.
#define ARCANE_MACRO_PARENS ()

// Three levels of nested expansion provide enough rescans for the recursion.
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

//! Applies ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER to each variadic argument.
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
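/*
 * How the recursion unfolds (schematically): for
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(a, b), ARCANE_MACRO_EXPAND rescans the
 * expansion repeatedly, so ..._HELPER(a, b) emits its per-argument expansion
 * for `a` and leaves ..._AGAIN ()(b); the next rescan turns that back into
 * ..._HELPER(b), and so on until __VA_OPT__ drops the recursion step when no
 * arguments remain. The nested EXPAND/EXPAND2/EXPAND1 macros exist only to
 * provide enough rescans for this to terminate.
 */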
#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H