#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
namespace Arcane::Accelerator::impl
{

//! Class to privatize an instance by copy, so each work-item gets its own.
template <typename T>
class Privatizer
{
 public:

  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

 public:

  ARCCORE_HOST_DEVICE Privatizer(const T& o)
  : m_private_copy{ o }
  {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{ item };
}
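// Usage sketch of the privatizer pattern as the kernels below apply it:
// copying the user functor once per thread avoids accidental sharing when
// the lambda carries mutable state (for instance reduction variables).
//
// \code
// auto privatizer = privatize(func); // thread-local copy of the lambda
// auto& body = privatizer.privateCopy();
// body(i); // invoke the copy for work-item 'i'
// \endcode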
//! Class to apply finalization to the additional arguments (reducers, ...).
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments.
  template <typename... RemainingArgs>
  static inline ARCCORE_DEVICE void
  applyRemainingArgs(Int32 index, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItem(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments.
  template <typename... RemainingArgs>
  static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgs(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    (remaining_args._internalExecWorkItem(x), ...);
  }
#endif
};
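// Sketch of the contract assumed above: each additional argument exposes an
// internal per-work-item hook. The type below is hypothetical and only
// illustrates the expected _internalExecWorkItem() member; real reducer
// types are defined elsewhere in Arcane.
//
// \code
// struct ExampleReducerArg
// {
//   ARCCORE_DEVICE void _internalExecWorkItem(Int32 index)
//   {
//     // fold this work-item's partial value into the global result
//   }
// };
// \endcode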
#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}
template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}
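// Minimal launch sketch for the kernel above. The explicit launch shape is
// illustrative only; inside Arcane the launch goes through
// _applyKernelCUDA() below, which reads the shape from RunCommandLaunchInfo.
//
// \code
// auto f = [=] __device__ (Int32 i) { out[i] = 2.0 * in[i]; };
// Int32 t = 256;
// Int32 b = (n + t - 1) / t;
// doDirectGPULambda<void><<<b, t>>>(n, f);
// \endcode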
template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}
template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}
template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP
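// Note the call order in the "...2" variants above: the user lambda runs
// only for in-range work-items, while applyRemainingArgs() runs for every
// thread, since reducer finalization may rely on all threads participating.
// Per-thread sequence, with a hypothetical reducer 'r':
//
// \code
// // if (i < vsize)
// //   body(i, r);                                        // user code
// // KernelRemainingArgsHelper::applyRemainingArgs(i, r); // finalization
// \endcode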
#if defined(ARCANE_COMPILING_SYCL)

//! Functor applying a lambda over loop bounds (SYCL version).
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }

  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }

  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif
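// Unlike the CUDA/HIP paths, which use __global__ free functions, SYCL
// expresses kernels as callable objects handed to sycl::queue::parallel_for.
// Hedged sketch of the enqueue (simplified from _applyKernelSYCL() below;
// 'q', 'n', 'bounds' and 'func' are placeholders):
//
// \code
// sycl::queue q;
// DoDirectSYCLLambdaArrayBounds<LoopBoundType, Lambda> k;
// q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { k(i, bounds, func); });
// \endcode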
template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
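// Host-side fallback: the same lambda can run sequentially on the calling
// thread. Usage sketch with hypothetical bounds:
//
// \code
// doDirectThreadLambda(0, n, func); // calls the lambda for indices 0 .. n-1
// \endcode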
//! Fallback called when the kernel is not available for the current backend.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
/*!
 * \brief Generic function to launch a CUDA kernel.
 *
 * \param launch_info launch parameters (grid shape, stream, ...)
 * \param kernel CUDA kernel (a __global__ function)
 * \param func lambda executed by the kernel
 * \param args arguments of the lambda
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  auto tbi = launch_info.kernelLaunchArgs();
  Int32 b = tbi.nbBlockPerGrid();
  Int32 t = tbi.nbThreadPerBlock();
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  kernel<<<b, t, 0, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
/*!
 * \brief Generic function to launch a HIP kernel.
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                     const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  auto tbi = launch_info.kernelLaunchArgs();
  Int32 b = tbi.nbBlockPerGrid();
  Int32 t = tbi.nbThreadPerBlock();
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, b, t, 0, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
/*!
 * \brief Generic function to launch a SYCL kernel.
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    Int64 total_loop_size = launch_info.totalLoopSize();
    event = s.parallel_for(sycl::range<1>(total_loop_size), [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

} // namespace Arcane::Accelerator::impl
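// Hedged sketch tying the pieces together: a launcher selects the backend
// kernel through the ARCANE_KERNEL_*_FUNC macros from the top of this
// header, so when CUDA is not compiled in, impl::invalidKernel is
// substituted and the call throws instead of launching. The call below is
// illustrative, not a verbatim excerpt:
//
// \code
// // _applyKernelCUDA(launch_info,
// //                  ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2<void, decltype(func)>),
// //                  func, n);
// \endcode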
//! Parenthesis token, needed to defer macro expansion.
#define ARCANE_MACRO_PARENS ()

// Recursive macro expansion: each level rescans its argument three times,
// which bounds the number of reducer arguments that can be handled.
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

// Emits ", a1" for the first argument, then recurses on the rest.
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

//! Applies ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER to every argument.
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
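// Worked expansion, with hypothetical reducer variables 'r1' and 'r2'
// (assuming the helper emits ", a1" per argument, as reconstructed above):
//
// \code
// // ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
// // -> , r1 , r2
// \endcode
//
// i.e. the macro appends the reducers to an argument list, matching the
// RemainingArgs packs threaded through the kernels above.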
#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H