#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"
#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
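/*
 * These macros let a call site name a backend kernel even when that backend
 * is not compiled in: the name then resolves to invalidKernel() (or to the
 * InvalidKernelClass type for SYCL), which fails at runtime if actually used.
 * Hypothetical call site, for illustration only:
 *
 * \code
 * _applyKernelCUDA(launch_info,
 *                  ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2) < ItemType, Lambda >,
 *                  func, args);
 * \endcode
 */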
namespace Arcane::Accelerator::impl
{

template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

//! Return a private copy of \a item for the current work-item.
template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}
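/*
 * The kernels below all follow the same idiom: copy the user lambda once per
 * work-item through privatize() so that any state captured by value stays
 * private to that work-item. Minimal sketch of the idiom (illustrative only):
 *
 * \code
 * // 'func' is the user lambda received by value by a kernel:
 * auto privatizer = privatize(func);
 * auto& body = privatizer.privateCopy();
 * body(i); // runs on this work-item's own copy of the lambda
 * \endcode
 */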
//! Helper class applying the finalization of the additional arguments.
class KernelRemainingArgsHelper
{
 public:

  //! Apply the functors of the additional arguments.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyRemainingArgs(Int32 index, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItem(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Apply the functors of the additional arguments.
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgs(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItem(x), ...);
  }
#endif
};
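/*
 * Illustrative expansion of the fold expression above: with two additional
 * arguments r1 and r2 (typically reducers), applyRemainingArgs(index, r1, r2)
 * is equivalent to:
 *
 * \code
 * r1._internalExecWorkItem(index);
 * r2._internalExecWorkItem(index);
 * \endcode
 */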
#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}
template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}
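/*
 * Launch sketch (hypothetical sizes, not the Arcane launch API): these 1D
 * kernels are launched with enough blocks to cover every work-item, and the
 * 'i < vsize' guard discards the surplus threads of the last block:
 *
 * \code
 * Int32 vsize = 1000;
 * Int32 nb_thread_per_block = 256;
 * Int32 nb_block = (vsize + nb_thread_per_block - 1) / nb_thread_per_block; // = 4
 * doDirectGPULambda<ItemType><<<nb_block, nb_thread_per_block>>>(vsize, func);
 * \endcode
 */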
template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}
template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}
template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP
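/*
 * Note on the '*2' variants above: KernelRemainingArgsHelper::applyRemainingArgs()
 * is deliberately called outside the 'i < size' guard so that every thread of
 * the block reaches it, since block-level reduction primitives generally
 * require all threads of a block to participate. Per-thread flow, assuming a
 * single reducer argument r (illustrative only):
 *
 * \code
 * if (i < vsize)
 *   body(i, r); // only in-range threads accumulate a value
 * KernelRemainingArgsHelper::applyRemainingArgs(i, r); // all threads finalize
 * \endcode
 */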
#if defined(ARCANE_COMPILING_SYCL)
//! Class to apply a SYCL kernel with loop bounds.
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  //! Version with additional arguments (reductions).
  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }

  //! Version without additional arguments.
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};
//! Class to apply a SYCL kernel over an indirect list of item local ids.
template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  //! Version with additional arguments (reductions).
  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }

  //! Version without additional arguments.
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif // ARCANE_COMPILING_SYCL
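/*
 * Unlike the CUDA/HIP kernels above, the SYCL variants are functor classes,
 * since SYCL submits kernels as callables. The nd_item<1> overloads serve the
 * path with additional arguments (reductions), which needs the work-group
 * structure; the id<1> overloads serve the plain data-parallel path.
 * Hypothetical submission, for illustration only (the real path goes through
 * _applyKernelSYCL() below):
 *
 * \code
 * sycl::queue q;
 * DoDirectSYCLLambdaArrayBounds<LoopBoundType, Lambda> k;
 * q.parallel_for(sycl::range<1>(n), [=](sycl::id<1> i) { k(i, bounds, func); });
 * \endcode
 */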
//! Sequential fallback executed on the current thread.
template <typename Lambda>
void doDirectThreadLambda(Int32 size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(i);
  }
}
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}
template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};
//! Generic function to launch a CUDA kernel.
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  auto [b, t] = launch_info.threadBlockInfo();
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  kernel<<<b, t, 0, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}
//! Generic function to launch a HIP kernel.
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  auto [b, t] = launch_info.threadBlockInfo();
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, b, t, 0, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
//! Generic function to launch a SYCL kernel.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    // With additional arguments (reductions), an nd_range kernel is needed
    // so the work-group structure is available to finalize them.
    auto [b, t] = launch_info.threadBlockInfo();
    sycl::nd_range<1> loop_size(sycl::range<1>(b * t), sycl::range<1>(t));
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size(launch_info.totalLoopSize());
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

} // namespace Arcane::Accelerator::impl
#define ARCANE_MACRO_PARENS ()

#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__
//! For each additional argument, emit a corresponding lambda parameter.
//! (The per-argument emission below is an assumed reconstruction: one
//! reference parameter per argument, named like the argument.)
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER
//! Apply ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER to every argument of the list.
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
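/*
 * Expansion sketch, assuming the per-argument emission used in
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER above: with two reducers,
 *
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
 * // -> , decltype(r1)& r1 , decltype(r2)& r2
 * \endcode
 *
 * ARCANE_MACRO_PARENS and ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN defer each
 * recursive call so it is only expanded on a later rescan; ARCANE_MACRO_EXPAND
 * supplies 3 * 3 * 3 = 27 rescans, which bounds the number of supported
 * arguments.
 */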
#endif // ARCANE_ACCELERATOR_KERNELLAUNCHER_H