12#ifndef ARCANE_ACCELERATOR_RUNQUEUEINTERNAL_H
13#define ARCANE_ACCELERATOR_RUNQUEUEINTERNAL_H
17#include "arcane/utils/CheckedConvert.h"
18#include "arcane/utils/LoopRanges.h"
20#include "arcane/accelerator/AcceleratorGlobal.h"
21#include "arcane/accelerator/RunCommandLaunchInfo.h"
23#if defined(ARCANE_COMPILING_HIP)
24#include <hip/hip_runtime.h>
26#if defined(ARCANE_COMPILING_SYCL)
27#include <sycl/sycl.hpp>
35#if defined(ARCANE_COMPILING_CUDA)
36#define ARCANE_KERNEL_CUDA_FUNC(a) a
38#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
41#if defined(ARCANE_COMPILING_HIP)
42#define ARCANE_KERNEL_HIP_FUNC(a) a
44#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
47#if defined(ARCANE_COMPILING_SYCL)
48#define ARCANE_KERNEL_SYCL_FUNC(a) a
50#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
56namespace Arcane::Accelerator::impl
// NOTE(review): fragment of the Privatizer<T> helper class — the template
// header and the `value_type` alias line are missing from this view.
// Visible purpose: hold a by-value copy of an object (typically the user
// lambda) so each kernel work-item works on its own private instance.
63 using reference_type = value_type&;
64 value_type m_private_copy;
// Copy-constructs the private instance; usable from host and device code.
66 ARCCORE_HOST_DEVICE
Privatizer(
const T& o) : m_private_copy{o} {}
// Returns a mutable reference to the private copy.
67 ARCCORE_HOST_DEVICE reference_type privateCopy() {
return m_private_copy; }
// Convenience factory: wraps \a item in a Privatizer<T>.
// NOTE(review): the template header and function body are missing from this
// view — presumably `{ return Privatizer<T>{item}; }`; confirm in the full file.
71ARCCORE_HOST_DEVICE
auto privatize(
const T& item) ->
Privatizer<T>
// NOTE(review): fragment — the enclosing helper class/struct and the function
// signature line (taking an Int32 `index`) are missing from this view.
// Visible behavior: fold expression invoking `_internalExecWorkItem(index)`
// on every reducer argument (device-only overload).
86 template <
typename... ReducerArgs>
static inline ARCCORE_DEVICE
void
90 (reducer_args._internalExecWorkItem(index), ...);
93#if defined(ARCANE_COMPILING_SYCL)
// NOTE(review): fragment — the signature line (taking a sycl::nd_item `x`,
// judging by the SYCL guard and the `x` argument below) is missing.
// Visible behavior: fold expression invoking `_internalExecWorkItem(x)` on
// every reducer argument; host+device qualified for SYCL compilation.
95 template <
typename... ReducerArgs>
static inline ARCCORE_HOST_DEVICE
void
99 (reducer_args._internalExecWorkItem(x), ...);
107#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
// CUDA/HIP kernel applying \a func to items designated by an index list.
// NOTE(review): the function name/parameter line is missing from this view —
// by symmetry with doIndirectGPULambda2 below it presumably is
// `doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)`; confirm.
109template <
typename BuilderType,
typename Lambda> __global__
void
// Take a per-thread private copy of the lambda before invoking it.
114 auto privatizer = privatize(func);
115 auto& body = privatizer.privateCopy();
// Flat 1-D global thread index with a bounds guard on the id list.
117 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
118 if (i < ids.size()) {
// NOTE(review): the declaration of `lid` (presumably `ids[i]`) is on a
// line missing from this view — confirm in the full file.
122 body(BuilderType::create(i, lid));
// CUDA/HIP kernel applying \a func to each index of a flat iteration space
// of \a vsize elements.
126template <
typename ItemType,
typename Lambda> __global__
void
127doDirectGPULambda(Int32 vsize, Lambda func)
// Take a per-thread private copy of the lambda before invoking it.
129 auto privatizer = privatize(func);
130 auto& body = privatizer.privateCopy();
// Flat 1-D global thread index.
132 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
// NOTE(review): the bounds guard (presumably `if (i < vsize)`) and the call
// to `body(i)` are on lines missing from this view — confirm in the full file.
// CUDA/HIP kernel applying \a func over a multi-dimensional loop described
// by \a bounds; the flat index is converted back to per-dimension indices
// via `bounds.getIndices(i)`.
140template <
typename LoopBoundType,
typename Lambda> __global__
void
141doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
// Take a per-thread private copy of the lambda before invoking it.
143 auto privatizer = privatize(func);
144 auto& body = privatizer.privateCopy();
// Flat 1-D global thread index with a bounds guard on the total element count.
146 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
147 if (i < bounds.nbElement()) {
148 body(bounds.getIndices(i));
// CUDA/HIP kernel: indirect (id-list) iteration with additional reducer
// arguments forwarded to the user lambda by value.
152template <
typename TraitsType,
typename Lambda,
typename... ReducerArgs> __global__
void
153doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, ReducerArgs... reducer_args)
155 using BuilderType = TraitsType::BuilderType;
// Take a per-thread private copy of the lambda before invoking it.
159 auto privatizer = privatize(func);
160 auto& body = privatizer.privateCopy();
// Flat 1-D global thread index with a bounds guard on the id list.
162 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
163 if (i < ids.size()) {
// NOTE(review): the declaration of `lid` (presumably `ids[i]`) is on a
// line missing from this view — confirm in the full file.
165 body(BuilderType::create(i, lid), reducer_args...);
// CUDA/HIP kernel: direct flat iteration of \a vsize elements with additional
// reducer arguments forwarded to the user lambda by value.
170template <
typename ItemType,
typename Lambda,
typename... ReducerArgs> __global__
void
171doDirectGPULambda2(Int32 vsize, Lambda func, ReducerArgs... reducer_args)
// Take a per-thread private copy of the lambda before invoking it.
174 auto privatizer = privatize(func);
175 auto& body = privatizer.privateCopy();
// Flat 1-D global thread index.
177 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
// NOTE(review): the bounds guard (presumably `if (i < vsize)`, fused line 178)
// is missing from this view — without it this call would be out of range for
// the tail threads; confirm the guard exists in the full file.
179 body(i, reducer_args...);
// CUDA/HIP kernel: multi-dimensional iteration described by \a bounds with
// additional reducer arguments forwarded to the user lambda by value.
184template <
typename LoopBoundType,
typename Lambda,
typename... ReducerArgs> __global__
void
185doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, ReducerArgs... reducer_args)
// Take a per-thread private copy of the lambda before invoking it.
188 auto privatizer = privatize(func);
189 auto& body = privatizer.privateCopy();
// Flat 1-D global thread index with a bounds guard on the total element count.
191 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
192 if (i < bounds.nbElement()) {
193 body(bounds.getIndices(i), reducer_args...);
206#if defined(ARCANE_COMPILING_SYCL)
// SYCL functor equivalent of doDirectGPULambdaArrayBounds / ...Bounds2:
// a class (SYCL kernels are callable objects, not __global__ functions)
// with two call operators — an nd_item overload used when reducer
// arguments are present, and an id overload for the plain case.
209template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
210class DoDirectSYCLLambdaArrayBounds
// nd_item overload: bounds-guarded call of the user lambda, then final
// application of the reducers via KernelReducerHelper.
214 void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... reducer_args)
const
216 auto privatizer = privatize(func);
217 auto& body = privatizer.privateCopy();
// Linear global work-item index.
219 Int32 i =
static_cast<Int32
>(x.get_global_id(0));
220 if (i < bounds.nbElement()) {
221 body(bounds.getIndices(i), reducer_args...);
// NOTE(review): unlike the guarded lambda call above, this runs for every
// work-item — presumably required for the reduction to complete; confirm.
223 KernelReducerHelper::applyReducerArgs(x, reducer_args...);
// id overload: plain bounds-guarded call, no reducers.
225 void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func)
const
227 auto privatizer = privatize(func);
228 auto& body = privatizer.privateCopy();
230 Int32 i =
static_cast<Int32
>(x);
231 if (i < bounds.nbElement()) {
232 body(bounds.getIndices(i));
// SYCL functor equivalent of doIndirectGPULambda / doIndirectGPULambda2:
// iterates an explicit id list and rebuilds the work item through
// TraitsType::BuilderType. Two call operators as above (with/without reducers).
238template <
typename TraitsType,
typename Lambda,
typename... ReducerArgs>
239class DoIndirectSYCLLambda
// nd_item overload: guarded lambda call, then reducer finalization.
243 void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, ReducerArgs... reducer_args)
const
245 using BuilderType = TraitsType::BuilderType;
247 auto privatizer = privatize(func);
248 auto& body = privatizer.privateCopy();
// Linear global work-item index, guarded against the id-list size.
250 Int32 i =
static_cast<Int32
>(x.get_global_id(0));
251 if (i < ids.size()) {
// NOTE(review): the declaration of `lid` (presumably `ids[i]`) is on a
// line missing from this view — confirm in the full file.
253 body(BuilderType::create(i, lid), reducer_args...);
// Runs for every work-item, outside the guard (reduction completion).
255 KernelReducerHelper::applyReducerArgs(x, reducer_args...);
// id overload: plain guarded call, no reducers.
257 void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func)
const
259 using BuilderType = TraitsType::BuilderType;
261 auto privatizer = privatize(func);
262 auto& body = privatizer.privateCopy();
264 Int32 i =
static_cast<Int32
>(x);
265 if (i < ids.size()) {
// NOTE(review): `lid` declaration missing from this view here as well.
267 body(BuilderType::create(i, lid));
// NOTE(review): fragment of the host (sequential) fallback — the function
// name/signature line is missing from this view; judging by the body it takes
// a size and a lambda and applies the lambda serially over [0, size).
277template<
typename Lambda>
280 auto privatizer = privatize(func);
281 auto& body = privatizer.privateCopy();
// Plain serial loop; the call to `body(i)` is on a line missing from this view.
283 for( Int32 i=0; i<size; ++i ){
// Fallback used by the ARCANE_KERNEL_*_FUNC macros when the matching backend
// is not compiled in (see the #define block at the top of the file).
// NOTE(review): the body (presumably an ARCANE_FATAL call) is on lines
// missing from this view.
293template<
typename Lambda,
typename... LambdaArgs>
294inline void invalidKernel(Lambda&,
const LambdaArgs&...)
// NOTE(review): fragment — this template header presumably introduces
// `InvalidKernelClass` (referenced by ARCANE_KERNEL_SYCL_FUNC above); the
// class definition itself is missing from this view.
299template<
typename Lambda,
typename... LambdaArgs>
// Launches a CUDA kernel on the stream held by \a launch_info.
// NOTE(review): the function name/first parameter line is missing from this
// view — by symmetry with the HIP/SYCL variants below it presumably is
// `_applyKernelCUDA(RunCommandLaunchInfo& launch_info, const CudaKernel& kernel,
//  Lambda& func, ...)`; confirm. Block/thread counts `b`/`t` are also declared
// on a missing line (presumably from launch_info.threadBlockInfo()).
319template <
typename CudaKernel,
typename Lambda,
typename LambdaArgs,
typename... RemainingArgs>
void
321 const LambdaArgs& args, [[maybe_unused]]
const RemainingArgs&... other_args)
323#if defined(ARCANE_COMPILING_CUDA)
// Retrieve the backend stream stored type-erased in the launch info.
325 cudaStream_t* s =
reinterpret_cast<cudaStream_t*
>(launch_info._internalStreamImpl());
// Triple-chevron launch on that stream; no dynamic shared memory.
327 kernel<<<b, t, 0, *s>>>(args, func, other_args...);
// Non-CUDA build: silence unused-parameter warnings, then fail hard.
329 ARCANE_UNUSED(launch_info);
330 ARCANE_UNUSED(kernel);
333 ARCANE_FATAL_NO_CUDA_COMPILATION();
// Launches a HIP kernel on the stream held by \a launch_info.
// NOTE(review): as with the CUDA variant, the function name/first parameter
// line and the declaration of `b`/`t` are missing from this view — confirm
// against the full file.
346template <
typename HipKernel,
typename Lambda,
typename LambdaArgs,
typename... RemainingArgs>
void
348 const LambdaArgs& args, [[maybe_unused]]
const RemainingArgs&... other_args)
350#if defined(ARCANE_COMPILING_HIP)
// Retrieve the backend stream stored type-erased in the launch info.
352 hipStream_t* s =
reinterpret_cast<hipStream_t*
>(launch_info._internalStreamImpl());
// HIP's portable launch macro (equivalent of the CUDA triple-chevron).
353 hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
// Non-HIP build: silence unused-parameter warnings, then fail hard.
355 ARCANE_UNUSED(launch_info);
356 ARCANE_UNUSED(kernel);
359 ARCANE_FATAL_NO_HIP_COMPILATION();
// Submits a SYCL kernel functor to the queue held by \a launch_info.
// NOTE(review): the function name/first parameter line and the declarations
// of `b`, `t`, `event` and the no-reducer `loop_size` are missing from this
// view — confirm against the full file.
372template <
typename SyclKernel,
typename Lambda,
typename LambdaArgs,
typename... ReducerArgs>
374 const LambdaArgs& args, [[maybe_unused]]
const ReducerArgs&... reducer_args)
376#if defined(ARCANE_COMPILING_SYCL)
// Retrieve the backend queue stored type-erased in the launch info.
377 sycl::queue* s =
reinterpret_cast<sycl::queue*
>(launch_info._internalStreamImpl());
// With reducers an nd_range launch is required (the nd_item overload of the
// kernel functor finalizes the reductions); otherwise a simple range launch.
379 if constexpr (
sizeof...(ReducerArgs) > 0) {
381 sycl::nd_range<1> loop_size(b * t, t);
382 event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
386 event = s->parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
// Record the completion event so the RunQueue can synchronize later.
388 launch_info._addSyclEvent(&event);
// Non-SYCL build: silence unused-parameter warnings, then fail hard.
390 ARCANE_UNUSED(launch_info);
391 ARCANE_UNUSED(kernel);
394 ARCANE_FATAL_NO_SYCL_COMPILATION();
// Deferred-expansion parentheses used by the recursive FOR_EACH machinery.
406#define ARCANE_MACRO_PARENS ()
// Rescanning trampoline: each EXPAND level forces the preprocessor to rescan,
// allowing the "recursive" HELPER/AGAIN pair below to iterate (3*3 = up to 9
// levels of expansion here).
412#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
413#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
414#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__
// Per-argument step: consumes `a1`, then (via __VA_OPT__ + deferred AGAIN)
// re-invokes itself on the remaining arguments.
// NOTE(review): the continuation line applying `a1` (fused line 417) is
// missing from this view — confirm against the full file.
416#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
418 __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))
// Deferred self-reference, only expanded on the next rescan pass.
420#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER
// Public entry point: applies the helper to each variadic argument (no-op
// when called with no arguments, thanks to __VA_OPT__).
436#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
437 __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Classe pour appliquer la finalisation des réductions.
static ARCCORE_DEVICE void applyReducerArgs(Int32 index, ReducerArgs &... reducer_args)
Applique les fonctors des arguments additionnels.
Int64 totalLoopSize() const
Taille totale de la boucle.
ThreadBlockInfo threadBlockInfo() const
Informations sur le nombre de block/thread/grille du noyau à lancer.
Vue d'un tableau d'éléments de type T.
Int32 LocalIdType
Type des entiers utilisés pour stocker les identifiants locaux des entités.
Int32 Integer
Type représentant un entier.
std::int32_t Int32
Type entier signé sur 32 bits.