#ifndef ARCANE_ACCELERATOR_RUNQUEUEINTERNAL_H
#define ARCANE_ACCELERATOR_RUNQUEUEINTERNAL_H

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"

#if defined(ARCANE_COMPILING_HIP)
#include <hip/hip_runtime.h>
#endif
#if defined(ARCANE_COMPILING_SYCL)
#include <sycl/sycl.hpp>
#endif

#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
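
/*!
 * Usage sketch (illustrative, not code from this file): call sites name a
 * backend kernel through these macros so the same expression compiles with
 * any backend; when CUDA is not compiled in, the name resolves to
 * invalidKernel instead, which throws at run time.
 * \code
 * // 'MyLambda' is a hypothetical functor type used only for this example.
 * auto kernel = ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda<Int32, MyLambda>);
 * \endcode
 */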

namespace Arcane::Accelerator::impl
{

//! Helper giving each thread its own private copy of the user functor.
template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}
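
/*!
 * Usage sketch: every kernel below takes a per-thread private copy of the
 * user lambda before invoking it, following this pattern:
 * \code
 * auto privatizer = privatize(func); // copy the lambda to private storage
 * auto& body = privatizer.privateCopy();
 * body(i); // call the private copy, never the original
 * \endcode
 */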

//! Class applying the finalization of the reductions.
class KernelReducerHelper
{
 public:

  //! Apply the functors of the additional reducer arguments.
  template <typename... ReducerArgs> static inline ARCCORE_DEVICE void
  applyReducerArgs(Int32 index, ReducerArgs&... reducer_args)
  {
    // Apply each reduction via a fold expression
    (reducer_args._internalExecWorkItem(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Apply the functors of the additional reducer arguments.
  template <typename... ReducerArgs> static inline ARCCORE_HOST_DEVICE void
  applyReducerArgs(sycl::nd_item<1> x, ReducerArgs&... reducer_args)
  {
    // Apply each reduction via a fold expression
    (reducer_args._internalExecWorkItem(x), ...);
  }
#endif
};

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}

template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}
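
/*!
 * Launch sketch (hypothetical values, not the Arcane launch path): a direct
 * kernel covers 'vsize' items with a 1D grid sized to the work:
 * \code
 * Int32 threads_per_block = 256; // assumed block size
 * Int32 nb_block = (vsize + threads_per_block - 1) / threads_per_block;
 * doDirectGPULambda<Int32, decltype(func)><<<nb_block, threads_per_block>>>(vsize, func);
 * \endcode
 * The 'i < vsize' guard then discards the surplus threads of the last block.
 */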

template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}
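
/*!
 * Index-mapping sketch: for a two-dimensional LoopBoundType of extents
 * (n0, n1), nbElement() is n0*n1 and getIndices(i) conceptually unflattens
 * the linear thread index:
 * \code
 * Int32 i0 = i / n1; // slowest-varying index (illustration only)
 * Int32 i1 = i % n1; // fastest-varying index
 * \endcode
 */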

template <typename TraitsType, typename Lambda, typename... ReducerArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, ReducerArgs... reducer_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), reducer_args...);
  }
  // Must be executed by every thread, even those outside the range
  KernelReducerHelper::applyReducerArgs(i, reducer_args...);
}
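
/*!
 * Indirect iteration note: 'ids' is a gather list of local ids, so thread i
 * works on item ids[i] rather than item i. With a hypothetical
 * ids = {4, 7, 9}, threads 0..2 build work items for local ids 4, 7 and 9;
 * every thread, in range or not, still reaches applyReducerArgs() so the
 * reduction is finalized uniformly.
 */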

template <typename ItemType, typename Lambda, typename... ReducerArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, ReducerArgs... reducer_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i, reducer_args...);
  }
  // Must be executed by every thread, even those outside the range
  KernelReducerHelper::applyReducerArgs(i, reducer_args...);
}

template <typename LoopBoundType, typename Lambda, typename... ReducerArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, ReducerArgs... reducer_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), reducer_args...);
  }
  // Must be executed by every thread, even those outside the range
  KernelReducerHelper::applyReducerArgs(i, reducer_args...);
}

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP

#if defined(ARCANE_COMPILING_SYCL)

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  //! Version with reducer arguments, launched on an nd_range
  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... reducer_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), reducer_args...);
    }
    KernelReducerHelper::applyReducerArgs(x, reducer_args...);
  }

  //! Version without reducer arguments, launched on a plain range
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};

template <typename TraitsType, typename Lambda, typename... ReducerArgs>
class DoIndirectSYCLLambda
{
 public:

  //! Version with reducer arguments, launched on an nd_range
  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, ReducerArgs... reducer_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), reducer_args...);
    }
    KernelReducerHelper::applyReducerArgs(x, reducer_args...);
  }

  //! Version without reducer arguments, launched on a plain range
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;

    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif // ARCANE_COMPILING_SYCL
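
/*!
 * Dispatch note: the sycl::nd_item<1> overloads are used when the command
 * carries reducer arguments, since applyReducerArgs() needs the work-group
 * structure of an nd_range launch; the sycl::id<1> overloads serve plain
 * range launches (see the if constexpr selection in _applyKernelSYCL below).
 */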

template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
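
/*!
 * Host fallback sketch (hypothetical begin/size values): the same lambda can
 * run sequentially on the calling thread:
 * \code
 * auto func = [](Int32 i) { std::cout << i << '\n'; };
 * doDirectThreadLambda(0, 100, func); // processes items 0..99
 * \endcode
 */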

//! Empty kernel used when the file is not compiled with the right device compiler.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};

//! Generic function launching a CUDA kernel.
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                      const LambdaArgs& args, const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  auto [b, t] = launch_info.threadBlockInfo();
  cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
  kernel<<<b, t, 0, *s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}

//! Generic function launching a HIP kernel.
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, Lambda& func,
                     const LambdaArgs& args, const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  auto [b, t] = launch_info.threadBlockInfo();
  hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
  hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}
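
/*!
 * Call sketch (simplified, hypothetical arguments): the RUNCOMMAND machinery
 * pairs each launch helper with the matching kernel-selection macro, roughly:
 * \code
 * _applyKernelCUDA(launch_info,
 *                  ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2) <Int32, decltype(func)>,
 *                  func, vsize);
 * \endcode
 */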

//! Generic function launching a SYCL kernel.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... ReducerArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, const ReducerArgs&... reducer_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue* s = reinterpret_cast<sycl::queue*>(launch_info._internalStreamImpl());
  sycl::event event;
  if constexpr (sizeof...(ReducerArgs) > 0) {
    // With reducer arguments, an nd_range launch is needed so that
    // applyReducerArgs() can finalize the reductions per work-group
    auto [b, t] = launch_info.threadBlockInfo();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
  }
  else {
    sycl::range<1> loop_size(launch_info.totalLoopSize());
    event = s->parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

#define ARCANE_MACRO_PARENS ()

#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

//! Appends ', decltype(a1)& a1' for each argument, recursing via the AGAIN macro
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

//! Recursive macro applying ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER to each argument
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
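
/*!
 * Expansion sketch (relies on the helper as reconstructed above): with two
 * reducer variables r1 and r2,
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
 * // expands, once ARCANE_MACRO_EXPAND has driven the recursion, to:
 * // , decltype(r1)& r1 , decltype(r2)& r2
 * \endcode
 * i.e. one reference parameter is appended per reducer; the nested EXPAND
 * macros provide the bounded number of rescans that this recursion needs.
 */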

} // namespace Arcane::Accelerator::impl

#endif