#ifndef ARCANE_ACCELERATOR_RUNCOMMANDLOOP_H
#define ARCANE_ACCELERATOR_RUNCOMMANDLOOP_H

#include "arcane/accelerator/core/RunCommand.h"
#include "arcane/accelerator/KernelLauncher.h"
template <int N, typename IndexType_> constexpr ARCCORE_HOST_DEVICE
SimpleForLoopRanges<N, IndexType_>::LoopIndexType
arcaneGetLoopIndexCudaHip(const SimpleForLoopRanges<N, IndexType_>& bounds, Int32 i)
{
  return bounds.getIndices(i);
}

template <int N, typename IndexType_> constexpr ARCCORE_HOST_DEVICE
ComplexForLoopRanges<N, IndexType_>::LoopIndexType
arcaneGetLoopIndexCudaHip(const ComplexForLoopRanges<N, IndexType_>& bounds, Int32 i)
{
  return bounds.getIndices(i);
}
#if defined(ARCANE_COMPILING_SYCL)

template <int N, typename IndexType_> inline
SimpleForLoopRanges<N, IndexType_>::LoopIndexType
arcaneGetLoopIndexSycl(const SimpleForLoopRanges<N, IndexType_>& bounds, sycl::nd_item<1> x)
{
  return bounds.getIndices(static_cast<Int32>(x.get_global_id(0)));
}

template <int N, typename IndexType_> inline
ComplexForLoopRanges<N, IndexType_>::LoopIndexType
arcaneGetLoopIndexSycl(const ComplexForLoopRanges<N, IndexType_>& bounds, sycl::nd_item<1> x)
{
  return bounds.getIndices(static_cast<Int32>(x.get_global_id(0)));
}

template <int N, typename IndexType_> inline
SimpleForLoopRanges<N, IndexType_>::LoopIndexType
arcaneGetLoopIndexSycl(const SimpleForLoopRanges<N, IndexType_>& bounds, sycl::id<1> x)
{
  return bounds.getIndices(static_cast<Int32>(x));
}

template <int N, typename IndexType_> inline
ComplexForLoopRanges<N, IndexType_>::LoopIndexType
arcaneGetLoopIndexSycl(const ComplexForLoopRanges<N, IndexType_>& bounds, sycl::id<1> x)
{
  return bounds.getIndices(static_cast<Int32>(x));
}

#endif
namespace Arcane::Accelerator::Impl
{

#if defined(ARCANE_COMPILING_CUDA_OR_HIP)

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  using namespace Arcane::Accelerator::Impl;
  KernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  if (i < bounds.nbElement()) {
    body(arcaneGetLoopIndexCudaHip(bounds, i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}
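// Note: the launcher used below (_applyKernelCUDA/_applyKernelHIP) is
// expected to start at least bounds.nbElement() threads, one per loop index.
// Every thread takes part in applyAtBegin()/applyAtEnd() (needed by the
// additional arguments such as reducers), but threads past the end skip the
// user body via the bounds check above.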
#endif

#if defined(ARCANE_COMPILING_SYCL)

//! Functor used to run a 1D loop as a SYCL kernel.
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
                  LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();
    Int32 i = static_cast<Int32>(x.get_global_id(0));
    SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
    if (i < bounds.nbElement()) {
      body(arcaneGetLoopIndexSycl(bounds, x), remaining_args...);
    }
    SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
  }

  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();
    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(arcaneGetLoopIndexSycl(bounds, i));
    }
  }
};

#endif

} // namespace Arcane::Accelerator::Impl
namespace Arcane::Accelerator::impl
{
/*!
 * \brief Applies the lambda \a func to the loop \a bounds.
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
_applyGenericLoop(RunCommand& command, LoopBoundType bounds,
                  const Lambda& func, const RemainingArgs&... other_args)
{
  Int64 vsize = bounds.nbElement();
  if (vsize == 0)
    return;
  Impl::RunCommandLaunchInfo launch_info(command, vsize);
  const eExecutionPolicy exec_policy = launch_info.executionPolicy();
  launch_info.beginExecute();
  switch (exec_policy) {
  case eExecutionPolicy::CUDA:
    _applyKernelCUDA(launch_info, ARCANE_KERNEL_CUDA_FUNC(Impl::doDirectGPULambdaArrayBounds2) < LoopBoundType, Lambda, RemainingArgs... >, func, bounds, other_args...);
    break;
  case eExecutionPolicy::HIP:
    _applyKernelHIP(launch_info, ARCANE_KERNEL_HIP_FUNC(Impl::doDirectGPULambdaArrayBounds2) < LoopBoundType, Lambda, RemainingArgs... >, func, bounds, other_args...);
    break;
  case eExecutionPolicy::SYCL:
    _applyKernelSYCL(launch_info, ARCANE_KERNEL_SYCL_FUNC(Impl::DoDirectSYCLLambdaArrayBounds) < LoopBoundType, Lambda, RemainingArgs... > {}, func, bounds, other_args...);
    break;
  case eExecutionPolicy::Sequential:
    arcaneSequentialFor(bounds, func, other_args...);
    break;
  case eExecutionPolicy::Thread:
    arccoreParallelFor(bounds, launch_info.loopRunInfo(), func, other_args...);
    break;
  default:
    ARCANE_FATAL("Invalid execution policy '{0}'", exec_policy);
  }
  launch_info.endExecute();
}
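// How a command typically reaches this dispatch (sketch; 'queue' is a
// hypothetical RunQueue created by the caller):
//
//   auto command = makeCommand(queue);
//   command << RUNCOMMAND_LOOP1(iter, n)
//   {
//     // user body
//   };
//
// The macro builds an ExtendedArrayBoundLoop, operator<< (below) wraps it
// in an ArrayBoundRunCommand, and applying the lambda calls runGeneric() or
// runExtended(), both of which funnel into _applyGenericLoop() above.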
//! Holds the arguments of a RunCommand bound to an iteration range.
template <typename LoopBoundType, typename... RemainingArgs>
class ArrayBoundRunCommand
{
 public:

  ArrayBoundRunCommand(RunCommand& command, const LoopBoundType& bounds)
  : m_command(command)
  , m_bounds(bounds)
  {
  }

  ArrayBoundRunCommand(RunCommand& command, const LoopBoundType& bounds,
                       const std::tuple<RemainingArgs...>& args)
  : m_command(command)
  , m_bounds(bounds)
  , m_remaining_args(args)
  {
  }

  RunCommand& m_command;
  LoopBoundType m_bounds;
  std::tuple<RemainingArgs...> m_remaining_args;
};
//! Iteration range extended with additional arguments (reducers, ...).
template <typename LoopBoundType, typename... RemainingArgs>
class ExtendedArrayBoundLoop
{
 public:

  ExtendedArrayBoundLoop(const LoopBoundType& bounds, RemainingArgs... args)
  : m_bounds(bounds)
  , m_remaining_args(args...)
  {
  }

  LoopBoundType m_bounds;
  std::tuple<RemainingArgs...> m_remaining_args;
};
template <typename LoopBoundType, typename... RemainingArgs> auto
makeExtendedArrayBoundLoop(const LoopBoundType& bounds, RemainingArgs... args)
-> ExtendedArrayBoundLoop<LoopBoundType, RemainingArgs...>
{
  return ExtendedArrayBoundLoop<LoopBoundType, RemainingArgs...>(bounds, args...);
}

template <typename LoopBoundType, typename... RemainingArgs> auto
makeExtendedLoop(const LoopBoundType& bounds, RemainingArgs... args)
-> ExtendedArrayBoundLoop<LoopBoundType, RemainingArgs...>
{
  return ExtendedArrayBoundLoop<LoopBoundType, RemainingArgs...>(bounds, args...);
}
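// For instance (hypothetical reducer 'sum'), both calls below produce an
// ExtendedArrayBoundLoop; the extra arguments are stored in m_remaining_args
// and later forwarded to the kernel by runExtended():
//
//   auto l1 = makeExtendedLoop(bounds);       // plain loop
//   auto l2 = makeExtendedLoop(bounds, sum);  // loop carrying a reducer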
/*!
 * \brief Applies the lambda \a func to the iteration range given by \a bounds.
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
runExtended(RunCommand& command, LoopBoundType bounds,
            const Lambda& func, const std::tuple<RemainingArgs...>& other_args)
{
  std::apply([&](auto... vs) { _applyGenericLoop(command, bounds, func, vs...); }, other_args);
}
/*!
 * \brief Applies the lambda \a func to the iteration range given by \a bounds.
 */
template <typename LoopBoundType, typename Lambda> void
runGeneric(RunCommand& command, const LoopBoundType& bounds, const Lambda& func)
{
  _applyGenericLoop(command, bounds, func);
}
/*!
 * \brief Applies the lambda \a func to the iteration range given by \a bounds.
 */
template <typename ExtentType, typename Lambda> void
runGeneric(RunCommand& command, const ArrayBounds<ExtentType>& bounds, const Lambda& func)
{
  _applyGenericLoop(command, SimpleForLoopRanges<ExtentType::rank()>(bounds), func);
}

} // namespace Arcane::Accelerator::impl
namespace Arcane::Accelerator
{
/*!
 * \brief Applies the lambda \a func to the iteration range given by \a bounds.
 */
template <typename ExtentType, typename Lambda> void
run(RunCommand& command, const ArrayBounds<ExtentType>& bounds, const Lambda& func)
{
  impl::runGeneric(command, bounds, func);
}

//! Applies the lambda \a func to the iteration range given by \a bounds.
template <int N, typename Lambda> void
run(RunCommand& command, const SimpleForLoopRanges<N>& bounds, const Lambda& func)
{
  impl::runGeneric(command, bounds, func);
}

//! Applies the lambda \a func to the iteration range given by \a bounds.
template <int N, typename Lambda> void
run(RunCommand& command, const ComplexForLoopRanges<N>& bounds, const Lambda& func)
{
  impl::runGeneric(command, bounds, func);
}
template <typename ExtentType> auto
operator<<(RunCommand& command, const ArrayBounds<ExtentType>& bounds)
-> impl::ArrayBoundRunCommand<ArrayBounds<ExtentType>>
{
  return { command, bounds };
}
template <typename LoopBoundType, typename... RemainingArgs> auto
operator<<(RunCommand& command, const impl::ExtendedArrayBoundLoop<LoopBoundType, RemainingArgs...>& ex_loop)
-> impl::ArrayBoundRunCommand<LoopBoundType, RemainingArgs...>
{
  return { command, ex_loop.m_bounds, ex_loop.m_remaining_args };
}
template <int N> auto
operator<<(RunCommand& command, const SimpleForLoopRanges<N, Int32>& bounds)
-> impl::ArrayBoundRunCommand<SimpleForLoopRanges<N, Int32>>
{
  return { command, bounds };
}
template <int N> auto
operator<<(RunCommand& command, const ComplexForLoopRanges<N, Int32>& bounds)
-> impl::ArrayBoundRunCommand<ComplexForLoopRanges<N, Int32>>
{
  return { command, bounds };
}

} // namespace Arcane::Accelerator
namespace Arcane::Accelerator::impl
{

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
operator<<(ArrayBoundRunCommand<LoopBoundType, RemainingArgs...>&& nr, const Lambda& f)
{
  if constexpr (sizeof...(RemainingArgs) > 0) {
    runExtended(nr.m_command, nr.m_bounds, f, nr.m_remaining_args);
  }
  else {
    runGeneric(nr.m_command, nr.m_bounds, f);
  }
}

} // namespace Arcane::Accelerator::impl
/*!
 * \brief Loop over the iteration range \a bounds, naming the current index
 * \a iter_name. Optional additional arguments (reducers, ...) are passed
 * after the bounds.
 */
#define RUNCOMMAND_LOOP(iter_name, bounds, ...) \
  A_FUNCINFO << ::Arcane::Accelerator::impl::makeExtendedLoop(bounds __VA_OPT__(, __VA_ARGS__)) \
  << [=] ARCCORE_HOST_DEVICE(typename decltype(bounds)::LoopIndexType iter_name __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(__VA_ARGS__)))
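// Usage sketch (assumes 'command' is a RunCommand and that makeLoopRanges()
// builds the bounds; the index type follows the rank of the bounds):
//
//   auto bounds = makeLoopRanges(n1, n2);
//   command << RUNCOMMAND_LOOP(iter, bounds)
//   {
//     auto [i, j] = iter();
//     ...
//   };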
//! Loop over an N-dimensional iteration range
#define RUNCOMMAND_LOOPN(iter_name, N, ...) \
  A_FUNCINFO << Arcane::ArrayBounds<typename Arcane::MDDimType<N>::DimType>(__VA_ARGS__) << [=] ARCCORE_HOST_DEVICE(Arcane::MDIndex<N> iter_name)

//! Loop over a 2D iteration range
#define RUNCOMMAND_LOOP2(iter_name, x1, x2) \
  A_FUNCINFO << Arcane::ArrayBounds<MDDim2>(x1, x2) << [=] ARCCORE_HOST_DEVICE(Arcane::MDIndex<2> iter_name)

//! Loop over a 3D iteration range
#define RUNCOMMAND_LOOP3(iter_name, x1, x2, x3) \
  A_FUNCINFO << Arcane::ArrayBounds<MDDim3>(x1, x2, x3) << [=] ARCCORE_HOST_DEVICE(Arcane::MDIndex<3> iter_name)

//! Loop over a 4D iteration range
#define RUNCOMMAND_LOOP4(iter_name, x1, x2, x3, x4) \
  A_FUNCINFO << Arcane::ArrayBounds<MDDim4>(x1, x2, x3, x4) << [=] ARCCORE_HOST_DEVICE(Arcane::MDIndex<4> iter_name)
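// Usage sketch for the fixed-rank variants ('queue' is a hypothetical
// RunQueue; the structured binding unpacks the multi-dimensional index):
//
//   auto command = makeCommand(queue);
//   command << RUNCOMMAND_LOOP2(iter, n1, n2)
//   {
//     auto [i, j] = iter();
//     ...
//   };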
/*!
 * \brief Loop over a 1D iteration range of \a x1 elements.
 * Optional additional arguments (reducers, ...) are passed after the size.
 */
#define RUNCOMMAND_LOOP1(iter_name, x1, ...) \
  A_FUNCINFO << ::Arcane::Accelerator::impl::makeExtendedArrayBoundLoop(::Arcane::SimpleForLoopRanges<1>(x1) __VA_OPT__(, __VA_ARGS__)) \
  << [=] ARCCORE_HOST_DEVICE(Arcane::MDIndex<1> iter_name __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(__VA_ARGS__)))
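// Usage sketch with a reducer as additional argument ('ReducerSum2' is
// assumed to be the reduction helper from the Arcane accelerator API; adapt
// to the reducers available in your version):
//
//   Arcane::Accelerator::ReducerSum2<double> sum(command);
//   command << RUNCOMMAND_LOOP1(iter, nb_value, sum)
//   {
//     auto [i] = iter();
//     sum.combine(values[i]);
//   };
//   double total = sum.reducedValue();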
/*!
 * \brief Execute the command body exactly once (a 1-element loop).
 * Optional additional arguments (reducers, ...) may be passed.
 */
#define RUNCOMMAND_SINGLE(...) \
  A_FUNCINFO << ::Arcane::Accelerator::impl::makeExtendedArrayBoundLoop(::Arcane::SimpleForLoopRanges<1>(1) __VA_OPT__(, __VA_ARGS__)) \
  << [=] ARCCORE_HOST_DEVICE(Arcane::MDIndex<1> __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(__VA_ARGS__)))

#endif // ARCANE_ACCELERATOR_RUNCOMMANDLOOP_H