12#ifndef ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H
13#define ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H
17#include "AcceleratorGlobal.h"
18#include "arccore/common/SequentialFor.h"
19#include "arccore/common/StridedLoopRanges.h"
20#include "arccore/common/accelerator/RunCommand.h"
21#include "arccore/concurrency/ParallelFor.h"
23#include "arccore/accelerator/WorkGroupLoopRange.h"
24#include "arccore/accelerator/CooperativeWorkGroupLoopRange.h"
25#include "arccore/accelerator/KernelLauncher.h"
30namespace Arcane::Accelerator::Impl
40template <
typename IndexType_>
41class HostLaunchLoopRangeBase
45 using IndexType = IndexType_;
49 ARCCORE_ACCELERATOR_EXPORT
50 HostLaunchLoopRangeBase(IndexType total_size,
Int32 nb_group, IndexType block_size);
55 constexpr IndexType
nbElement()
const {
return m_total_size; }
57 constexpr IndexType
blockSize()
const {
return m_block_size; }
65 return ((i + 1) != m_nb_block) ? m_block_size : m_last_block_size;
81 IndexType m_total_size = 0;
82 IndexType m_block_size = 0;
83 IndexType m_last_block_size = 0;
90template <
typename WorkGroupLoopRangeType_>
91class HostLaunchLoopRange
92:
public HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>
96 using WorkGroupLoopRangeType = WorkGroupLoopRangeType_;
97 using IndexType =
typename WorkGroupLoopRangeType_::IndexType;
98 using BaseClass = HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>;
102 explicit HostLaunchLoopRange(
const WorkGroupLoopRangeType& bounds)
103 : BaseClass(bounds.nbElement(), bounds.nbBlock(), bounds.blockSize())
115#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)
131#if defined(ARCCORE_COMPILING_SYCL)
147#if defined(ARCCORE_COMPILING_SYCL)
151template <
typename IndexType_>
153:
public std::true_type
158template <
typename IndexType_>
160:
public std::true_type
178 template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
static void
180 const Lambda& func, RemainingArgs... remaining_args)
182 using LoopIndexType = LoopBoundType::LoopIndexType;
185 Int32 loop_index = begin_index * group_size;
186 for (
Int32 i = begin_index; i < (begin_index + nb_loop); ++i) {
192 func(li, remaining_args...);
193 loop_index += group_size;
203#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)
206template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs> __global__
static void
207doHierarchicalLaunchCudaHip(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
209 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
213 if (i < bounds.nbOriginalElement()) {
214 func(WorkGroupLoopContextBuilder::build(bounds.originalLoop()), remaining_args...);
221#if defined(ARCCORE_COMPILING_SYCL)
223template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
224class doHierarchicalLaunchSycl
228 void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
229 LoopBoundType bounds, Lambda func,
230 RemainingArgs... remaining_args)
const
232 Int32 i =
static_cast<Int32
>(x.get_global_id(0));
233 SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
235 if (i < bounds.nbOriginalElement()) {
236 func(WorkGroupLoopContextBuilder::build(bounds.originalLoop(), x), remaining_args...);
238 SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
260template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
void
261_doHierarchicalLaunch(RunCommand& command, LoopBoundType bounds,
262 const Lambda& func,
const RemainingArgs&... other_args)
264 Int64 nb_orig_element = bounds.nbElement();
265 if (nb_orig_element == 0)
271 if ((bounds.blockSize() == 0) || bounds.isCooperativeLaunch())
272 bounds.setBlockSize(command);
274 TrueLoopBoundType bounds2(bounds);
276 command.addNbThreadPerBlock(bounds.blockSize());
277 bounds2.setNbStride(command.nbStride());
282 Impl::RunCommandLaunchInfo launch_info(command, bounds2.strideValue(), bounds.isCooperativeLaunch());
283 launch_info.beginExecute();
284 switch (exec_policy) {
286 ARCCORE_KERNEL_CUDA_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
287 launch_info, func, bounds2, other_args...);
290 ARCCORE_KERNEL_HIP_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
291 launch_info, func, bounds2, other_args...);
294 ARCCORE_KERNEL_SYCL_FUNC((Impl::doHierarchicalLaunchSycl<TrueLoopBoundType, Lambda, RemainingArgs...>{}),
295 launch_info, func, bounds2, other_args...);
298 HostLoopBoundType host_bounds(bounds);
299 arccoreSequentialFor(host_bounds, func, other_args...);
302 HostLoopBoundType host_bounds(bounds);
303 arccoreParallelFor(host_bounds, launch_info.loopRunInfo(), func, other_args...);
306 ARCCORE_FATAL(
"Invalid execution policy '{0}'", exec_policy);
308 launch_info.endExecute();
317template <
typename LoopBoundType,
typename... RemainingArgs>
318class ExtendedLaunchRunCommand
322 ExtendedLaunchRunCommand(
RunCommand& command,
const LoopBoundType& bounds)
327 ExtendedLaunchRunCommand(
RunCommand& command,
const LoopBoundType& bounds,
const std::tuple<RemainingArgs...>& args)
330 , m_remaining_args(args)
334 LoopBoundType m_bounds;
335 std::tuple<RemainingArgs...> m_remaining_args;
344template <
typename LoopBoundType,
typename... RemainingArgs>
345class ExtendedLaunchLoop
349 ExtendedLaunchLoop(
const LoopBoundType& bounds, RemainingArgs... args)
351 , m_remaining_args(args...)
354 LoopBoundType m_bounds;
355 std::tuple<RemainingArgs...> m_remaining_args;
361template <
typename LoopBoundType,
typename... RemainingArgs>
auto
362makeLaunch(
const LoopBoundType& bounds, RemainingArgs... args)
371template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
void
372operator<<(ExtendedLaunchRunCommand<LoopBoundType, RemainingArgs...>&& nr,
const Lambda& f)
374 if constexpr (
sizeof...(RemainingArgs) > 0) {
375 std::apply([&](
auto... vs) { _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f, vs...); }, nr.m_remaining_args);
378 _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f);
389template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
void
402template <
typename LoopBoundType,
typename Lambda,
typename... RemainingArgs>
void
404 const Lambda& func,
const RemainingArgs&... remaining_args)
406 Int32 nb_thread = run_info.options().value().maxThread();
408 bounds.setThreadGridSynchronizer(&grid_sync);
409 auto sub_func = [=](
Int32 begin_index,
Int32 nb_loop) {
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
Execution context for a command on a set of blocks.
Iteration range of a loop using cooperative hierarchical parallelism.
static ARCCORE_DEVICE void applyAtEnd(Int32 index, RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the end of the kernel.
static ARCCORE_DEVICE void applyAtBegin(Int32 index, RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the beginning of the kernel.
Class to manage the launch of a hierarchical compute kernel.
ThreadGridSynchronizer * m_thread_grid_synchronizer
constexpr IndexType nbActiveItem(Int32 i) const
Number of active items for the i-th block.
ThreadGridSynchronizer * threadGridSynchronizer() const
Grid synchronizer (non-null only in cooperative multi-threading).
constexpr IndexType nbElement() const
Number of elements to process.
constexpr IndexType lastBlockSize() const
Number of elements in the last block.
constexpr IndexType blockSize() const
Block size.
constexpr Int32 nbBlock() const
Number of blocks.
Template to determine if a type used as a loop in kernels always requires sycl::nb_item as an argumen...
Class to manage the decomposition of a loop into multiple parts.
Class to manage grid synchronization in multi-thread;.
Class to execute a portion of the loop sequentially on the host.
static void apply(Int32 begin_index, Int32 nb_loop, HostLaunchLoopRange< LoopBoundType > bounds, const Lambda &func, RemainingArgs... remaining_args)
Applies the functor func on a sequential loop.
Management of an accelerator command.
Execution context of a command on a set of blocks.
constexpr IndexType nbElement() const
Number of elements to process.
Iteration range of a loop using hierarchical parallelism.
static void applyAtEnd(RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the end of the iteration.
static void applyAtBegin(RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the beginning of the iteration.
eExecutionPolicy
Execution policy for a Runner.
@ SYCL
Execution policy using the SYCL environment.
@ HIP
Execution policy using the HIP environment.
@ CUDA
Execution policy using the CUDA environment.
@ Sequential
Sequential execution policy.
@ Thread
Multi-threaded execution policy.
bool isAcceleratorPolicy(eExecutionPolicy exec_policy)
Indicates if exec_policy corresponds to an accelerator.
std::int64_t Int64
Signed integer type of 64 bits.
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applies the lambda function lambda_function concurrently over the iteration interval given by loop_ra...
std::int32_t Int32
Signed integer type of 32 bits.