Arcane  4.1.12.0
User documentation
Loading...
Searching...
No Matches
RunCommandLaunchImpl.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* RunCommandLaunchImpl.h (C) 2000-2026 */
9/* */
10/* Implementation of a RunCommand for hierarchical parallelism. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H
13#define ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "AcceleratorGlobal.h"
18#include "arccore/common/SequentialFor.h"
19#include "arccore/common/StridedLoopRanges.h"
20#include "arccore/common/accelerator/RunCommand.h"
21#include "arccore/concurrency/ParallelFor.h"
22
23#include "arccore/accelerator/WorkGroupLoopRange.h"
24#include "arccore/accelerator/CooperativeWorkGroupLoopRange.h"
25#include "arccore/accelerator/KernelLauncher.h"
26
27/*---------------------------------------------------------------------------*/
28/*---------------------------------------------------------------------------*/
29
30namespace Arcane::Accelerator::Impl
31{
32
33/*---------------------------------------------------------------------------*/
34/*---------------------------------------------------------------------------*/
35
36/*!
37 * \brief Information of a loop using hierarchical parallelism
38 * on the host.
39 */
40template <typename IndexType_>
41class HostLaunchLoopRangeBase
42{
43 public:
44
45 using IndexType = IndexType_;
46
47 public:
48
49 ARCCORE_ACCELERATOR_EXPORT
50 HostLaunchLoopRangeBase(IndexType total_size, Int32 nb_group, IndexType block_size);
51
52 public:
53
54 //! Number of elements to process
55 constexpr IndexType nbElement() const { return m_total_size; }
56 //! Block size
57 constexpr IndexType blockSize() const { return m_block_size; }
58 //! Number of blocks
59 constexpr Int32 nbBlock() const { return m_nb_block; }
60 //! Number of elements in the last block
61 constexpr IndexType lastBlockSize() const { return m_last_block_size; }
62 //! Number of active items for the i-th block
63 constexpr IndexType nbActiveItem(Int32 i) const
64 {
65 return ((i + 1) != m_nb_block) ? m_block_size : m_last_block_size;
66 }
67 //! Grid synchronizer (non-null only in cooperative multi-threading)
69 {
70 return m_thread_grid_synchronizer;
71 }
72 void setThreadGridSynchronizer(ThreadGridSynchronizer* v)
73 {
74 m_thread_grid_synchronizer = v;
75 }
76
77 private:
78
79 //! This instance is managed by arcaneParallelFor(HostLaunchLoopRange<>...)
80 ThreadGridSynchronizer* m_thread_grid_synchronizer = nullptr;
81 IndexType m_total_size = 0;
82 IndexType m_block_size = 0;
83 IndexType m_last_block_size = 0;
84 Int32 m_nb_block = 0;
85};
86
87/*---------------------------------------------------------------------------*/
88/*---------------------------------------------------------------------------*/
89
90template <typename WorkGroupLoopRangeType_>
91class HostLaunchLoopRange
92: public HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>
93{
94 public:
95
96 using WorkGroupLoopRangeType = WorkGroupLoopRangeType_;
97 using IndexType = typename WorkGroupLoopRangeType_::IndexType;
98 using BaseClass = HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>;
99
100 public:
101
102 explicit HostLaunchLoopRange(const WorkGroupLoopRangeType& bounds)
103 : BaseClass(bounds.nbElement(), bounds.nbBlock(), bounds.blockSize())
104 {
105 }
106};
107
108/*---------------------------------------------------------------------------*/
109/*---------------------------------------------------------------------------*/
110
112{
113 public:
114
115#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)
116
117 template <typename IndexType_> static constexpr ARCCORE_HOST_DEVICE WorkGroupLoopContext<IndexType_>
118 build(const WorkGroupLoopRange<IndexType_>& loop_range)
119 {
120 return WorkGroupLoopContext<IndexType_>(loop_range.nbElement());
121 }
122
123 template <typename IndexType_> static constexpr ARCCORE_HOST_DEVICE CooperativeWorkGroupLoopContext<IndexType_>
124 build(const CooperativeWorkGroupLoopRange<IndexType_>& loop_range)
125 {
127 }
128
129#endif
130
131#if defined(ARCCORE_COMPILING_SYCL)
132
133 template <typename IndexType_> static SyclWorkGroupLoopContext<IndexType_>
134 build(const WorkGroupLoopRange<IndexType_>& loop_range, sycl::nd_item<1> id)
135 {
136 return SyclWorkGroupLoopContext<IndexType_>(id, loop_range.nbElement());
137 }
138
139 template <typename IndexType_> static SyclCooperativeWorkGroupLoopContext<IndexType_>
140 build(const CooperativeWorkGroupLoopRange<IndexType_>& loop_range, sycl::nd_item<1> id)
141 {
143 }
144#endif
145};
146
147#if defined(ARCCORE_COMPILING_SYCL)
148
149// To indicate that sycl::nd_item must always be used (and never sycl::id)
150// as an argument with 'WorkGroupLoopRange.
151template <typename IndexType_>
153: public std::true_type
154{
155};
156// To indicate that sycl::nd_item must always be used (and never sycl::id)
157// as an argument with 'CooperativeWorkGroupLoopRange.
158template <typename IndexType_>
159class IsAlwaysUseSyclNdItem<StridedLoopRanges<CooperativeWorkGroupLoopRange<IndexType_>>>
160: public std::true_type
161{
162};
163
164#endif
165
166/*---------------------------------------------------------------------------*/
167/*---------------------------------------------------------------------------*/
168
169/*!
170 * \internal
171 * \brief Class to execute a portion of the loop sequentially on the host.
172 */
174{
175 public:
176
177 //! Applies the functor \a func on a sequential loop.
178 template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> static void
180 const Lambda& func, RemainingArgs... remaining_args)
181 {
182 using LoopIndexType = LoopBoundType::LoopIndexType;
184 const Int32 group_size = bounds.blockSize();
185 Int32 loop_index = begin_index * group_size;
186 for (Int32 i = begin_index; i < (begin_index + nb_loop); ++i) {
187 // For the last loop iteration, the number of active elements may be
188 // less than the group size if \a total_nb_element is not
189 // a multiple of \a group_size.
190 Int32 nb_active = bounds.nbActiveItem(i);
191 LoopIndexType li(loop_index, i, group_size, nb_active, bounds.nbElement(), bounds.nbBlock(), bounds.threadGridSynchronizer());
192 func(li, remaining_args...);
193 loop_index += group_size;
194 }
195
197 }
198};
199
200/*---------------------------------------------------------------------------*/
201/*---------------------------------------------------------------------------*/
202
203#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)
204
205// We use 'Argument dependent lookup' to find 'arcaneGetLoopIndexCudaHip'
206template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ static void
207doHierarchicalLaunchCudaHip(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
208{
209 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
210
212 // TODO: check if this test is necessary
213 if (i < bounds.nbOriginalElement()) {
214 func(WorkGroupLoopContextBuilder::build(bounds.originalLoop()), remaining_args...);
215 }
217};
218
219#endif
220
221#if defined(ARCCORE_COMPILING_SYCL)
222
223template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
224class doHierarchicalLaunchSycl
225{
226 public:
227
228 void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
229 LoopBoundType bounds, Lambda func,
230 RemainingArgs... remaining_args) const
231 {
232 Int32 i = static_cast<Int32>(x.get_global_id(0));
233 SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
234 // TODO: check if this test is necessary
235 if (i < bounds.nbOriginalElement()) {
236 func(WorkGroupLoopContextBuilder::build(bounds.originalLoop(), x), remaining_args...);
237 }
238 SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
239 }
240};
241
242#endif
243
244/*---------------------------------------------------------------------------*/
245/*---------------------------------------------------------------------------*/
246
247/*!
248 * \brief Applies the lambda \a func on a loop \a bounds.
249 *
250 * The lambda \a func is applied to the \a command.
251 * \a bound is the loop type. Supported types are:
252 *
253 * - WorkGroupLoopRange
254 * - CooperativeWorkGroupLoopRange
255 *
256 * Additional arguments \a other_args are used to support
257 * features such as reductions (ReducerSum2, ReducerMax2, ...)
258 * or local memory management (via LocalMemory).
259 */
260template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
261_doHierarchicalLaunch(RunCommand& command, LoopBoundType bounds,
262 const Lambda& func, const RemainingArgs&... other_args)
263{
264 Int64 nb_orig_element = bounds.nbElement();
265 if (nb_orig_element == 0)
266 return;
267 const eExecutionPolicy exec_policy = command.executionPolicy();
268 // In cooperative mode, setBlockSize() must always be called
269 // to ensure that the block size is consistent on the host
270 // (in sequential mode, only one block is needed in this case).
271 if ((bounds.blockSize() == 0) || bounds.isCooperativeLaunch())
272 bounds.setBlockSize(command);
273 using TrueLoopBoundType = StridedLoopRanges<LoopBoundType>;
274 TrueLoopBoundType bounds2(bounds);
275 if (isAcceleratorPolicy(exec_policy)) {
276 command.addNbThreadPerBlock(bounds.blockSize());
277 bounds2.setNbStride(command.nbStride());
278 }
279
280 using HostLoopBoundType = HostLaunchLoopRange<LoopBoundType>;
281
282 Impl::RunCommandLaunchInfo launch_info(command, bounds2.strideValue(), bounds.isCooperativeLaunch());
283 launch_info.beginExecute();
284 switch (exec_policy) {
286 ARCCORE_KERNEL_CUDA_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
287 launch_info, func, bounds2, other_args...);
288 break;
290 ARCCORE_KERNEL_HIP_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
291 launch_info, func, bounds2, other_args...);
292 break;
294 ARCCORE_KERNEL_SYCL_FUNC((Impl::doHierarchicalLaunchSycl<TrueLoopBoundType, Lambda, RemainingArgs...>{}),
295 launch_info, func, bounds2, other_args...);
296 break;
298 HostLoopBoundType host_bounds(bounds);
299 arccoreSequentialFor(host_bounds, func, other_args...);
300 } break;
302 HostLoopBoundType host_bounds(bounds);
303 arccoreParallelFor(host_bounds, launch_info.loopRunInfo(), func, other_args...);
304 } break;
305 default:
306 ARCCORE_FATAL("Invalid execution policy '{0}'", exec_policy);
307 }
308 launch_info.endExecute();
309}
310
311/*---------------------------------------------------------------------------*/
312/*---------------------------------------------------------------------------*/
313
314/*!
315 * \brief Class to retain the arguments of a RunCommand.
316 */
317template <typename LoopBoundType, typename... RemainingArgs>
318class ExtendedLaunchRunCommand
319{
320 public:
321
322 ExtendedLaunchRunCommand(RunCommand& command, const LoopBoundType& bounds)
323 : m_command(command)
324 , m_bounds(bounds)
325 {
326 }
327 ExtendedLaunchRunCommand(RunCommand& command, const LoopBoundType& bounds, const std::tuple<RemainingArgs...>& args)
328 : m_command(command)
329 , m_bounds(bounds)
330 , m_remaining_args(args)
331 {
332 }
333 RunCommand& m_command;
334 LoopBoundType m_bounds;
335 std::tuple<RemainingArgs...> m_remaining_args;
336};
337
338/*---------------------------------------------------------------------------*/
339/*---------------------------------------------------------------------------*/
340
341/*!
342 * \brief Class to manage the launch of a hierarchical compute kernel.
343 */
344template <typename LoopBoundType, typename... RemainingArgs>
345class ExtendedLaunchLoop
346{
347 public:
348
349 ExtendedLaunchLoop(const LoopBoundType& bounds, RemainingArgs... args)
350 : m_bounds(bounds)
351 , m_remaining_args(args...)
352 {
353 }
354 LoopBoundType m_bounds;
355 std::tuple<RemainingArgs...> m_remaining_args;
356};
357
358/*---------------------------------------------------------------------------*/
359/*---------------------------------------------------------------------------*/
360
361template <typename LoopBoundType, typename... RemainingArgs> auto
362makeLaunch(const LoopBoundType& bounds, RemainingArgs... args)
363-> ExtendedLaunchLoop<LoopBoundType, RemainingArgs...>
364{
365 return ExtendedLaunchLoop<LoopBoundType, RemainingArgs...>(bounds, args...);
366}
367
368/*---------------------------------------------------------------------------*/
369/*---------------------------------------------------------------------------*/
370
371template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
372operator<<(ExtendedLaunchRunCommand<LoopBoundType, RemainingArgs...>&& nr, const Lambda& f)
373{
374 if constexpr (sizeof...(RemainingArgs) > 0) {
375 std::apply([&](auto... vs) { _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f, vs...); }, nr.m_remaining_args);
376 }
377 else {
378 _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f);
379 }
380}
381
382/*---------------------------------------------------------------------------*/
383/*---------------------------------------------------------------------------*/
384
385/*!
386 * \internal
387 * \brief Applies the functor \a func on a sequential loop.
388 */
389template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
390arccoreSequentialFor(HostLaunchLoopRange<LoopBoundType> bounds, const Lambda& func, const RemainingArgs&... remaining_args)
391{
392 WorkGroupSequentialForHelper::apply(0, bounds.nbBlock(), bounds, func, remaining_args...);
393}
394
395/*---------------------------------------------------------------------------*/
396/*---------------------------------------------------------------------------*/
397
398/*!
399 * \internal
400 * \brief Applies the functor \a func on a parallel loop.
401 */
402template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
403arccoreParallelFor(HostLaunchLoopRange<LoopBoundType> bounds, ForLoopRunInfo run_info,
404 const Lambda& func, const RemainingArgs&... remaining_args)
405{
406 Int32 nb_thread = run_info.options().value().maxThread();
407 ThreadGridSynchronizer grid_sync(nb_thread);
408 bounds.setThreadGridSynchronizer(&grid_sync);
409 auto sub_func = [=](Int32 begin_index, Int32 nb_loop) {
410 Impl::WorkGroupSequentialForHelper::apply(begin_index, nb_loop, bounds, func, remaining_args...);
411 };
412 ::Arcane::arccoreParallelFor(0, bounds.nbBlock(), run_info, sub_func);
413}
414
415/*---------------------------------------------------------------------------*/
416/*---------------------------------------------------------------------------*/
417
418} // namespace Arcane::Accelerator::Impl
419
420/*---------------------------------------------------------------------------*/
421/*---------------------------------------------------------------------------*/
422
423#endif
424
425/*---------------------------------------------------------------------------*/
426/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
Execution context for a command on a set of blocks.
Iteration range of a loop using cooperative hierarchical parallelism.
static ARCCORE_DEVICE void applyAtEnd(Int32 index, RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the end of the kernel.
static ARCCORE_DEVICE void applyAtBegin(Int32 index, RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the beginning of the kernel.
Class to manage the launch of a hierarchical compute kernel.
constexpr IndexType nbActiveItem(Int32 i) const
Number of active items for the i-th block.
ThreadGridSynchronizer * threadGridSynchronizer() const
Grid synchronizer (non-null only in cooperative multi-threading).
constexpr IndexType nbElement() const
Number of elements to process.
constexpr IndexType lastBlockSize() const
Number of elements in the last block.
constexpr IndexType blockSize() const
Block size.
constexpr Int32 nbBlock() const
Number of blocks.
Template to determine if a type used as a loop in kernels always requires sycl::nb_item as an argumen...
Class to manage the decomposition of a loop into multiple parts.
static void apply(Int32 begin_index, Int32 nb_loop, HostLaunchLoopRange< LoopBoundType > bounds, const Lambda &func, RemainingArgs... remaining_args)
Applies the functor func on a sequential loop.
static void applyAtEnd(RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the end of the iteration.
static void applyAtBegin(RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the beginning of the iteration.
eExecutionPolicy
Execution policy for a Runner.
@ SYCL
Execution policy using the SYCL environment.
@ HIP
Execution policy using the HIP environment.
@ CUDA
Execution policy using the CUDA environment.
@ Thread
Multi-threaded execution policy.
bool isAcceleratorPolicy(eExecutionPolicy exec_policy)
Indicates if exec_policy corresponds to an accelerator.
std::int64_t Int64
Signed integer type of 64 bits.
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applies the lambda function lambda_function concurrently over the iteration interval given by loop_ra...
Definition ParallelFor.h:87
std::int32_t Int32
Signed integer type of 32 bits.