Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
RunCommandLaunchImpl.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* RunCommandLaunchImpl.h (C) 2000-2026 */
9/* */
10/* Implementation of a RunCommand for hierarchical parallelism. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H
13#define ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "AcceleratorGlobal.h"
18#include "arccore/common/SequentialFor.h"
19#include "arccore/common/StridedLoopRanges.h"
20#include "arccore/common/accelerator/RunCommand.h"
21#include "arccore/concurrency/ParallelFor.h"
22
23#include "arccore/accelerator/WorkGroupLoopRange.h"
24#include "arccore/accelerator/CooperativeWorkGroupLoopRange.h"
25#include "arccore/accelerator/KernelLauncher.h"
26
27/*---------------------------------------------------------------------------*/
28/*---------------------------------------------------------------------------*/
29
30namespace Arcane::Accelerator::Impl
31{
32
33/*---------------------------------------------------------------------------*/
34/*---------------------------------------------------------------------------*/
35
40template <typename IndexType_>
41class HostLaunchLoopRangeBase
42{
43 public:
44
45 using IndexType = IndexType_;
46
47 public:
48
49 ARCCORE_ACCELERATOR_EXPORT
50 HostLaunchLoopRangeBase(IndexType total_size, Int32 nb_group, IndexType block_size);
51
52 public:
53
55 constexpr IndexType nbElement() const { return m_total_size; }
57 constexpr IndexType blockSize() const { return m_block_size; }
59 constexpr Int32 nbBlock() const { return m_nb_block; }
61 constexpr IndexType lastBlockSize() const { return m_last_block_size; }
63 constexpr IndexType nbActiveItem(Int32 i) const
64 {
65 return ((i + 1) != m_nb_block) ? m_block_size : m_last_block_size;
66 }
67
72 void setThreadGridSynchronizer(ThreadGridSynchronizer* v)
73 {
75 }
76
77 private:
78
81 IndexType m_total_size = 0;
82 IndexType m_block_size = 0;
83 IndexType m_last_block_size = 0;
84 Int32 m_nb_block = 0;
85};
86
87/*---------------------------------------------------------------------------*/
88/*---------------------------------------------------------------------------*/
89
90template <typename WorkGroupLoopRangeType_>
91class HostLaunchLoopRange
92: public HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>
93{
94 public:
95
96 using WorkGroupLoopRangeType = WorkGroupLoopRangeType_;
97 using IndexType = typename WorkGroupLoopRangeType_::IndexType;
98 using BaseClass = HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>;
99
100 public:
101
102 explicit HostLaunchLoopRange(const WorkGroupLoopRangeType& bounds)
103 : BaseClass(bounds.nbElement(), bounds.nbBlock(), bounds.blockSize())
104 {
105 }
106};
107
108/*---------------------------------------------------------------------------*/
109/*---------------------------------------------------------------------------*/
110
112{
113 public:
114
115#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)
116
117 template <typename IndexType_> static constexpr ARCCORE_HOST_DEVICE WorkGroupLoopContext<IndexType_>
118 build(const WorkGroupLoopRange<IndexType_>& loop_range)
119 {
120 return WorkGroupLoopContext<IndexType_>(loop_range.nbElement());
121 }
122
123 template <typename IndexType_> static constexpr ARCCORE_HOST_DEVICE CooperativeWorkGroupLoopContext<IndexType_>
124 build(const CooperativeWorkGroupLoopRange<IndexType_>& loop_range)
125 {
127 }
128
129#endif
130
131#if defined(ARCCORE_COMPILING_SYCL)
132
133 template <typename IndexType_> static SyclWorkGroupLoopContext<IndexType_>
134 build(const WorkGroupLoopRange<IndexType_>& loop_range, sycl::nd_item<1> id)
135 {
136 return SyclWorkGroupLoopContext<IndexType_>(id, loop_range.nbElement());
137 }
138
139 template <typename IndexType_> static SyclCooperativeWorkGroupLoopContext<IndexType_>
140 build(const CooperativeWorkGroupLoopRange<IndexType_>& loop_range, sycl::nd_item<1> id)
141 {
143 }
144#endif
145};
146
147#if defined(ARCCORE_COMPILING_SYCL)
148
149// To indicate that sycl::nd_item must always be used (and never sycl::id)
150// as an argument with 'WorkGroupLoopRange.
151template <typename IndexType_>
153: public std::true_type
154{
155};
156// To indicate that sycl::nd_item must always be used (and never sycl::id)
157// as an argument with 'CooperativeWorkGroupLoopRange.
158template <typename IndexType_>
159class IsAlwaysUseSyclNdItem<StridedLoopRanges<CooperativeWorkGroupLoopRange<IndexType_>>>
160: public std::true_type
161{
162};
163
164#endif
165
166/*---------------------------------------------------------------------------*/
167/*---------------------------------------------------------------------------*/
168
174{
175 public:
176
178 template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> static void
180 const Lambda& func, RemainingArgs... remaining_args)
181 {
182 using LoopIndexType = LoopBoundType::LoopIndexType;
184 const Int32 group_size = bounds.blockSize();
185 Int32 loop_index = begin_index * group_size;
186 for (Int32 i = begin_index; i < (begin_index + nb_loop); ++i) {
187 // For the last loop iteration, the number of active elements may be
188 // less than the group size if \a total_nb_element is not
189 // a multiple of \a group_size.
190 Int32 nb_active = bounds.nbActiveItem(i);
191 LoopIndexType li(loop_index, i, group_size, nb_active, bounds.nbElement(), bounds.nbBlock(), bounds.threadGridSynchronizer());
192 func(li, remaining_args...);
193 loop_index += group_size;
194 }
195
197 }
198};
199
200/*---------------------------------------------------------------------------*/
201/*---------------------------------------------------------------------------*/
202
203#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)
204
205// We use 'Argument dependent lookup' to find 'arcaneGetLoopIndexCudaHip'
206template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ static void
207doHierarchicalLaunchCudaHip(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
208{
209 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
210
212 // TODO: check if this test is necessary
213 if (i < bounds.nbOriginalElement()) {
214 func(WorkGroupLoopContextBuilder::build(bounds.originalLoop()), remaining_args...);
215 }
217};
218
219#endif
220
221#if defined(ARCCORE_COMPILING_SYCL)
222
223template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
224class doHierarchicalLaunchSycl
225{
226 public:
227
228 void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
229 LoopBoundType bounds, Lambda func,
230 RemainingArgs... remaining_args) const
231 {
232 Int32 i = static_cast<Int32>(x.get_global_id(0));
233 SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);
234 // TODO: check if this test is necessary
235 if (i < bounds.nbOriginalElement()) {
236 func(WorkGroupLoopContextBuilder::build(bounds.originalLoop(), x), remaining_args...);
237 }
238 SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
239 }
240};
241
242#endif
243
244/*---------------------------------------------------------------------------*/
245/*---------------------------------------------------------------------------*/
246
260template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
261_doHierarchicalLaunch(RunCommand& command, LoopBoundType bounds,
262 const Lambda& func, const RemainingArgs&... other_args)
263{
264 Int64 nb_orig_element = bounds.nbElement();
265 if (nb_orig_element == 0)
266 return;
267 const eExecutionPolicy exec_policy = command.executionPolicy();
268 // In cooperative mode, setBlockSize() must always be called
269 // to ensure that the block size is consistent on the host
270 // (in sequential mode, only one block is needed in this case).
271 if ((bounds.blockSize() == 0) || bounds.isCooperativeLaunch())
272 bounds.setBlockSize(command);
273 using TrueLoopBoundType = StridedLoopRanges<LoopBoundType>;
274 TrueLoopBoundType bounds2(bounds);
275 if (isAcceleratorPolicy(exec_policy)) {
276 command.addNbThreadPerBlock(bounds.blockSize());
277 bounds2.setNbStride(command.nbStride());
278 }
279
280 using HostLoopBoundType = HostLaunchLoopRange<LoopBoundType>;
281
282 Impl::RunCommandLaunchInfo launch_info(command, bounds2.strideValue(), bounds.isCooperativeLaunch());
283 launch_info.beginExecute();
284 switch (exec_policy) {
286 ARCCORE_KERNEL_CUDA_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
287 launch_info, func, bounds2, other_args...);
288 break;
290 ARCCORE_KERNEL_HIP_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
291 launch_info, func, bounds2, other_args...);
292 break;
294 ARCCORE_KERNEL_SYCL_FUNC((Impl::doHierarchicalLaunchSycl<TrueLoopBoundType, Lambda, RemainingArgs...>{}),
295 launch_info, func, bounds2, other_args...);
296 break;
298 HostLoopBoundType host_bounds(bounds);
299 arccoreSequentialFor(host_bounds, func, other_args...);
300 } break;
302 HostLoopBoundType host_bounds(bounds);
303 arccoreParallelFor(host_bounds, launch_info.loopRunInfo(), func, other_args...);
304 } break;
305 default:
306 ARCCORE_FATAL("Invalid execution policy '{0}'", exec_policy);
307 }
308 launch_info.endExecute();
309}
310
311/*---------------------------------------------------------------------------*/
312/*---------------------------------------------------------------------------*/
313
317template <typename LoopBoundType, typename... RemainingArgs>
318class ExtendedLaunchRunCommand
319{
320 public:
321
322 ExtendedLaunchRunCommand(RunCommand& command, const LoopBoundType& bounds)
323 : m_command(command)
324 , m_bounds(bounds)
325 {
326 }
327 ExtendedLaunchRunCommand(RunCommand& command, const LoopBoundType& bounds, const std::tuple<RemainingArgs...>& args)
328 : m_command(command)
329 , m_bounds(bounds)
330 , m_remaining_args(args)
331 {
332 }
333 RunCommand& m_command;
334 LoopBoundType m_bounds;
335 std::tuple<RemainingArgs...> m_remaining_args;
336};
337
338/*---------------------------------------------------------------------------*/
339/*---------------------------------------------------------------------------*/
340
344template <typename LoopBoundType, typename... RemainingArgs>
345class ExtendedLaunchLoop
346{
347 public:
348
349 ExtendedLaunchLoop(const LoopBoundType& bounds, RemainingArgs... args)
350 : m_bounds(bounds)
351 , m_remaining_args(args...)
352 {
353 }
354 LoopBoundType m_bounds;
355 std::tuple<RemainingArgs...> m_remaining_args;
356};
357
358/*---------------------------------------------------------------------------*/
359/*---------------------------------------------------------------------------*/
360
361template <typename LoopBoundType, typename... RemainingArgs> auto
362makeLaunch(const LoopBoundType& bounds, RemainingArgs... args)
363-> ExtendedLaunchLoop<LoopBoundType, RemainingArgs...>
364{
365 return ExtendedLaunchLoop<LoopBoundType, RemainingArgs...>(bounds, args...);
366}
367
368/*---------------------------------------------------------------------------*/
369/*---------------------------------------------------------------------------*/
370
371template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
372operator<<(ExtendedLaunchRunCommand<LoopBoundType, RemainingArgs...>&& nr, const Lambda& f)
373{
374 if constexpr (sizeof...(RemainingArgs) > 0) {
375 std::apply([&](auto... vs) { _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f, vs...); }, nr.m_remaining_args);
376 }
377 else {
378 _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f);
379 }
380}
381
382/*---------------------------------------------------------------------------*/
383/*---------------------------------------------------------------------------*/
384
389template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
390arccoreSequentialFor(HostLaunchLoopRange<LoopBoundType> bounds, const Lambda& func, const RemainingArgs&... remaining_args)
391{
392 WorkGroupSequentialForHelper::apply(0, bounds.nbBlock(), bounds, func, remaining_args...);
393}
394
395/*---------------------------------------------------------------------------*/
396/*---------------------------------------------------------------------------*/
397
402template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
403arccoreParallelFor(HostLaunchLoopRange<LoopBoundType> bounds, ForLoopRunInfo run_info,
404 const Lambda& func, const RemainingArgs&... remaining_args)
405{
406 Int32 nb_thread = run_info.options().value().maxThread();
407 ThreadGridSynchronizer grid_sync(nb_thread);
408 bounds.setThreadGridSynchronizer(&grid_sync);
409 auto sub_func = [=](Int32 begin_index, Int32 nb_loop) {
410 Impl::WorkGroupSequentialForHelper::apply(begin_index, nb_loop, bounds, func, remaining_args...);
411 };
412 ::Arcane::arccoreParallelFor(0, bounds.nbBlock(), run_info, sub_func);
413}
414
415/*---------------------------------------------------------------------------*/
416/*---------------------------------------------------------------------------*/
417
418} // namespace Arcane::Accelerator::Impl
419
420/*---------------------------------------------------------------------------*/
421/*---------------------------------------------------------------------------*/
422
423#endif
424
425/*---------------------------------------------------------------------------*/
426/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
Execution context for a command on a set of blocks.
Iteration range of a loop using cooperative hierarchical parallelism.
static ARCCORE_DEVICE void applyAtEnd(Int32 index, RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the end of the kernel.
static ARCCORE_DEVICE void applyAtBegin(Int32 index, RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the beginning of the kernel.
Class to manage the launch of a hierarchical compute kernel.
constexpr IndexType nbActiveItem(Int32 i) const
Number of active items for the i-th block.
ThreadGridSynchronizer * threadGridSynchronizer() const
Grid synchronizer (non-null only in cooperative multi-threading).
constexpr IndexType nbElement() const
Number of elements to process.
constexpr IndexType lastBlockSize() const
Number of elements in the last block.
constexpr IndexType blockSize() const
Block size.
constexpr Int32 nbBlock() const
Number of blocks.
Template to determine if a type used as a loop in kernels always requires sycl::nb_item as an argumen...
Class to manage the decomposition of a loop into multiple parts.
Class to execute a portion of the loop sequentially on the host.
static void apply(Int32 begin_index, Int32 nb_loop, HostLaunchLoopRange< LoopBoundType > bounds, const Lambda &func, RemainingArgs... remaining_args)
Applies the functor func on a sequential loop.
static void applyAtEnd(RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the end of the iteration.
static void applyAtBegin(RemainingArgs &... remaining_args)
Applies the functors of additional arguments at the beginning of the iteration.
eExecutionPolicy
Execution policy for a Runner.
@ SYCL
Execution policy using the SYCL environment.
@ HIP
Execution policy using the HIP environment.
@ CUDA
Execution policy using the CUDA environment.
@ Thread
Multi-threaded execution policy.
bool isAcceleratorPolicy(eExecutionPolicy exec_policy)
Indicates if exec_policy corresponds to an accelerator.
std::int64_t Int64
Signed integer type of 64 bits.
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applies the lambda function lambda_function concurrently over the iteration interval given by loop_ra...
Definition ParallelFor.h:87
std::int32_t Int32
Signed integer type of 32 bits.