Arcane  4.1.12.0
User documentation
Loading...
Searching...
No Matches
RunCommandLaunchInfo.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* RunCommandLaunchInfo.cc (C) 2000-2026 */
9/* */
10/* Information for running a 'RunCommand'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/common/accelerator/RunCommandLaunchInfo.h"
15
16#include "arccore/base/FatalErrorException.h"
17#include "arccore/base/CheckedConvert.h"
18#include "arccore/base/ConcurrencyBase.h"
19
20#include "arccore/common/accelerator/KernelLaunchArgs.h"
21#include "arccore/common/accelerator/RunCommand.h"
22#include "arccore/common/accelerator/NativeStream.h"
23#include "arccore/common/accelerator/internal/RunQueueImpl.h"
24#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
25#include "arccore/common/accelerator/internal/RunCommandImpl.h"
26
27/*---------------------------------------------------------------------------*/
28/*---------------------------------------------------------------------------*/
29
30namespace Arcane::Accelerator::Impl
31{
32
33/*---------------------------------------------------------------------------*/
34/*---------------------------------------------------------------------------*/
35
36void RunCommandLaunchInfo::
37_init()
38{
39 m_queue_impl = m_command._internalQueueImpl();
40 m_exec_policy = m_queue_impl->executionPolicy();
41 // Kernel launch information calculation is only useful on accelerator
42 if (isAcceleratorPolicy(m_exec_policy)) {
43 _computeInitialKernelLaunchArgs();
44 m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid());
45 // If reductions are present, we force the barrier at the end of the kernel
46 m_is_forced_need_barrier = m_command.m_p->hasActiveReduction();
47 }
48}
49
50/*---------------------------------------------------------------------------*/
51/*---------------------------------------------------------------------------*/
52
53RunCommandLaunchInfo::
54RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size)
55: m_command(command)
56, m_total_loop_size(total_loop_size)
57{
58 _init();
59}
60
61/*---------------------------------------------------------------------------*/
62/*---------------------------------------------------------------------------*/
63
64RunCommandLaunchInfo::
65RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size, bool is_cooperative)
66: m_command(command)
67, m_is_cooperative_launch(is_cooperative)
68, m_total_loop_size(total_loop_size)
69{
70 _init();
71}
72
73/*---------------------------------------------------------------------------*/
74/*---------------------------------------------------------------------------*/
75
76RunCommandLaunchInfo::
77~RunCommandLaunchInfo() noexcept(false)
78{
79 // Notifies the end of kernel launch. Normally, this is already done
80 // unless there was an exception during the computation kernel launch.
81 _doEndKernelLaunch();
82}
83
84/*---------------------------------------------------------------------------*/
85/*---------------------------------------------------------------------------*/
86
87void RunCommandLaunchInfo::
88beginExecute()
89{
90 if (m_has_exec_begun)
91 ARCCORE_FATAL("beginExecute() has already been called");
92 m_has_exec_begun = true;
93 m_command._internalNotifyBeginLaunchKernel();
94 if (m_exec_policy == eExecutionPolicy::Thread)
95 _computeLoopRunInfo();
96}
97
98/*---------------------------------------------------------------------------*/
99/*---------------------------------------------------------------------------*/
100
101/*!
102 * \brief Notifies the end of command launch.
103 *
104 * Note that if the command is asynchronous, its execution may continue
105 * after calling this method.
106 */
107void RunCommandLaunchInfo::
108endExecute()
109{
110 if (!m_has_exec_begun)
111 ARCCORE_FATAL("beginExecute() has to be called before endExecute()");
112 _doEndKernelLaunch();
113}
114
115/*---------------------------------------------------------------------------*/
116/*---------------------------------------------------------------------------*/
117
118void RunCommandLaunchInfo::
119_doEndKernelLaunch()
120{
121 if (m_is_notify_end_kernel_done)
122 return;
123 m_is_notify_end_kernel_done = true;
124 m_command._internalNotifyEndLaunchKernel();
125
126 Impl::RunQueueImpl* q = m_queue_impl;
127 if (!q->isAsync() || m_is_need_barrier || m_is_forced_need_barrier)
128 q->_internalBarrier();
129}
130
131/*---------------------------------------------------------------------------*/
132/*---------------------------------------------------------------------------*/
133
134NativeStream RunCommandLaunchInfo::
135_internalNativeStream()
136{
137 return m_command._internalNativeStream();
138}
139
140/*---------------------------------------------------------------------------*/
141/*---------------------------------------------------------------------------*/
142
143/*!
144 * \brief Calculates the initial value of kernel block/thread/grid
145 * based on \a full_size.
146 */
147void RunCommandLaunchInfo::
148_computeInitialKernelLaunchArgs()
149{
150 int threads_per_block = m_command.nbThreadPerBlock();
151 if (threads_per_block <= 0)
152 threads_per_block = 256;
153 Int64 big_b = (m_total_loop_size + threads_per_block - 1) / threads_per_block;
154 int blocks_per_grid = CheckedConvert::toInt32(big_b);
155 m_kernel_launch_args = KernelLaunchArgs(blocks_per_grid, threads_per_block);
156 m_kernel_launch_args.setSharedMemorySize(m_command._sharedMemory());
157 m_kernel_launch_args.setIsCooperative(m_is_cooperative_launch);
158}
159
160/*---------------------------------------------------------------------------*/
161/*---------------------------------------------------------------------------*/
162
163ParallelLoopOptions RunCommandLaunchInfo::
164computeParallelLoopOptions() const
165{
166 ParallelLoopOptions opt = m_command.parallelLoopOptions();
167 const bool use_dynamic_compute = true;
168 // Calculates a default grain size if it is not specified in
169 // the options. By default, we ensure a number of iterations
170 // equal to 2 times the number of threads used.
171 if (use_dynamic_compute && opt.grainSize() == 0) {
172 Int32 nb_thread = opt.maxThread();
173 if (nb_thread <= 0)
175 if (nb_thread <= 0)
176 nb_thread = 1;
177 Int32 grain_size = static_cast<Int32>((double)m_total_loop_size / (nb_thread * 2.0));
178 opt.setGrainSize(grain_size);
179 }
180 return opt;
181}
182
183/*---------------------------------------------------------------------------*/
184/*---------------------------------------------------------------------------*/
185
186/*!
187 * \brief Calculates the value of m_loop_run_info.
188 *
189 * This is only useful in multi-thread mode.
190 */
191void RunCommandLaunchInfo::
192_computeLoopRunInfo()
193{
194 ForLoopTraceInfo lti(m_command.traceInfo(), m_command.kernelName());
195 m_loop_run_info = ForLoopRunInfo(computeParallelLoopOptions(), lti);
196 m_loop_run_info.setExecStat(m_command._internalCommandExecStat());
197}
198
199/*---------------------------------------------------------------------------*/
200/*---------------------------------------------------------------------------*/
201
202/*!
203 * \brief Determines the kernel configuration.
204 *
205 * The configuration depends on the underlying runtime. For CUDA and ROCM,
206 * it is a number of blocks and threads.
207 *
208 * It is possible to dynamically calculate the optimal values to
209 * maximize occupancy.
210 */
211KernelLaunchArgs RunCommandLaunchInfo::
212_computeKernelLaunchArgs(const void* func) const
213{
214 Impl::IRunnerRuntime* r = m_queue_impl->_internalRuntime();
215
216 return r->computeKernalLaunchArgs(m_kernel_launch_args, func,
217 totalLoopSize());
218}
219
220/*---------------------------------------------------------------------------*/
221/*---------------------------------------------------------------------------*/
222
223void RunCommandLaunchInfo::
224_addSyclEvent(void* sycl_event_ptr)
225{
226 m_command._internalNotifyBeginLaunchKernelSyclEvent(sycl_event_ptr);
227}
228
229/*---------------------------------------------------------------------------*/
230/*---------------------------------------------------------------------------*/
231
232bool RunCommandLaunchInfo::
233_isUseCooperativeLaunch() const
234{
235 // Indicates if cudaLaunchCooperativeKernel() is used
236 return m_is_cooperative_launch;
237}
238/*---------------------------------------------------------------------------*/
239/*---------------------------------------------------------------------------*/
240
241bool RunCommandLaunchInfo::
242_isUseCudaLaunchKernel() const
243{
244 // Indicates if cudaLaunchKernel() is used instead of kernel<<<...>>>.
245 return true;
246}
247
248/*---------------------------------------------------------------------------*/
249/*---------------------------------------------------------------------------*/
250
251void RunCommandLaunchInfo::
252_setIsNeedBarrier(bool v)
253{
254 m_is_need_barrier = v;
255}
256
257/*---------------------------------------------------------------------------*/
258/*---------------------------------------------------------------------------*/
259
260} // namespace Arcane::Accelerator::Impl
261
262/*---------------------------------------------------------------------------*/
263/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
static Int32 maxAllowedThread()
Maximum number of allowed threads for multi-threading.
Loop execution information.
Execution options for a parallel loop in multi-threading.
Integer grainSize() const
Size of an iteration interval.
Int32 maxThread() const
Maximum number of allowed threads.
void setGrainSize(Integer v)
Sets the size (approximate) of an iteration interval.
@ Thread
Multi-threaded execution policy.
bool isAcceleratorPolicy(eExecutionPolicy exec_policy)
Indicates if exec_policy corresponds to an accelerator.
std::int64_t Int64
Signed integer type of 64 bits.
std::int32_t Int32
Signed integer type of 32 bits.