Arcane  v4.1.4.0
Documentation utilisateur
Chargement...
Recherche...
Aucune correspondance
RunCommandLaunchInfo.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* RunCommandLaunchInfo.cc (C) 2000-2026 */
9/* */
10/* Informations pour l'exécution d'une 'RunCommand'. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/common/accelerator/RunCommandLaunchInfo.h"
15
16#include "arccore/base/FatalErrorException.h"
17#include "arccore/base/CheckedConvert.h"
18#include "arccore/base/ConcurrencyBase.h"
19
20#include "arccore/common/accelerator/KernelLaunchArgs.h"
21#include "arccore/common/accelerator/RunCommand.h"
22#include "arccore/common/accelerator/NativeStream.h"
23#include "arccore/common/accelerator/internal/RunQueueImpl.h"
24#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
25
26/*---------------------------------------------------------------------------*/
27/*---------------------------------------------------------------------------*/
28
29namespace Arcane::Accelerator::Impl
30{
31
32/*---------------------------------------------------------------------------*/
33/*---------------------------------------------------------------------------*/
34
35void RunCommandLaunchInfo::
36_init()
37{
38 m_queue_impl = m_command._internalQueueImpl();
39 m_exec_policy = m_queue_impl->executionPolicy();
40
41 // Le calcul des informations de kernel n'est utile que sur accélérateur
42 if (isAcceleratorPolicy(m_exec_policy)) {
43 _computeInitialKernelLaunchArgs();
44 m_command._allocateReduceMemory(m_kernel_launch_args.nbBlockPerGrid());
45 }
46}
47
48/*---------------------------------------------------------------------------*/
49/*---------------------------------------------------------------------------*/
50
51RunCommandLaunchInfo::
52RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size)
53: m_command(command)
54, m_total_loop_size(total_loop_size)
55{
56 _init();
57}
58
59/*---------------------------------------------------------------------------*/
60/*---------------------------------------------------------------------------*/
61
62RunCommandLaunchInfo::
63RunCommandLaunchInfo(RunCommand& command, Int64 total_loop_size, bool is_cooperative)
64: m_command(command)
65, m_total_loop_size(total_loop_size)
66, m_is_cooperative_launch(is_cooperative)
67{
68 _init();
69}
70
71/*---------------------------------------------------------------------------*/
72/*---------------------------------------------------------------------------*/
73
74RunCommandLaunchInfo::
75~RunCommandLaunchInfo()
76{
77 // Notifie de la fin de lancement du noyau. Normalement, cela est déjà fait
78 // sauf s'il y a eu une exception pendant le lancement du noyau de calcul.
79 _doEndKernelLaunch();
80}
81
82/*---------------------------------------------------------------------------*/
83/*---------------------------------------------------------------------------*/
84
85void RunCommandLaunchInfo::
86beginExecute()
87{
88 if (m_has_exec_begun)
89 ARCCORE_FATAL("beginExecute() has already been called");
90 m_has_exec_begun = true;
91 m_command._internalNotifyBeginLaunchKernel();
92 if (m_exec_policy == eExecutionPolicy::Thread)
93 _computeLoopRunInfo();
94}
95
96/*---------------------------------------------------------------------------*/
97/*---------------------------------------------------------------------------*/
98/*!
99 * \brief Notifie de la fin de lancement de la commande.
100 *
101 * A noter que si la commande est asynchrone, son exécution peut continuer
102 * après l'appel à cette méthode.
103 */
104void RunCommandLaunchInfo::
105endExecute()
106{
107 if (!m_has_exec_begun)
108 ARCCORE_FATAL("beginExecute() has to be called before endExecute()");
109 _doEndKernelLaunch();
110}
111
112/*---------------------------------------------------------------------------*/
113/*---------------------------------------------------------------------------*/
114
115void RunCommandLaunchInfo::
116_doEndKernelLaunch()
117{
118 if (m_is_notify_end_kernel_done)
119 return;
120 m_is_notify_end_kernel_done = true;
121 m_command._internalNotifyEndLaunchKernel();
122
123 Impl::RunQueueImpl* q = m_queue_impl;
124 if (!q->isAsync() || m_is_need_barrier)
125 q->_internalBarrier();
126}
127
128/*---------------------------------------------------------------------------*/
129/*---------------------------------------------------------------------------*/
130
131NativeStream RunCommandLaunchInfo::
132_internalNativeStream()
133{
134 return m_command._internalNativeStream();
135}
136
137/*---------------------------------------------------------------------------*/
138/*---------------------------------------------------------------------------*/
139/*!
140 * \brief Calcule la valeur initiale de block/thread/grille du noyau
141 * en fonction de \a full_size.
142 */
143void RunCommandLaunchInfo::
144_computeInitialKernelLaunchArgs()
145{
146 int threads_per_block = m_command.nbThreadPerBlock();
147 if (threads_per_block<=0)
148 threads_per_block = 256;
149 Int64 big_b = (m_total_loop_size + threads_per_block - 1) / threads_per_block;
150 int blocks_per_grid = CheckedConvert::toInt32(big_b);
151 m_kernel_launch_args = KernelLaunchArgs(blocks_per_grid, threads_per_block);
152 m_kernel_launch_args.setSharedMemorySize(m_command._sharedMemory());
153 m_kernel_launch_args.setIsCooperative(m_is_cooperative_launch);
154}
155
156/*---------------------------------------------------------------------------*/
157/*---------------------------------------------------------------------------*/
158
159ParallelLoopOptions RunCommandLaunchInfo::
160computeParallelLoopOptions() const
161{
162 ParallelLoopOptions opt = m_command.parallelLoopOptions();
163 const bool use_dynamic_compute = true;
164 // Calcule une taille de grain par défaut si cela n'est pas renseigné dans
165 // les options. Par défaut on fait en sorte de faire un nombre d'itérations
166 // égale à 2 fois le nombre de threads utilisés.
167 if (use_dynamic_compute && opt.grainSize() == 0) {
168 Int32 nb_thread = opt.maxThread();
169 if (nb_thread <= 0)
171 if (nb_thread <= 0)
172 nb_thread = 1;
173 Int32 grain_size = static_cast<Int32>((double)m_total_loop_size / (nb_thread * 2.0));
174 opt.setGrainSize(grain_size);
175 }
176 return opt;
177}
178
179/*---------------------------------------------------------------------------*/
180/*---------------------------------------------------------------------------*/
181/*!
182 * \brief Calcule la valeur de m_loop_run_info.
183 *
184 * Cela n'est utile qu'en mode multi-thread.
185 */
186void RunCommandLaunchInfo::
187_computeLoopRunInfo()
188{
189 ForLoopTraceInfo lti(m_command.traceInfo(), m_command.kernelName());
190 m_loop_run_info = ForLoopRunInfo(computeParallelLoopOptions(), lti);
191 m_loop_run_info.setExecStat(m_command._internalCommandExecStat());
192}
193
194/*---------------------------------------------------------------------------*/
195/*---------------------------------------------------------------------------*/
196/*!
197 * \brief Détermine la configuration du kernel.
198 *
199 * La configuration est dépendante du runtime sous-jacent. Pour CUDA et ROCM,
200 * il s'agit d'un nombre de blocs et de thread.
201 *
202 * Il est possible de calculer dynamiquement les valeurs optimales pour
203 * maximiser l'occupation.
204 */
205KernelLaunchArgs RunCommandLaunchInfo::
206_computeKernelLaunchArgs(const void* func) const
207{
208 Impl::IRunnerRuntime* r = m_queue_impl->_internalRuntime();
209
210 return r->computeKernalLaunchArgs(m_kernel_launch_args, func,
211 totalLoopSize());
212}
213
214/*---------------------------------------------------------------------------*/
215/*---------------------------------------------------------------------------*/
216
217void RunCommandLaunchInfo::
218_addSyclEvent(void* sycl_event_ptr)
219{
220 m_command._internalNotifyBeginLaunchKernelSyclEvent(sycl_event_ptr);
221}
222
223/*---------------------------------------------------------------------------*/
224/*---------------------------------------------------------------------------*/
225
226bool RunCommandLaunchInfo::
227_isUseCooperativeLaunch() const
228{
229 // Indique si on utilise cudaLaunchCooperativeKernel()
230 return m_is_cooperative_launch;
231}
232/*---------------------------------------------------------------------------*/
233/*---------------------------------------------------------------------------*/
234
235bool RunCommandLaunchInfo::
236_isUseCudaLaunchKernel() const
237{
238 // Indique si on utilise cudaLaunchKernel() au lieu de kernel<<<...>>>.
239 return true;
240}
241
242/*---------------------------------------------------------------------------*/
243/*---------------------------------------------------------------------------*/
244
245void RunCommandLaunchInfo::
246_setIsNeedBarrier(bool v)
247{
248 m_is_need_barrier = v;
249}
250
251/*---------------------------------------------------------------------------*/
252/*---------------------------------------------------------------------------*/
253
254} // End namespace Arcane::Accelerator::Impl
255
256/*---------------------------------------------------------------------------*/
257/*---------------------------------------------------------------------------*/
#define ARCCORE_FATAL(...)
Macro envoyant une exception FatalErrorException.
static Int32 maxAllowedThread()
Nombre maximum de threads autorisés pour le multi-threading.
Informations d'exécution d'une boucle.
Informations de trace pour une boucle 'for'.
Options d'exécution d'une boucle parallèle en multi-thread.
Integer grainSize() const
Taille d'un intervalle d'itération.
Int32 maxThread() const
Nombre maximal de threads autorisés.
void setGrainSize(Integer v)
Positionne la taille (approximative) d'un intervalle d'itération.
@ Thread
Politique d'exécution multi-thread.
bool isAcceleratorPolicy(eExecutionPolicy exec_policy)
Indique si exec_policy correspond à un accélérateur.
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.