Arcane  v3.15.0.0
Documentation utilisateur
Chargement...
Recherche...
Aucune correspondance
KernelLauncher.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* KernelLauncher.h (C) 2000-2024 */
9/* */
10/* Gestion du lancement des noyaux de calcul sur accélérateur. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
13#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "arcane/utils/CheckedConvert.h"
18#include "arcane/utils/LoopRanges.h"
19
20#include "arcane/accelerator/core/NativeStream.h"
21#include "arcane/accelerator/AcceleratorGlobal.h"
22#include "arcane/accelerator/RunCommandLaunchInfo.h"
23#include "arcane/accelerator/AcceleratorUtils.h"
24
25/*---------------------------------------------------------------------------*/
26/*---------------------------------------------------------------------------*/
27
28#if defined(ARCANE_COMPILING_CUDA)
29#define ARCANE_KERNEL_CUDA_FUNC(a) a
30#else
31#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
32#endif
33
34#if defined(ARCANE_COMPILING_HIP)
35#define ARCANE_KERNEL_HIP_FUNC(a) a
36#else
37#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
38#endif
39
40#if defined(ARCANE_COMPILING_SYCL)
41#define ARCANE_KERNEL_SYCL_FUNC(a) a
42#else
43#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
44#endif
45
46/*---------------------------------------------------------------------------*/
47/*---------------------------------------------------------------------------*/
48
49namespace Arcane::Accelerator::impl
50{
51
52template <typename T>
54{
55 using value_type = T;
56 using reference_type = value_type&;
57 value_type m_private_copy;
58
59 ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
60 ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
61};
62
63template <typename T>
64ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
65{
66 return Privatizer<T>{item};
67}
68
69/*---------------------------------------------------------------------------*/
70/*---------------------------------------------------------------------------*/
71/*!
72 * \brief Classe pour appliquer la finalisation pour les arguments supplémentaires.
73 */
75{
76 public:
77
78 //! Applique les fonctors des arguments additionnels.
79 template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
80 applyRemainingArgs(Int32 index, RemainingArgs&... remaining_args)
81 {
82 // Applique les réductions
83 (remaining_args._internalExecWorkItem(index), ...);
84 }
85
86#if defined(ARCANE_COMPILING_SYCL)
87 //! Applique les fonctors des arguments additionnels.
88 template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
89 applyRemainingArgs(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
90 {
91 // Applique les réductions
92 (remaining_args._internalExecWorkItem(x), ...);
93 }
94#endif
95};
96
97/*---------------------------------------------------------------------------*/
98/*---------------------------------------------------------------------------*/
99
100#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
101
102template <typename BuilderType, typename Lambda> __global__ void
103doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
104{
105 using LocalIdType = BuilderType::ValueType;
106
107 auto privatizer = privatize(func);
108 auto& body = privatizer.privateCopy();
109
110 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
111 if (i < ids.size()) {
112 LocalIdType lid(ids[i]);
113 //if (i<10)
114 //printf("CUDA %d lid=%d\n",i,lid.localId());
115 body(BuilderType::create(i, lid));
116 }
117}
118
119template <typename ItemType, typename Lambda> __global__ void
120doDirectGPULambda(Int32 vsize, Lambda func)
121{
122 auto privatizer = privatize(func);
123 auto& body = privatizer.privateCopy();
124
125 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
126 if (i < vsize) {
127 //if (i<10)
128 //printf("CUDA %d lid=%d\n",i,lid.localId());
129 body(i);
130 }
131}
132
133template <typename LoopBoundType, typename Lambda> __global__ void
134doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
135{
136 auto privatizer = privatize(func);
137 auto& body = privatizer.privateCopy();
138
139 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
140 if (i < bounds.nbElement()) {
141 body(bounds.getIndices(i));
142 }
143}
144
145template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
146doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
147{
148 using BuilderType = TraitsType::BuilderType;
149 using LocalIdType = BuilderType::ValueType;
150
151 // TODO: a supprimer quand il n'y aura plus les anciennes réductions
152 auto privatizer = privatize(func);
153 auto& body = privatizer.privateCopy();
154
155 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
156 if (i < ids.size()) {
157 LocalIdType lid(ids[i]);
158 body(BuilderType::create(i, lid), remaining_args...);
159 }
161}
162
163template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
164doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
165{
166 // TODO: a supprimer quand il n'y aura plus les anciennes réductions
167 auto privatizer = privatize(func);
168 auto& body = privatizer.privateCopy();
169
170 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
171 if (i < vsize) {
172 body(i, remaining_args...);
173 }
175}
176
177template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
178doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
179{
180 // TODO: a supprimer quand il n'y aura plus les anciennes réductions
181 auto privatizer = privatize(func);
182 auto& body = privatizer.privateCopy();
183
184 Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
185 if (i < bounds.nbElement()) {
186 body(bounds.getIndices(i), remaining_args...);
187 }
189}
190
191/*---------------------------------------------------------------------------*/
192/*---------------------------------------------------------------------------*/
193
194#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP
195
196/*---------------------------------------------------------------------------*/
197/*---------------------------------------------------------------------------*/
198
199#if defined(ARCANE_COMPILING_SYCL)
200
201//! Boucle N-dimension sans indirection
202template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
203class DoDirectSYCLLambdaArrayBounds
204{
205 public:
206
207 void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
208 {
209 auto privatizer = privatize(func);
210 auto& body = privatizer.privateCopy();
211
212 Int32 i = static_cast<Int32>(x.get_global_id(0));
213 if (i < bounds.nbElement()) {
214 body(bounds.getIndices(i), remaining_args...);
215 }
216 KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
217 }
218 void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
219 {
220 auto privatizer = privatize(func);
221 auto& body = privatizer.privateCopy();
222
223 Int32 i = static_cast<Int32>(x);
224 if (i < bounds.nbElement()) {
225 body(bounds.getIndices(i));
226 }
227 }
228};
229
230//! Boucle 1D avec indirection
231template <typename TraitsType, typename Lambda, typename... RemainingArgs>
232class DoIndirectSYCLLambda
233{
234 public:
235
236 void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
237 {
238 using BuilderType = TraitsType::BuilderType;
239 using LocalIdType = BuilderType::ValueType;
240 auto privatizer = privatize(func);
241 auto& body = privatizer.privateCopy();
242
243 Int32 i = static_cast<Int32>(x.get_global_id(0));
244 if (i < ids.size()) {
245 LocalIdType lid(ids[i]);
246 body(BuilderType::create(i, lid), remaining_args...);
247 }
248 KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
249 }
250 void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
251 {
252 using BuilderType = TraitsType::BuilderType;
253 using LocalIdType = BuilderType::ValueType;
254 auto privatizer = privatize(func);
255 auto& body = privatizer.privateCopy();
256
257 Int32 i = static_cast<Int32>(x);
258 if (i < ids.size()) {
259 LocalIdType lid(ids[i]);
260 body(BuilderType::create(i, lid));
261 }
262 }
263};
264
265#endif
266
267/*---------------------------------------------------------------------------*/
268/*---------------------------------------------------------------------------*/
269
270template<typename Lambda>
271void doDirectThreadLambda(Integer begin,Integer size,Lambda func)
272{
273 auto privatizer = privatize(func);
274 auto& body = privatizer.privateCopy();
275
276 for( Int32 i=0; i<size; ++i ){
277 func(begin+i);
278 }
279}
280
281/*---------------------------------------------------------------------------*/
282/*---------------------------------------------------------------------------*/
283
284// Fonction vide pour simuler un noyau invalide car non compilé avec
285// le compilateur adéquant. Ne devrait normalement pas être appelé.
286template<typename Lambda,typename... LambdaArgs>
287inline void invalidKernel(Lambda&,const LambdaArgs&...)
288{
289 ARCANE_FATAL("Invalid kernel");
290}
291
292template<typename Lambda,typename... LambdaArgs>
294{
295};
296
297/*---------------------------------------------------------------------------*/
298/*---------------------------------------------------------------------------*/
299
300/*!
301 * \brief Fonction générique pour exécuter un kernel CUDA.
302 *
303 * \param kernel noyau CUDA
304 * \param func fonction à exécuter par le noyau
305 * \param args arguments de la fonction lambda
306 *
307 * TODO: Tester si Lambda est bien une fonction, le SFINAE étant peu lisible :
308 * typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
309 * attendons les concepts c++20 (requires)
310 *
311 */
312template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
313_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
314 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
315{
316#if defined(ARCANE_COMPILING_CUDA)
317 Int32 wanted_shared_memory = 0;
318 auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
319 cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
320 // TODO: utiliser cudaLaunchKernel() à la place.
321 kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s>>>(args, func, other_args...);
322#else
323 ARCANE_UNUSED(launch_info);
324 ARCANE_UNUSED(kernel);
325 ARCANE_UNUSED(func);
326 ARCANE_UNUSED(args);
327 ARCANE_FATAL_NO_CUDA_COMPILATION();
328#endif
329}
330
331/*---------------------------------------------------------------------------*/
332/*---------------------------------------------------------------------------*/
333/*!
334 * \brief Fonction générique pour exécuter un kernel HIP.
335 *
336 * \param kernel noyau HIP
337 * \param func fonction à exécuter par le noyau
338 * \param args arguments de la fonction lambda
339 */
340template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
341_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
342 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
343{
344#if defined(ARCANE_COMPILING_HIP)
345 Int32 wanted_shared_memory = 0;
346 auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
347 hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
348 hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
349#else
350 ARCANE_UNUSED(launch_info);
351 ARCANE_UNUSED(kernel);
352 ARCANE_UNUSED(func);
353 ARCANE_UNUSED(args);
354 ARCANE_FATAL_NO_HIP_COMPILATION();
355#endif
356}
357
358/*---------------------------------------------------------------------------*/
359/*---------------------------------------------------------------------------*/
360/*!
361 * \brief Fonction générique pour exécuter un kernel SYCL.
362 *
363 * \param kernel noyau SYCL
364 * \param func fonction à exécuter par le noyau
365 * \param args arguments de la fonction lambda
366 */
367template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
368void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
369 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
370{
371#if defined(ARCANE_COMPILING_SYCL)
372 sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
373 sycl::event event;
374 if constexpr (sizeof...(RemainingArgs) > 0) {
375 auto tbi = launch_info.kernelLaunchArgs();
376 Int32 b = tbi.nbBlockPerGrid();
377 Int32 t = tbi.nbThreadPerBlock();
378 sycl::nd_range<1> loop_size(b * t, t);
379 event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
380 }
381 else {
382 sycl::range<1> loop_size = launch_info.totalLoopSize();
383 event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
384 }
385 launch_info._addSyclEvent(&event);
386#else
387 ARCANE_UNUSED(launch_info);
388 ARCANE_UNUSED(kernel);
389 ARCANE_UNUSED(func);
390 ARCANE_UNUSED(args);
391 ARCANE_FATAL_NO_SYCL_COMPILATION();
392#endif
393}
394
395/*---------------------------------------------------------------------------*/
396/*---------------------------------------------------------------------------*/
397
398} // End namespace Arcane::Accelerator::impl
399
400/*---------------------------------------------------------------------------*/
401/*---------------------------------------------------------------------------*/
402
403#define ARCANE_MACRO_PARENS ()
404
405// Les trois macros suivantes permettent de générer récursivement un ensemble
406// de paramètres. Si on veut supporter plus de paramètres, on peut ajouter
407// des appels à la macro suivante dans chaque macro.
408// Plus d'info ici: https://stackoverflow.com/questions/70238923/how-to-expand-a-recursive-macro-via-va-opt-in-a-nested-context
409#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
410#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
411#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__
412
413#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
414 , decltype(a1)& a1 \
415 __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))
416
417#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER
418
419/*
420 * \brief Macro pour générer les arguments de la lambda.
421 *
422 * Cette macro est interne à Arcane et ne doit pas être utilisée en dehors de Arcane.
423 *
424 * Cette macro permet de générer pour chaque argument \a arg une valeur `decltype(arg)& arg`.
425 *
426 * Par exemple:
427 * \code
428 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(value1,value2)
429 * // Cela génère le code suivant:
430 * , decltype(value1)&, decltype(value2)&
431 * \encode
432 */
433#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
434 __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
435
436
437/*---------------------------------------------------------------------------*/
438/*---------------------------------------------------------------------------*/
439
440#endif
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Classe pour appliquer la finalisation pour les arguments supplémentaires.
static ARCCORE_DEVICE void applyRemainingArgs(Int32 index, RemainingArgs &... remaining_args)
Applique les fonctors des arguments additionnels.
Int64 totalLoopSize() const
Taille totale de la boucle.
KernelLaunchArgs kernelLaunchArgs() const
Informations sur le nombre de block/thread/grille du noyau à lancer.
Vue d'un tableau d'éléments de type T.
Definition Span.h:670
Int32 LocalIdType
Type des entiers utilisés pour stocker les identifiants locaux des entités.
Int32 Integer
Type représentant un entier.
std::int32_t Int32
Type entier signé sur 32 bits.