Arcane v3.16.0.0
Developer documentation
KernelLauncher.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* KernelLauncher.h                                            (C) 2000-2024 */
/*                                                                           */
/* Management of compute kernel launches on accelerators.                    */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_KERNELLAUNCHER_H
#define ARCANE_ACCELERATOR_KERNELLAUNCHER_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/core/NativeStream.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
#include "arcane/accelerator/AcceleratorUtils.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif
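
// NOTE: These ARCANE_KERNEL_*_FUNC(a) macros let generic launch code name a
// backend kernel unconditionally: when the corresponding backend is being
// compiled the macro yields the kernel itself, otherwise it yields the
// invalidKernel() / InvalidKernelClass fallbacks defined below (the SYCL
// fallback is a class because the SYCL kernels in this file are functors).
// Illustrative sketch of a caller (hypothetical, not part of this header):
//
//   auto kernel = ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda2) <ItemType, Lambda>;
//   _applyKernelCUDA(launch_info, kernel, func, loop_size);
//
// When CUDA is not being compiled, the macro substitutes invalidKernel, which
// keeps the code well-formed but aborts with ARCANE_FATAL if ever reached.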

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

//! Helper making a per-work-item private copy of an object (typically the user lambda).
template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

//! Class to apply finalization for the additional (remaining) arguments.
class KernelRemainingArgsHelper
{
 public:

  //! Applies the functors of the additional arguments.
  template <typename... RemainingArgs> static inline ARCCORE_DEVICE void
  applyRemainingArgs(Int32 index, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItem(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments.
  template <typename... RemainingArgs> static inline ARCCORE_HOST_DEVICE void
  applyRemainingArgs(sycl::nd_item<1> x, RemainingArgs&... remaining_args)
  {
    // Apply the reductions
    (remaining_args._internalExecWorkItem(x), ...);
  }
#endif
};
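
// NOTE: The calls `(remaining_args._internalExecWorkItem(index), ...)` above
// are C++17 comma fold-expressions over the parameter pack. As an illustration
// with two hypothetical reducer arguments r1 and r2 (not defined in this header),
//   applyRemainingArgs(index, r1, r2);
// expands to
//   r1._internalExecWorkItem(index), r2._internalExecWorkItem(index);
// so every additional argument finalizes its contribution for the current
// work item.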

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    //if (i<10)
    //printf("CUDA %d lid=%d\n",i,lid.localId());
    body(BuilderType::create(i, lid));
  }
}

template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    //if (i<10)
    //printf("CUDA %d lid=%d\n",i,lid.localId());
    body(i);
  }
}

template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}

template <typename TraitsType, typename Lambda, typename... RemainingArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}

template <typename ItemType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i, remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  // TODO: remove once the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), remaining_args...);
  }
  KernelRemainingArgsHelper::applyRemainingArgs(i, remaining_args...);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_SYCL)

template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};

template <typename TraitsType, typename Lambda, typename... RemainingArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, RemainingArgs... remaining_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), remaining_args...);
    }
    KernelRemainingArgsHelper::applyRemainingArgs(x, remaining_args...);
  }
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Empty function to simulate an invalid kernel that was not compiled with
// the appropriate compiler. It should normally never be called.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

// Empty class playing the same role as invalidKernel() for the SYCL backend,
// whose kernels are functor classes.
template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

//! Generic function to execute a CUDA kernel.
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  Int32 wanted_shared_memory = 0;
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  cudaStream_t s = CudaUtils::toNativeStream(launch_info._internalNativeStream());
  // TODO: use cudaLaunchKernel() instead.
  kernel<<<tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

//! Generic function to execute a HIP kernel.
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  Int32 wanted_shared_memory = 0;
  auto tbi = launch_info._threadBlockInfo(reinterpret_cast<const void*>(kernel), wanted_shared_memory);
  hipStream_t s = HipUtils::toNativeStream(launch_info._internalNativeStream());
  hipLaunchKernelGGL(kernel, tbi.nbBlockPerGrid(), tbi.nbThreadPerBlock(), wanted_shared_memory, s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

//! Generic function to execute a SYCL kernel.
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... remaining_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue s = SyclUtils::toNativeStream(launch_info._internalNativeStream());
  sycl::event event;
  if constexpr (sizeof...(RemainingArgs) > 0) {
    auto tbi = launch_info.kernelLaunchArgs();
    Int32 b = tbi.nbBlockPerGrid();
    Int32 t = tbi.nbThreadPerBlock();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s.parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, remaining_args...); });
  }
  else {
    sycl::range<1> loop_size = launch_info.totalLoopSize();
    event = s.parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}
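
// NOTE: Two launch forms are used in _applyKernelSYCL above: with a non-empty
// RemainingArgs pack the kernel is launched on a sycl::nd_range<1> built from
// the block/thread configuration, presumably because the additional arguments
// need the sycl::nd_item<1> that applyRemainingArgs() takes; with no additional
// arguments a plain sycl::range<1> over the total loop size is enough.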

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#define ARCANE_MACRO_PARENS ()

// The following three macros make it possible to recursively generate a set
// of parameters. To support more parameters, one can add further calls to the
// next macro inside each macro.
// More information here: https://stackoverflow.com/questions/70238923/how-to-expand-a-recursive-macro-via-va-opt-in-a-nested-context
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

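// NOTE: Illustrative expansion with two hypothetical arguments v1 and v2 (not
// part of the definitions above). ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(v1, v2)
// first produces
//   , decltype(v1)& v1 ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ()(v2)
// and the extra rescanning passes provided by ARCANE_MACRO_EXPAND turn the
// trailing part into a new ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(v2) call,
// yielding in the end
//   , decltype(v1)& v1 , decltype(v2)& v2
// Each ARCANE_MACRO_EXPAND* level adds one rescanning pass, which is what limits
// the number of supported arguments.
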
/*
 * \brief Macro to generate the lambda arguments.
 *
 * This macro is internal to Arcane and must not be used outside of Arcane.
 *
 * For each argument \a arg, this macro generates a value `decltype(arg)& arg`.
 *
 * For example:
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(value1,value2)
 * // This generates the following code:
 * , decltype(value1)& value1 , decltype(value2)& value2
 * \endcode
 */
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))


/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif