Arcane v3.14.10.0
User documentation
RunQueueInternal.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* RunQueueInternal.h                                          (C) 2000-2024 */
/*                                                                           */
/* Implementation of the management of an execution queue on an accelerator. */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_RUNQUEUEINTERNAL_H
#define ARCANE_ACCELERATOR_RUNQUEUEINTERNAL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#include "arcane/utils/CheckedConvert.h"
#include "arcane/utils/LoopRanges.h"

#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"

#if defined(ARCANE_COMPILING_HIP)
#include <hip/hip_runtime.h>
#endif
#if defined(ARCANE_COMPILING_SYCL)
#include <sycl/sycl.hpp>
#endif

#include <tuple>

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA)
#define ARCANE_KERNEL_CUDA_FUNC(a) a
#else
#define ARCANE_KERNEL_CUDA_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_HIP)
#define ARCANE_KERNEL_HIP_FUNC(a) a
#else
#define ARCANE_KERNEL_HIP_FUNC(a) Arcane::Accelerator::impl::invalidKernel
#endif

#if defined(ARCANE_COMPILING_SYCL)
#define ARCANE_KERNEL_SYCL_FUNC(a) a
#else
#define ARCANE_KERNEL_SYCL_FUNC(a) Arcane::Accelerator::impl::InvalidKernelClass
#endif

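// Note (illustrative): these selection macros let the generic launch helpers
// at the end of this file name every kernel flavor unconditionally. When the
// matching backend is being compiled, ARCANE_KERNEL_XXX_FUNC(a) expands to 'a'
// itself; otherwise it expands to the invalidKernel() fallback (or the
// InvalidKernelClass placeholder for SYCL) defined below, which aborts if it
// is ever reached:
//
//   ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambda) // -> doDirectGPULambda (CUDA enabled)
//                                              // -> invalidKernel (CUDA disabled)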
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

template <typename T>
struct Privatizer
{
  using value_type = T;
  using reference_type = value_type&;
  value_type m_private_copy;

  ARCCORE_HOST_DEVICE Privatizer(const T& o) : m_private_copy{o} {}
  ARCCORE_HOST_DEVICE reference_type privateCopy() { return m_private_copy; }
};

template <typename T>
ARCCORE_HOST_DEVICE auto privatize(const T& item) -> Privatizer<T>
{
  return Privatizer<T>{item};
}

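// Note (illustrative): privatize() gives each kernel instantiation its own
// copy of the user lambda, so state captured by value is not shared between
// work items. All the kernels below follow the same pattern:
//
//   auto privatizer = privatize(func);     // copy the lambda
//   auto& body = privatizer.privateCopy(); // reference the private copy
//   body(i);                               // invoke it for this work item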
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Class to apply the finalization of reductions.
 */
class KernelReducerHelper
{
 public:

  //! Applies the functors of the additional arguments.
  template <typename... ReducerArgs> static inline ARCCORE_DEVICE void
  applyReducerArgs(Int32 index, ReducerArgs&... reducer_args)
  {
    // Apply the reductions
    (reducer_args._internalExecWorkItem(index), ...);
  }

#if defined(ARCANE_COMPILING_SYCL)
  //! Applies the functors of the additional arguments.
  template <typename... ReducerArgs> static inline ARCCORE_HOST_DEVICE void
  applyReducerArgs(sycl::nd_item<1> x, ReducerArgs&... reducer_args)
  {
    // Apply the reductions
    (reducer_args._internalExecWorkItem(x), ...);
  }
#endif
};

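// Illustrative sketch (not part of the original header): a type passed as an
// additional argument only needs an _internalExecWorkItem() member; the fold
// expressions above call it once per reducer argument. A minimal compatible
// type could look like:
//
//   class MyReducer
//   {
//    public:
//     ARCCORE_DEVICE void _internalExecWorkItem(Int32 index)
//     {
//       // fold this work item's partial value into the final result
//     }
//   };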
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

template <typename BuilderType, typename Lambda> __global__ void
doIndirectGPULambda(SmallSpan<const Int32> ids, Lambda func)
{
  using LocalIdType = BuilderType::ValueType;

  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid));
  }
}

template <typename ItemType, typename Lambda> __global__ void
doDirectGPULambda(Int32 vsize, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i);
  }
}

template <typename LoopBoundType, typename Lambda> __global__ void
doDirectGPULambdaArrayBounds(LoopBoundType bounds, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i));
  }
}

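// Note (illustrative): the three kernels above use the canonical
// one-thread-per-element scheme. For a launch of 'b' blocks of 't' threads,
// i = blockDim.x * blockIdx.x + threadIdx.x spans [0, b*t) and the bounds
// test discards the surplus threads of the last block. Assuming the usual
// rounding rule, the block count would be computed as:
//
//   Int32 b = (size + t - 1) / t; // hypothetical; the actual values come
//                                 // from launch_info.threadBlockInfo()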
template <typename TraitsType, typename Lambda, typename... ReducerArgs> __global__ void
doIndirectGPULambda2(SmallSpan<const Int32> ids, Lambda func, ReducerArgs... reducer_args)
{
  using BuilderType = TraitsType::BuilderType;
  using LocalIdType = BuilderType::ValueType;

  // TODO: remove when the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < ids.size()) {
    LocalIdType lid(ids[i]);
    body(BuilderType::create(i, lid), reducer_args...);
  }
  KernelReducerHelper::applyReducerArgs(i, reducer_args...);
}

template <typename ItemType, typename Lambda, typename... ReducerArgs> __global__ void
doDirectGPULambda2(Int32 vsize, Lambda func, ReducerArgs... reducer_args)
{
  // TODO: remove when the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < vsize) {
    body(i, reducer_args...);
  }
  KernelReducerHelper::applyReducerArgs(i, reducer_args...);
}

template <typename LoopBoundType, typename Lambda, typename... ReducerArgs> __global__ void
doDirectGPULambdaArrayBounds2(LoopBoundType bounds, Lambda func, ReducerArgs... reducer_args)
{
  // TODO: remove when the old reductions are no longer used
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;
  if (i < bounds.nbElement()) {
    body(bounds.getIndices(i), reducer_args...);
  }
  KernelReducerHelper::applyReducerArgs(i, reducer_args...);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif // ARCANE_COMPILING_CUDA || ARCANE_COMPILING_HIP

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#if defined(ARCANE_COMPILING_SYCL)

//! N-dimensional loop without indirection
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class DoDirectSYCLLambdaArrayBounds
{
 public:

  void operator()(sycl::nd_item<1> x, LoopBoundType bounds, Lambda func, RemainingArgs... reducer_args) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i), reducer_args...);
    }
    KernelReducerHelper::applyReducerArgs(x, reducer_args...);
  }
  void operator()(sycl::id<1> x, LoopBoundType bounds, Lambda func) const
  {
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < bounds.nbElement()) {
      body(bounds.getIndices(i));
    }
  }
};

//! 1D loop with indirection
template <typename TraitsType, typename Lambda, typename... ReducerArgs>
class DoIndirectSYCLLambda
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<const Int32> ids, Lambda func, ReducerArgs... reducer_args) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x.get_global_id(0));
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid), reducer_args...);
    }
    KernelReducerHelper::applyReducerArgs(x, reducer_args...);
  }
  void operator()(sycl::id<1> x, SmallSpan<const Int32> ids, Lambda func) const
  {
    using BuilderType = TraitsType::BuilderType;
    using LocalIdType = BuilderType::ValueType;
    auto privatizer = privatize(func);
    auto& body = privatizer.privateCopy();

    Int32 i = static_cast<Int32>(x);
    if (i < ids.size()) {
      LocalIdType lid(ids[i]);
      body(BuilderType::create(i, lid));
    }
  }
};

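// Note (illustrative): each functor has two operator() overloads because the
// two SYCL launch paths in _applyKernelSYCL() below differ. A parallel_for
// over a sycl::nd_range<1> passes a sycl::nd_item<1> (required by the
// reduction helpers, which need work-group information), while a parallel_for
// over a plain sycl::range<1> passes a sycl::id<1> and is used when there is
// no additional reducer argument. Sketch of the first path:
//
//   queue.parallel_for(sycl::nd_range<1>(b * t, t),
//                      [=](sycl::nd_item<1> i) { functor(i, bounds, func, reducers...); });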
#endif

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename Lambda>
void doDirectThreadLambda(Integer begin, Integer size, Lambda func)
{
  auto privatizer = privatize(func);
  auto& body = privatizer.privateCopy();

  for (Int32 i = 0; i < size; ++i) {
    body(begin + i);
  }
}
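// Illustrative sketch: the sequential host fallback simply walks the index
// range. With a hypothetical lambda:
//
//   auto func = [](Int32 i) { /* work for index i */ };
//   doDirectThreadLambda(0, 100, func); // invokes the body for i = 0..99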

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Empty function to simulate a kernel that is invalid because it was not
// compiled with the proper compiler. It should normally never be called.
template <typename Lambda, typename... LambdaArgs>
inline void invalidKernel(Lambda&, const LambdaArgs&...)
{
  ARCANE_FATAL("Invalid kernel");
}

template <typename Lambda, typename... LambdaArgs>
class InvalidKernelClass
{
};

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

/*!
 * \brief Generic function to execute a CUDA kernel.
 *
 * \param kernel CUDA kernel
 * \param func function to be executed by the kernel
 * \param args arguments for the lambda function
 *
 * TODO: Check that Lambda really is a function; since the SFINAE for that is
 * hard to read:
 * typename std::enable_if_t<std::is_function_v<std::decay_t<Lambda> > >* = nullptr
 * wait for C++20 concepts (requires).
 */
template <typename CudaKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelCUDA(impl::RunCommandLaunchInfo& launch_info, const CudaKernel& kernel, Lambda& func,
                 const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_CUDA)
  auto [b, t] = launch_info.threadBlockInfo();
  cudaStream_t* s = reinterpret_cast<cudaStream_t*>(launch_info._internalStreamImpl());
  // TODO: use cudaLaunchKernel() instead.
  kernel<<<b, t, 0, *s>>>(args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_CUDA_COMPILATION();
#endif
}

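// Illustrative sketch (hypothetical call site): the launch helpers are meant
// to be combined with the ARCANE_KERNEL_XXX_FUNC selection macros, roughly:
//
//   _applyKernelCUDA(launch_info,
//                    ARCANE_KERNEL_CUDA_FUNC(doDirectGPULambdaArrayBounds) <LoopBoundType, Lambda>,
//                    func, bounds);
//
// Without ARCANE_COMPILING_CUDA the #else branch raises
// ARCANE_FATAL_NO_CUDA_COMPILATION() instead of launching anything.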
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a HIP kernel.
 *
 * \param kernel HIP kernel
 * \param func function to be executed by the kernel
 * \param args arguments for the lambda function
 */
template <typename HipKernel, typename Lambda, typename LambdaArgs, typename... RemainingArgs> void
_applyKernelHIP(impl::RunCommandLaunchInfo& launch_info, const HipKernel& kernel, const Lambda& func,
                const LambdaArgs& args, [[maybe_unused]] const RemainingArgs&... other_args)
{
#if defined(ARCANE_COMPILING_HIP)
  auto [b, t] = launch_info.threadBlockInfo();
  hipStream_t* s = reinterpret_cast<hipStream_t*>(launch_info._internalStreamImpl());
  hipLaunchKernelGGL(kernel, b, t, 0, *s, args, func, other_args...);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_HIP_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
/*!
 * \brief Generic function to execute a SYCL kernel.
 *
 * \param kernel SYCL kernel
 * \param func function to be executed by the kernel
 * \param args arguments for the lambda function
 */
template <typename SyclKernel, typename Lambda, typename LambdaArgs, typename... ReducerArgs>
void _applyKernelSYCL(impl::RunCommandLaunchInfo& launch_info, SyclKernel kernel, Lambda& func,
                      const LambdaArgs& args, [[maybe_unused]] const ReducerArgs&... reducer_args)
{
#if defined(ARCANE_COMPILING_SYCL)
  sycl::queue* s = reinterpret_cast<sycl::queue*>(launch_info._internalStreamImpl());
  sycl::event event;
  if constexpr (sizeof...(ReducerArgs) > 0) {
    auto [b, t] = launch_info.threadBlockInfo();
    sycl::nd_range<1> loop_size(b * t, t);
    event = s->parallel_for(loop_size, [=](sycl::nd_item<1> i) { kernel(i, args, func, reducer_args...); });
  }
  else {
    sycl::range<1> loop_size = launch_info.totalLoopSize();
    event = s->parallel_for(loop_size, [=](sycl::id<1> i) { kernel(i, args, func); });
  }
  launch_info._addSyclEvent(&event);
#else
  ARCANE_UNUSED(launch_info);
  ARCANE_UNUSED(kernel);
  ARCANE_UNUSED(func);
  ARCANE_UNUSED(args);
  ARCANE_FATAL_NO_SYCL_COMPILATION();
#endif
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#define ARCANE_MACRO_PARENS ()

// The following three macros make it possible to recursively generate a set
// of parameters. To support more parameters, add calls to the next macro in
// each of these macros.
// More info here: https://stackoverflow.com/questions/70238923/how-to-expand-a-recursive-macro-via-va-opt-in-a-nested-context
#define ARCANE_MACRO_EXPAND(...) ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(ARCANE_MACRO_EXPAND2(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND2(...) ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(ARCANE_MACRO_EXPAND1(__VA_ARGS__)))
#define ARCANE_MACRO_EXPAND1(...) __VA_ARGS__

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(a1, ...) \
  , decltype(a1)& a1 \
  __VA_OPT__(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN ARCANE_MACRO_PARENS(__VA_ARGS__))

#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN() ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER

/*
 * \brief Macro to generate the lambda arguments.
 *
 * This macro is internal to Arcane and must not be used outside of Arcane.
 *
 * For each argument \a arg, this macro generates a value `decltype(arg)& arg`.
 *
 * For example:
 * \code
 * ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(value1,value2)
 * // This generates the following code:
 * , decltype(value1)& value1, decltype(value2)& value2
 * \endcode
 */
#define ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(...) \
  __VA_OPT__(ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(__VA_ARGS__)))
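// Illustrative expansion walkthrough for two arguments:
//
//   ARCANE_RUNCOMMAND_REDUCER_FOR_EACH(r1, r2)
//   -> ARCANE_MACRO_EXPAND(ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_HELPER(r1, r2))
//   -> , decltype(r1)& r1 ARCANE_RUNCOMMAND_REDUCER_FOR_EACH_AGAIN () (r2)
//   -> , decltype(r1)& r1 , decltype(r2)& r2
//
// ARCANE_MACRO_EXPAND forces 3 x 3 rescans of its argument; each rescan lets
// __VA_OPT__ unfold one more recursion step, which bounds the number of
// supported arguments (add more ARCANE_MACRO_EXPANDx calls to raise it).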


/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif