Arcane  v4.1.1.0
Documentation utilisateur
Chargement...
Recherche...
Aucune correspondance
Reduce.h
Aller à la documentation de ce fichier.
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Reduce.h (C) 2000-2025 */
9/* */
10/* Gestion des réductions pour les accélérateurs. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCANE_ACCELERATOR_REDUCE_H
13#define ARCANE_ACCELERATOR_REDUCE_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "arcane/utils/ArrayView.h"
18#include "arcane/utils/String.h"
19
20#include "arcane/accelerator/core/IReduceMemoryImpl.h"
21#include "arcane/accelerator/AcceleratorGlobal.h"
22#include "arcane/accelerator/CommonUtils.h"
23#include "arcane/accelerator/RunCommandLaunchInfo.h"
25
26#include <limits.h>
27#include <float.h>
28#include <atomic>
29#include <iostream>
30
31/*---------------------------------------------------------------------------*/
32/*---------------------------------------------------------------------------*/
33
34namespace Arcane::Accelerator::Impl
35{
37}
38
39namespace Arcane::impl
40{
41class HostReducerHelper;
42}
43
44namespace Arcane::Accelerator::impl
45{
46class KernelReducerHelper;
47
48/*---------------------------------------------------------------------------*/
49/*---------------------------------------------------------------------------*/
50
51extern "C++" ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl*
52internalGetOrCreateReduceMemoryImpl(RunCommand* command);
53
54template <typename DataType>
56template <>
57// TODO: utiliser numeric_limits.
58class ReduceIdentity<double>
59{
60 public:
61
62 ARCCORE_HOST_DEVICE static constexpr double sumValue() { return 0.0; }
63 ARCCORE_HOST_DEVICE static constexpr double minValue() { return DBL_MAX; }
64 ARCCORE_HOST_DEVICE static constexpr double maxValue() { return -DBL_MAX; }
65};
66template <>
68{
69 public:
70
71 ARCCORE_HOST_DEVICE static constexpr Int32 sumValue() { return 0; }
72 ARCCORE_HOST_DEVICE static constexpr Int32 minValue() { return INT32_MAX; }
73 ARCCORE_HOST_DEVICE static constexpr Int32 maxValue() { return -INT32_MAX; }
74};
75template <>
77{
78 public:
79
80 ARCCORE_HOST_DEVICE static constexpr Int64 sumValue() { return 0; }
81 ARCCORE_HOST_DEVICE static constexpr Int64 minValue() { return INT64_MAX; }
82 ARCCORE_HOST_DEVICE static constexpr Int64 maxValue() { return -INT64_MAX; }
83};
84
85/*---------------------------------------------------------------------------*/
86/*---------------------------------------------------------------------------*/
87// L'implémentation utilisée est définie dans 'CommonCudaHipReduceImpl.h'
88
89/*---------------------------------------------------------------------------*/
90/*---------------------------------------------------------------------------*/
91/*!
92 * \internal
93 * \brief Informations pour effectuer une réduction sur un device.
94 */
95template <typename DataType>
97{
98 public:
99
100 //! Valeur du thread courant à réduire.
101 DataType m_current_value = {};
102 //! Valeur de l'identité pour la réduction
103 DataType m_identity = {};
104 //! Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
105 DataType* m_device_final_ptr = nullptr;
106 //! Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
107 void* m_host_final_ptr = nullptr;
108 //! Tableau avec une valeur par bloc pour la réduction
110 /*!
111 * Pointeur vers une zone mémoire contenant un entier pour indiquer
112 * combien il reste de blocs à réduire.
113 */
114 unsigned int* m_device_count = nullptr;
115
116 //! Taille d'un warp
118};
119
120/*---------------------------------------------------------------------------*/
121/*---------------------------------------------------------------------------*/
122
123template <typename DataType>
125
126template <>
127class ReduceAtomicSum<double>
128{
129 public:
130
131 static double apply(double* vptr, double v)
132 {
133 std::atomic_ref<double> aref(*vptr);
134 double old = aref.load(std::memory_order_consume);
135 double wanted = old + v;
136 while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
137 wanted = old + v;
138 return wanted;
139 }
140};
141template <>
143{
144 public:
145
146 static Int64 apply(Int64* vptr, Int64 v)
147 {
148 std::atomic_ref<Int64> aref(*vptr);
149 Int64 x = aref.fetch_add(v);
150 return x + v;
151 }
152};
153template <>
155{
156 public:
157
158 static Int32 apply(Int32* vptr, Int32 v)
159 {
160 std::atomic_ref<Int32> aref(*vptr);
161 Int32 x = aref.fetch_add(v);
162 return x + v;
163 }
164};
165
166/*---------------------------------------------------------------------------*/
167/*---------------------------------------------------------------------------*/
168
169template <typename DataType>
171{
172 public:
173
174 static ARCCORE_DEVICE DataType
175 applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
176 {
177 _applyDevice(dev_info);
178 return *(dev_info.m_device_final_ptr);
179 }
180 static DataType apply(DataType* vptr, DataType v)
181 {
182 return ReduceAtomicSum<DataType>::apply(vptr, v);
183 }
184#if defined(ARCANE_COMPILING_SYCL)
185 static sycl::plus<DataType> syclFunctor() { return {}; }
186#endif
187
188 public:
189
190 ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::sumValue(); }
191
192 private:
193
194 static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
195};
196
197/*---------------------------------------------------------------------------*/
198/*---------------------------------------------------------------------------*/
199
200template <typename DataType>
202{
203 public:
204
205 static ARCCORE_DEVICE DataType
206 applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
207 {
208 _applyDevice(dev_info);
209 return *(dev_info.m_device_final_ptr);
210 }
211 static DataType apply(DataType* ptr, DataType v)
212 {
213 std::atomic_ref<DataType> aref(*ptr);
214 DataType prev_value = aref.load();
215 while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
216 }
217 return aref.load();
218 }
219#if defined(ARCANE_COMPILING_SYCL)
220 static sycl::maximum<DataType> syclFunctor() { return {}; }
221#endif
222
223 public:
224
225 ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::maxValue(); }
226
227 private:
228
229 static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
230};
231
232/*---------------------------------------------------------------------------*/
233/*---------------------------------------------------------------------------*/
234
235template <typename DataType>
237{
238 public:
239
240 static ARCCORE_DEVICE DataType
241 applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
242 {
243 _applyDevice(dev_info);
244 return *(dev_info.m_device_final_ptr);
245 }
246 static DataType apply(DataType* vptr, DataType v)
247 {
248 std::atomic_ref<DataType> aref(*vptr);
249 DataType prev_value = aref.load();
250 while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
251 }
252 return aref.load();
253 }
254#if defined(ARCANE_COMPILING_SYCL)
255 static sycl::minimum<DataType> syclFunctor() { return {}; }
256#endif
257
258 public:
259
260 ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::minValue(); }
261
262 private:
263
264 static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
265};
266
267/*---------------------------------------------------------------------------*/
268/*---------------------------------------------------------------------------*/
269
270} // namespace Arcane::Accelerator::impl
271
272namespace Arcane::Accelerator
273{
274
275/*---------------------------------------------------------------------------*/
276/*---------------------------------------------------------------------------*/
277
278/*!
279 * \brief Opérateur de réduction
280 *
281 * Cette classe permet de gérer une réduction sur accélérateur ou en
282 * multi-thread.
283 *
284 * La réduction finale a lieu lors de l'appel à reduce(). Il ne faut donc
285 * faire cet appel qu'une seule fois et dans une partie collective. Cet appel
286 * n'est valide que sur les instance créées avec un constructeur vide. Ces dernières
287 * ne peuvent être créées que sur l'hôte.
288 *
289 * \warning Le constructeur de recopie ne doit pas être appelé explicitement.
290 * L'instance de départ doit rester valide tant qu'il existe des copies ou
291 * des références dans le noyau de calcul.
292 *
293 * NOTE sur l'implémentation
294 *
295 * Sur GPU, les réductions sont effectuées dans le destructeur de la classe
296 * La valeur 'm_host_or_device_memory_for_reduced_value' sert à conserver ces valeurs.
297 * Sur l'hôte, on utilise un 'std::atomic' pour conserver la valeur commune
298 * entre les threads. Cette valeur est référencée par 'm_parent_value' et n'est
299 * valide que sur l'hôte.
300 */
301template <typename DataType, typename ReduceFunctor>
302class HostDeviceReducerBase
303{
304 public:
305
306 HostDeviceReducerBase(RunCommand& command)
308 , m_command(&command)
309 {
310 //std::cout << String::format("Reduce main host this={0}\n",this); std::cout.flush();
311 m_is_master_instance = true;
312 m_identity = ReduceFunctor::identity();
313 m_local_value = m_identity;
314 m_atomic_value = m_identity;
315 m_atomic_parent_value = &m_atomic_value;
316 //printf("Create null host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);
317 m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
318 if (m_memory_impl) {
319 m_host_or_device_memory_for_reduced_value = impl::allocateReduceDataMemory<DataType>(m_memory_impl, m_identity);
320 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
321 }
322 }
323
324 // Le compilateur Intel considère que cette classe n'est pas 'is_trivially_copyable'
325 // sur le device si on n'utilise pas le constructeur de copie.
326#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
327 HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;
328#else
329 ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)
331 , m_local_value(rhs.m_local_value)
332 , m_identity(rhs.m_identity)
333 {
334#ifdef ARCCORE_DEVICE_CODE
335 m_grid_memory_info = rhs.m_grid_memory_info;
336 //int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;
337 //if (threadId==0)
338 //printf("Create ref device Id=%d parent=%p\n",threadId,&rhs);
339#else
340 m_memory_impl = rhs.m_memory_impl;
341 if (m_memory_impl) {
342 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
343 }
344 //std::cout << String::format("Reduce: host copy this={0} rhs={1} mem={2} device_count={3}\n",this,&rhs,m_memory_impl,(void*)m_grid_device_count);
345 m_atomic_parent_value = rhs.m_atomic_parent_value;
346 m_local_value = rhs.m_identity;
347 m_atomic_value = m_identity;
348 //std::cout << String::format("Reduce copy host this={0} parent_value={1} rhs={2}\n",this,(void*)m_parent_value,&rhs); std::cout.flush();
349 //if (!rhs.m_is_master_instance)
350 //ARCANE_FATAL("Only copy from master instance is allowed");
351 //printf("Create ref host parent_value=%p this=%p rhs=%p\n",(void*)m_parent_value,(void*)this,(void*)&rhs);
352#endif
353 }
354#endif
355
356 ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;
357 HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;
358
359 public:
360
361 ARCCORE_HOST_DEVICE void setValue(DataType v)
362 {
363 m_local_value = v;
364 }
365 ARCCORE_HOST_DEVICE DataType localValue() const
366 {
367 return m_local_value;
368 }
369
370 protected:
371
372 impl::IReduceMemoryImpl* m_memory_impl = nullptr;
373 /*!
374 * \brief Pointeur vers la donnée qui contiendra la valeur réduite.
375 *
376 * Sur accélérateur, cette donnée est allouée sur le device.
377 * Sur CPU, il s'agit de l'adresse de \a m_local_value pour l'instance parente.
378 */
380 impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;
381
382 private:
383
384 RunCommand* m_command = nullptr;
385
386 protected:
387
388 mutable DataType m_local_value;
389 DataType* m_atomic_parent_value = nullptr;
390 mutable DataType m_atomic_value;
391
392 private:
393
394 DataType m_identity;
395 //bool m_is_allocated = false;
396 bool m_is_master_instance = false;
397
398 protected:
399
400 //! Effectue la réduction et récupère la valeur. ATTENTION: ne faire qu'une seule fois.
401 DataType _reduce()
402 {
403 if (!m_is_master_instance)
404 ARCANE_FATAL("Final reduce operation is only valid on master instance");
405 // Si la réduction est faite sur accélérateur, il faut recopier la valeur du device sur l'hôte.
406 DataType* final_ptr = m_host_or_device_memory_for_reduced_value;
407 if (m_memory_impl) {
408 m_memory_impl->copyReduceValueFromDevice();
409 final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);
410 m_memory_impl->release();
411 m_memory_impl = nullptr;
412 }
413
414 if (m_atomic_parent_value) {
415 //std::cout << String::format("Reduce host has parent this={0} local_value={1} parent_value={2}\n",
416 // this,m_local_value,*m_parent_value);
417 //std::cout.flush();
418 ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
419 *final_ptr = *m_atomic_parent_value;
420 }
421 else {
422 //std::cout << String::format("Reduce host no parent this={0} local_value={1} managed={2}\n",
423 // this,m_local_value,*m_host_or_device_memory_for_reduced_value);
424 //std::cout.flush();
425 }
426 return *final_ptr;
427 }
428
429 // NOTE: Lorsqu'il n'y aura plus la version V1 de la réduction, cette méthode ne sera
430 // appelée que depuis le device.
431 ARCCORE_HOST_DEVICE void
432 _finalize()
433 {
434#ifdef ARCCORE_DEVICE_CODE
435 //int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;
436 //if ((threadId%16)==0)
437 //printf("Destroy device Id=%d\n",threadId);
438 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
439 DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
440 SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));
441
443 dvi.m_grid_buffer = grid_buffer;
444 dvi.m_device_count = m_grid_memory_info.m_grid_device_count;
446 dvi.m_host_final_ptr = m_grid_memory_info.m_host_memory_for_reduced_value;
447 dvi.m_current_value = m_local_value;
448 dvi.m_identity = m_identity;
449 dvi.m_warp_size = m_grid_memory_info.m_warp_size;
450 ReduceFunctor::applyDevice(dvi); //grid_buffer,m_grid_device_count,m_host_or_device_memory_for_reduced_value,m_local_value,m_identity);
451#else
452 // printf("Destroy host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);
453 // Code hôte
454 //std::cout << String::format("Reduce destructor this={0} parent_value={1} v={2} memory_impl={3}\n",this,(void*)m_parent_value,m_local_value,m_memory_impl);
455 //std::cout << String::format("Reduce destructor this={0} grid_data={1} grid_size={2}\n",
456 // this,(void*)m_grid_memory_value_as_bytes,m_grid_memory_size);
457 //std::cout.flush();
458 if (!m_is_master_instance)
459 ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
460
461 //printf("Destroy host %p %p\n",m_host_or_device_memory_for_reduced_value,this);
462#endif
463 }
464};
465
466/*---------------------------------------------------------------------------*/
467/*---------------------------------------------------------------------------*/
468/*!
469 * \brief Version 1 de la réduction.
470 *
471 * Cette version est obsolète. Elle utilise le destructeur de la classe
472 * pour effectuer la réduction.
473 */
474template <typename DataType, typename ReduceFunctor>
475class HostDeviceReducer
476: public HostDeviceReducerBase<DataType, ReduceFunctor>
477{
478 public:
479
480 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
481
482 public:
483
484 explicit HostDeviceReducer(RunCommand& command)
485 : BaseClass(command)
486 {}
487 HostDeviceReducer(const HostDeviceReducer& rhs) = default;
488 ARCCORE_HOST_DEVICE ~HostDeviceReducer()
489 {
490 this->_finalize();
491 }
492
493 public:
494
495 DataType reduce()
496 {
497 return this->_reduce();
498 }
499
500 DataType reducedValue()
501 {
502 return this->_reduce();
503 }
504};
505
506/*---------------------------------------------------------------------------*/
507/*---------------------------------------------------------------------------*/
508/*!
509 * \brief Version 2 de la réduction.
510 */
511template <typename DataType, typename ReduceFunctor>
512class HostDeviceReducer2
513: public HostDeviceReducerBase<DataType, ReduceFunctor>
514{
516
517 public:
518
519 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
520 using BaseClass::m_grid_memory_info;
522 using BaseClass::m_local_value;
523
524 using RemainingArgHandlerType = Impl::HostDeviceReducerKernelRemainingArg;
525
526 public:
527
528 explicit HostDeviceReducer2(RunCommand& command)
529 : BaseClass(command)
530 {}
531
532 public:
533
534 DataType reducedValue()
535 {
536 return this->_reduce();
537 }
538
539 private:
540
541
542#if defined(ARCANE_COMPILING_SYCL)
543 void _internalExecWorkItemAtEnd(sycl::nd_item<1> id)
544 {
545 unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
546 const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
547 const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
548 const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));
549
550 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
551 DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
552 SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));
553
554 DataType v = m_local_value;
555 bool is_last = false;
556 auto sycl_functor = ReduceFunctor::syclFunctor();
557 DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);
558 if (local_id == 0) {
559 grid_buffer[group_id] = local_sum;
560
561 // TODO: En théorie il faut faire l'équivalent d'un __threadfence() ici
562 // pour garantir que les autres work-item voient bien la mise à jour de 'grid_buffer'.
563 // Mais ce mécanisme n'existe pas avec SYCL 2020.
564
565 // AdaptiveCpp 2024.2 ne supporte pas les opérations atomiques sur 'unsigned int'.
566 // Elles sont supportées avec le type 'int'. Comme on est certain de ne pas dépasser 2^31, on
567 // converti le pointeur en un 'int*'.
568#if defined(__ADAPTIVECPP__)
569 int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);
570 sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
571#else
572 sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
573#endif
574 Int32 cx = a.fetch_add(1);
575 if (cx == (nb_block - 1))
576 is_last = true;
577 }
578
579 // Je suis le dernier à faire la réduction.
580 // Calcule la réduction finale
581 if (is_last) {
582 DataType my_total = grid_buffer[0];
583 for (int x = 1; x < nb_block; ++x)
584 my_total = sycl_functor(my_total, grid_buffer[x]);
585 // Met le résultat final dans le premier élément du tableau.
586 grid_buffer[0] = my_total;
588 *atomic_counter_ptr = 0;
589 }
590 }
591#endif
592};
593
594/*---------------------------------------------------------------------------*/
595/*---------------------------------------------------------------------------*/
596/*!
597 * \brief Implémentation de la réduction pour le backend SYCL.
598 *
599 * \warning Pour l'instant il n'y aucune implémentation. Cette classe permet
600 * juste la compilation.
601 */
602template <typename DataType, typename ReduceFunctor>
603class SyclReducer
604{
605 public:
606
607 explicit SyclReducer(RunCommand&) {}
608
609 public:
610
611 DataType reduce()
612 {
613 return m_local_value;
614 }
615 void setValue(DataType v) { m_local_value = v; }
616
617 protected:
618
619 mutable DataType m_local_value = {};
620};
621
622/*---------------------------------------------------------------------------*/
623/*---------------------------------------------------------------------------*/
624
625#if defined(ARCANE_COMPILING_SYCL)
626template <typename DataType, typename ReduceFunctor> using Reducer = SyclReducer<DataType, ReduceFunctor>;
627#else
628template <typename DataType, typename ReduceFunctor> using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
629#endif
630
631/*---------------------------------------------------------------------------*/
632/*---------------------------------------------------------------------------*/
633
634/*---------------------------------------------------------------------------*/
635/*---------------------------------------------------------------------------*/
636/*!
637 * \brief Classe pour effectuer une réduction 'somme'.
638 */
639template <typename DataType>
640class ReducerSum
641: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
642{
643 using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
644 using BaseClass::m_local_value;
645
646 public:
647
648 explicit ReducerSum(RunCommand& command)
649 : BaseClass(command)
650 {}
651
652 public:
653
654 ARCCORE_HOST_DEVICE DataType combine(DataType v) const
655 {
656 m_local_value += v;
657 return m_local_value;
658 }
659
660 ARCCORE_HOST_DEVICE DataType add(DataType v) const
661 {
662 return combine(v);
663 }
664};
665
666/*---------------------------------------------------------------------------*/
667/*---------------------------------------------------------------------------*/
668/*!
669 * \brief Classe pour effectuer une réduction 'max'.
670 */
671template <typename DataType>
672class ReducerMax
673: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
674{
675 using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
676 using BaseClass::m_local_value;
677
678 public:
679
680 explicit ReducerMax(RunCommand& command)
681 : BaseClass(command)
682 {}
683
684 public:
685
686 ARCCORE_HOST_DEVICE DataType combine(DataType v) const
687 {
688 m_local_value = v > m_local_value ? v : m_local_value;
689 return m_local_value;
690 }
691
692 ARCCORE_HOST_DEVICE DataType max(DataType v) const
693 {
694 return combine(v);
695 }
696};
697
698/*---------------------------------------------------------------------------*/
699/*---------------------------------------------------------------------------*/
700
701/*---------------------------------------------------------------------------*/
702/*---------------------------------------------------------------------------*/
703/*!
704 * \brief Classe pour effectuer une réduction 'min'.
705 */
706template <typename DataType>
707class ReducerMin
708: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
709{
710 using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
711 using BaseClass::m_local_value;
712
713 public:
714
715 explicit ReducerMin(RunCommand& command)
716 : BaseClass(command)
717 {}
718
719 public:
720
721 ARCCORE_HOST_DEVICE DataType combine(DataType v) const
722 {
723 m_local_value = v < m_local_value ? v : m_local_value;
724 return m_local_value;
725 }
726
727 ARCCORE_HOST_DEVICE DataType min(DataType v) const
728 {
729 return combine(v);
730 }
731};
732
733/*---------------------------------------------------------------------------*/
734/*---------------------------------------------------------------------------*/
735
736/*---------------------------------------------------------------------------*/
737/*---------------------------------------------------------------------------*/
738/*!
739 * \brief Classe pour effectuer une réduction 'somme'.
740 */
741template <typename DataType>
742class ReducerSum2
743: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
744{
745 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;
746
747 public:
748
749 explicit ReducerSum2(RunCommand& command)
750 : BaseClass(command)
751 {}
752
753 public:
754
755 ARCCORE_HOST_DEVICE void combine(DataType v)
756 {
757 this->m_local_value += v;
758 }
759};
760
761/*---------------------------------------------------------------------------*/
762/*---------------------------------------------------------------------------*/
763/*!
764 * \brief Classe pour effectuer une réduction 'max'.
765 */
766template <typename DataType>
767class ReducerMax2
768: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
769{
770 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;
771
772 public:
773
774 explicit ReducerMax2(RunCommand& command)
775 : BaseClass(command)
776 {}
777
778 public:
779
780 ARCCORE_HOST_DEVICE void combine(DataType v)
781 {
782 DataType& lv = this->m_local_value;
783 lv = v > lv ? v : lv;
784 }
785};
786
787/*---------------------------------------------------------------------------*/
788/*---------------------------------------------------------------------------*/
789/*!
790 * \brief Classe pour effectuer une réduction 'min'.
791 */
792template <typename DataType>
793class ReducerMin2
794: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
795{
796 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;
797
798 public:
799
800 explicit ReducerMin2(RunCommand& command)
801 : BaseClass(command)
802 {}
803
804 public:
805
806 ARCCORE_HOST_DEVICE void combine(DataType v)
807 {
808 DataType& lv = this->m_local_value;
809 lv = v < lv ? v : lv;
810 }
811};
812
813/*---------------------------------------------------------------------------*/
814/*---------------------------------------------------------------------------*/
815/*!
816 * \brief Classe pour gérer les arguments de type HostDeviceReducer2 en
817 * début et fin d'exécution des noyaux.
818 */
820{
821 public:
822
823 template <typename DataType, typename ReduceFunctor>
824 static void
825 execWorkItemAtBeginForHost(HostDeviceReducer2<DataType, ReduceFunctor>&)
826 {
827 }
828 template <typename DataType, typename ReduceFunctor>
829 static void
830 execWorkItemAtEndForHost(HostDeviceReducer2<DataType, ReduceFunctor>& reducer)
831 {
832 reducer._finalize();
833 }
834
835 template <typename DataType, typename ReduceFunctor>
836 static ARCCORE_DEVICE void
837 execWorkItemAtBeginForCudaHip(HostDeviceReducer2<DataType, ReduceFunctor>&, Int32)
838 {
839 }
840
841 template <typename DataType, typename ReduceFunctor>
842 static ARCCORE_DEVICE void
843 execWorkItemAtEndForCudaHip(HostDeviceReducer2<DataType, ReduceFunctor>& reducer, Int32)
844 {
845 reducer._finalize();
846 }
847
848#if defined(ARCANE_COMPILING_SYCL)
849 template <typename DataType, typename ReduceFunctor>
850 static void
851 execWorkItemAtBeginForSycl(HostDeviceReducer2<DataType, ReduceFunctor>&, sycl::nd_item<1>)
852 {
853 }
854 template <typename DataType, typename ReduceFunctor>
855 static void
856 execWorkItemAtEndForSycl(HostDeviceReducer2<DataType, ReduceFunctor>& reducer, sycl::nd_item<1> id)
857 {
858 reducer._internalExecWorkItemAtEnd(id);
859 }
860#endif
861};
862
863/*---------------------------------------------------------------------------*/
864/*---------------------------------------------------------------------------*/
865
866} // End namespace Arcane::Accelerator
867
868/*---------------------------------------------------------------------------*/
869/*---------------------------------------------------------------------------*/
870// Cette macro est définie si on souhaite rendre inline l'implémentation.
871// Dans l'idéal il ne faut pas que ce soit le cas (ce qui permettrait de
872// changer l'implémentation sans tout recompiler) mais cela ne semble pas
873// bien fonctionner pour l'instant.
874
875#define ARCANE_INLINE_REDUCE_IMPL
876
877#ifdef ARCANE_INLINE_REDUCE_IMPL
878
879# ifndef ARCANE_INLINE_REDUCE
880# define ARCANE_INLINE_REDUCE inline
881# endif
882
883#if defined(__CUDACC__) || defined(__HIP__)
884# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
885#else
886
887#endif
888
889#endif
890
891/*---------------------------------------------------------------------------*/
892/*---------------------------------------------------------------------------*/
893
894#endif
895
896/*---------------------------------------------------------------------------*/
897/*---------------------------------------------------------------------------*/
898
899#include "arcane/accelerator/GenericReducer.h"
900
901/*---------------------------------------------------------------------------*/
902/*---------------------------------------------------------------------------*/
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et macros pour gérer les boucles sur les accélérateurs.
Version 2 de la réduction.
Definition Reduce.h:514
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
Definition Reduce.h:379
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
Definition Reduce.h:379
DataType _reduce()
Effectue la réduction et récupère la valeur. ATTENTION: ne faire qu'une seule fois.
Definition Reduce.h:401
Version 1 de la réduction.
Definition Reduce.h:477
Classe pour gérer les arguments de type HostDeviceReducer2 en début et fin d'exécution des noyaux.
Definition Reduce.h:820
Implémentation de la réduction pour le backend SYCL.
Definition Reduce.h:604
DataType * m_device_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
Definition Reduce.h:105
void * m_host_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
Definition Reduce.h:107
DataType m_current_value
Valeur du thread courant à réduire.
Definition Reduce.h:101
SmallSpan< DataType > m_grid_buffer
Tableau avec une valeur par bloc pour la réduction.
Definition Reduce.h:109
DataType m_identity
Valeur de l'identité pour la réduction.
Definition Reduce.h:103
Vue d'un tableau d'éléments de type T.
Definition Span.h:801
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.