Arcane  v3.15.3.0
Documentation utilisateur
Chargement...
Recherche...
Aucune correspondance
Reduce.h
Aller à la documentation de ce fichier.
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Reduce.h (C) 2000-2025 */
9/* */
10/* Gestion des réductions pour les accélérateurs. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCANE_ACCELERATOR_REDUCE_H
13#define ARCANE_ACCELERATOR_REDUCE_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "arcane/utils/ArrayView.h"
18#include "arcane/utils/String.h"
19
20#include "arcane/accelerator/core/IReduceMemoryImpl.h"
21#include "arcane/accelerator/AcceleratorGlobal.h"
22#include "arcane/accelerator/CommonUtils.h"
23#include "arcane/accelerator/RunCommandLaunchInfo.h"
25
26#include <limits.h>
27#include <float.h>
28#include <atomic>
29#include <iostream>
30
31/*---------------------------------------------------------------------------*/
32/*---------------------------------------------------------------------------*/
33
34namespace Arcane::impl
35{
36class HostReducerHelper;
37}
38
39namespace Arcane::Accelerator::impl
40{
41class KernelReducerHelper;
42
43/*---------------------------------------------------------------------------*/
44/*---------------------------------------------------------------------------*/
45
46extern "C++" ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl*
47internalGetOrCreateReduceMemoryImpl(RunCommand* command);
48
49template <typename DataType>
51template <>
52// TODO: utiliser numeric_limits.
54{
55 public:
56
57 ARCCORE_HOST_DEVICE static constexpr double sumValue() { return 0.0; }
58 ARCCORE_HOST_DEVICE static constexpr double minValue() { return DBL_MAX; }
59 ARCCORE_HOST_DEVICE static constexpr double maxValue() { return -DBL_MAX; }
60};
61template <>
62class ReduceIdentity<Int32>
63{
64 public:
65
66 ARCCORE_HOST_DEVICE static constexpr Int32 sumValue() { return 0; }
67 ARCCORE_HOST_DEVICE static constexpr Int32 minValue() { return INT32_MAX; }
68 ARCCORE_HOST_DEVICE static constexpr Int32 maxValue() { return -INT32_MAX; }
69};
70template <>
71class ReduceIdentity<Int64>
72{
73 public:
74
75 ARCCORE_HOST_DEVICE static constexpr Int64 sumValue() { return 0; }
76 ARCCORE_HOST_DEVICE static constexpr Int64 minValue() { return INT64_MAX; }
77 ARCCORE_HOST_DEVICE static constexpr Int64 maxValue() { return -INT64_MAX; }
78};
79
80/*---------------------------------------------------------------------------*/
81/*---------------------------------------------------------------------------*/
82// L'implémentation utilisée est définie dans 'CommonCudaHipReduceImpl.h'
83
84/*---------------------------------------------------------------------------*/
85/*---------------------------------------------------------------------------*/
86/*!
87 * \internal
88 * \brief Informations pour effectuer une réduction sur un device.
89 */
90template <typename DataType>
92{
93 public:
94
95 //! Valeur du thread courant à réduire.
97 //! Valeur de l'identité pour la réduction
98 DataType m_identity;
99 //! Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
100 DataType* m_device_final_ptr = nullptr;
101 //! Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
102 void* m_host_final_ptr = nullptr;
103 //! Tableau avec une valeur par bloc pour la réduction
105 /*!
106 * Pointeur vers une zone mémoire contenant un entier pour indiquer
107 * combien il reste de blocs à réduire.
108 */
109 unsigned int* m_device_count = nullptr;
110
111 //! Indique si on utilise la réduction par grille (sinon on utilise les atomiques)
112 bool m_use_grid_reduce = true;
113};
114
115/*---------------------------------------------------------------------------*/
116/*---------------------------------------------------------------------------*/
117
118template <typename DataType>
120
121template <>
123{
124 public:
125
126 static double apply(double* vptr, double v)
127 {
128 std::atomic_ref<double> aref(*vptr);
129 double old = aref.load(std::memory_order_consume);
130 double wanted = old + v;
131 while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
132 wanted = old + v;
133 return wanted;
134 }
135};
136template <>
137class ReduceAtomicSum<Int64>
138{
139 public:
140
141 static Int64 apply(Int64* vptr, Int64 v)
142 {
143 std::atomic_ref<Int64> aref(*vptr);
144 Int64 x = aref.fetch_add(v);
145 return x + v;
146 }
147};
148template <>
149class ReduceAtomicSum<Int32>
150{
151 public:
152
153 static Int32 apply(Int32* vptr, Int32 v)
154 {
155 std::atomic_ref<Int32> aref(*vptr);
156 Int32 x = aref.fetch_add(v);
157 return x + v;
158 }
159};
160
161/*---------------------------------------------------------------------------*/
162/*---------------------------------------------------------------------------*/
163
164template <typename DataType>
166{
167 public:
168
169 static ARCCORE_DEVICE DataType
170 applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
171 {
172 _applyDevice(dev_info);
173 return *(dev_info.m_device_final_ptr);
174 }
175 static DataType apply(DataType* vptr, DataType v)
176 {
178 }
179#if defined(ARCANE_COMPILING_SYCL)
180 static sycl::plus<DataType> syclFunctor() { return {}; }
181#endif
182
183 public:
184
185 ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::sumValue(); }
186
187 private:
188
189 static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
190};
191
192/*---------------------------------------------------------------------------*/
193/*---------------------------------------------------------------------------*/
194
195template <typename DataType>
197{
198 public:
199
200 static ARCCORE_DEVICE DataType
201 applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
202 {
203 _applyDevice(dev_info);
204 return *(dev_info.m_device_final_ptr);
205 }
206 static DataType apply(DataType* ptr, DataType v)
207 {
208 std::atomic_ref<DataType> aref(*ptr);
209 DataType prev_value = aref.load();
210 while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
211 }
212 return aref.load();
213 }
214#if defined(ARCANE_COMPILING_SYCL)
215 static sycl::maximum<DataType> syclFunctor() { return {}; }
216#endif
217
218 public:
219
220 ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::maxValue(); }
221
222 private:
223
224 static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
225};
226
227/*---------------------------------------------------------------------------*/
228/*---------------------------------------------------------------------------*/
229
230template <typename DataType>
232{
233 public:
234
235 static ARCCORE_DEVICE DataType
236 applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
237 {
238 _applyDevice(dev_info);
239 return *(dev_info.m_device_final_ptr);
240 }
241 static DataType apply(DataType* vptr, DataType v)
242 {
243 std::atomic_ref<DataType> aref(*vptr);
244 DataType prev_value = aref.load();
245 while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
246 }
247 return aref.load();
248 }
249#if defined(ARCANE_COMPILING_SYCL)
250 static sycl::minimum<DataType> syclFunctor() { return {}; }
251#endif
252
253 public:
254
255 ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::minValue(); }
256
257 private:
258
259 static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
260};
261
262/*---------------------------------------------------------------------------*/
263/*---------------------------------------------------------------------------*/
264
265} // namespace Arcane::Accelerator::impl
266
267namespace Arcane::Accelerator
268{
269
270/*---------------------------------------------------------------------------*/
271/*---------------------------------------------------------------------------*/
272
273/*!
274 * \brief Opérateur de réduction
275 *
276 * Cette classe permet de gérer une réduction sur accélérateur ou en
277 * multi-thread.
278 *
279 * La réduction finale a lieu lors de l'appel à reduce(). Il ne faut donc
280 * faire cet appel qu'une seule fois et dans une partie collective. Cet appel
281 * n'est valide que sur les instance créées avec un constructeur vide. Ces dernières
282 * ne peuvent être créées que sur l'hôte.
283 *
284 * \warning Le constructeur de recopie ne doit pas être appelé explicitement.
285 * L'instance de départ doit rester valide tant qu'il existe des copies ou
286 * des références dans le noyau de calcul.
287 *
288 * NOTE sur l'implémentation
289 *
290 * Sur GPU, les réductions sont effectuées dans le destructeur de la classe
291 * La valeur 'm_host_or_device_memory_for_reduced_value' sert à conserver ces valeurs.
292 * Sur l'hôte, on utilise un 'std::atomic' pour conserver la valeur commune
293 * entre les threads. Cette valeur est référencée par 'm_parent_value' et n'est
294 * valide que sur l'hôte.
295 */
296template <typename DataType, typename ReduceFunctor>
297class HostDeviceReducerBase
298{
299 public:
300
301 HostDeviceReducerBase(RunCommand& command)
302 : m_host_or_device_memory_for_reduced_value(&m_local_value)
303 , m_command(&command)
304 {
305 //std::cout << String::format("Reduce main host this={0}\n",this); std::cout.flush();
306 m_is_master_instance = true;
307 m_identity = ReduceFunctor::identity();
308 m_local_value = m_identity;
309 m_atomic_value = m_identity;
310 m_atomic_parent_value = &m_atomic_value;
311 //printf("Create null host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);
312 m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
313 if (m_memory_impl) {
314 m_host_or_device_memory_for_reduced_value = impl::allocateReduceDataMemory<DataType>(m_memory_impl, m_identity);
315 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
316 }
317 }
318
319#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
320 HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;
321#else
322 ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)
323 : m_host_or_device_memory_for_reduced_value(rhs.m_host_or_device_memory_for_reduced_value)
324 , m_local_value(rhs.m_local_value)
325 , m_identity(rhs.m_identity)
326 {
327#ifdef ARCCORE_DEVICE_CODE
328 m_grid_memory_info = rhs.m_grid_memory_info;
329 //int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;
330 //if (threadId==0)
331 //printf("Create ref device Id=%d parent=%p\n",threadId,&rhs);
332#else
333 m_memory_impl = rhs.m_memory_impl;
334 if (m_memory_impl) {
335 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
336 }
337 //std::cout << String::format("Reduce: host copy this={0} rhs={1} mem={2} device_count={3}\n",this,&rhs,m_memory_impl,(void*)m_grid_device_count);
338 m_atomic_parent_value = rhs.m_atomic_parent_value;
339 m_local_value = rhs.m_identity;
340 m_atomic_value = m_identity;
341 //std::cout << String::format("Reduce copy host this={0} parent_value={1} rhs={2}\n",this,(void*)m_parent_value,&rhs); std::cout.flush();
342 //if (!rhs.m_is_master_instance)
343 //ARCANE_FATAL("Only copy from master instance is allowed");
344 //printf("Create ref host parent_value=%p this=%p rhs=%p\n",(void*)m_parent_value,(void*)this,(void*)&rhs);
345#endif
346 }
347#endif
348
349 ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;
350 HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;
351
352 public:
353
354 ARCCORE_HOST_DEVICE void setValue(DataType v)
355 {
356 m_local_value = v;
357 }
358 ARCCORE_HOST_DEVICE DataType localValue() const
359 {
360 return m_local_value;
361 }
362
363 protected:
364
365 impl::IReduceMemoryImpl* m_memory_impl = nullptr;
366 /*!
367 * \brief Pointeur vers la donnée qui contiendra la valeur réduite.
368 *
369 * Sur accélérateur, cette donnée est allouée sur le device.
370 * Sur CPU, il s'agit de l'adresse de \a m_local_value pour l'instance parente.
371 */
372 DataType* m_host_or_device_memory_for_reduced_value = nullptr;
373 impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;
374
375 private:
376
377 RunCommand* m_command = nullptr;
378
379 protected:
380
381 mutable DataType m_local_value;
382 DataType* m_atomic_parent_value = nullptr;
383 mutable DataType m_atomic_value;
384
385 private:
386
387 DataType m_identity;
388 //bool m_is_allocated = false;
389 bool m_is_master_instance = false;
390
391 protected:
392
393 //! Effectue la réduction et récupère la valeur. ATTENTION: ne faire qu'une seule fois.
394 DataType _reduce()
395 {
396 if (!m_is_master_instance)
397 ARCANE_FATAL("Final reduce operation is only valid on master instance");
398 // Si la réduction est faite sur accélérateur, il faut recopier la valeur du device sur l'hôte.
399 DataType* final_ptr = m_host_or_device_memory_for_reduced_value;
400 if (m_memory_impl) {
401 m_memory_impl->copyReduceValueFromDevice();
402 final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);
403 m_memory_impl->release();
404 m_memory_impl = nullptr;
405 }
406
407 if (m_atomic_parent_value) {
408 //std::cout << String::format("Reduce host has parent this={0} local_value={1} parent_value={2}\n",
409 // this,m_local_value,*m_parent_value);
410 //std::cout.flush();
411 ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
412 *final_ptr = *m_atomic_parent_value;
413 }
414 else {
415 //std::cout << String::format("Reduce host no parent this={0} local_value={1} managed={2}\n",
416 // this,m_local_value,*m_host_or_device_memory_for_reduced_value);
417 //std::cout.flush();
418 }
419 return *final_ptr;
420 }
421
422 // NOTE: Lorsqu'il n'y aura plus la version V1 de la réduction, cette méthode ne sera
423 // appelée que depuis le device.
424 ARCCORE_HOST_DEVICE void
425 _finalize()
426 {
427#ifdef ARCCORE_DEVICE_CODE
428 //int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;
429 //if ((threadId%16)==0)
430 //printf("Destroy device Id=%d\n",threadId);
431 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
432 DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
433 SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));
434
435 impl::ReduceDeviceInfo<DataType> dvi;
436 dvi.m_grid_buffer = grid_buffer;
437 dvi.m_device_count = m_grid_memory_info.m_grid_device_count;
438 dvi.m_device_final_ptr = m_host_or_device_memory_for_reduced_value;
439 dvi.m_host_final_ptr = m_grid_memory_info.m_host_memory_for_reduced_value;
440 dvi.m_current_value = m_local_value;
441 dvi.m_identity = m_identity;
442 dvi.m_use_grid_reduce = m_grid_memory_info.m_reduce_policy != eDeviceReducePolicy::Atomic;
443 ReduceFunctor::applyDevice(dvi); //grid_buffer,m_grid_device_count,m_host_or_device_memory_for_reduced_value,m_local_value,m_identity);
444#else
445 // printf("Destroy host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);
446 // Code hôte
447 //std::cout << String::format("Reduce destructor this={0} parent_value={1} v={2} memory_impl={3}\n",this,(void*)m_parent_value,m_local_value,m_memory_impl);
448 //std::cout << String::format("Reduce destructor this={0} grid_data={1} grid_size={2}\n",
449 // this,(void*)m_grid_memory_value_as_bytes,m_grid_memory_size);
450 //std::cout.flush();
451 if (!m_is_master_instance)
452 ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
453
454 //printf("Destroy host %p %p\n",m_host_or_device_memory_for_reduced_value,this);
455#endif
456 }
457};
458
459/*---------------------------------------------------------------------------*/
460/*---------------------------------------------------------------------------*/
461/*!
462 * \brief Version 1 de la réduction.
463 *
464 * Cette version est obsolète. Elle utilise le destructeur de la classe
465 * pour effectuer la réduction.
466 */
467template <typename DataType, typename ReduceFunctor>
468class HostDeviceReducer
469: public HostDeviceReducerBase<DataType, ReduceFunctor>
470{
471 public:
472
473 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
474
475 public:
476
477 explicit HostDeviceReducer(RunCommand& command)
478 : BaseClass(command)
479 {}
480 HostDeviceReducer(const HostDeviceReducer& rhs) = default;
481 ARCCORE_HOST_DEVICE ~HostDeviceReducer()
482 {
483 this->_finalize();
484 }
485
486 public:
487
488 DataType reduce()
489 {
490 return this->_reduce();
491 }
492
493 DataType reducedValue()
494 {
495 return this->_reduce();
496 }
497};
498
499/*---------------------------------------------------------------------------*/
500/*---------------------------------------------------------------------------*/
501/*!
502 * \brief Version 2 de la réduction.
503 */
504template <typename DataType, typename ReduceFunctor>
505class HostDeviceReducer2
506: public HostDeviceReducerBase<DataType, ReduceFunctor>
507{
508 friend impl::KernelRemainingArgsHelper;
509 friend ::Arcane::impl::HostReducerHelper;
510
511 public:
512
513 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
514 using BaseClass::m_grid_memory_info;
515 using BaseClass::m_host_or_device_memory_for_reduced_value;
516 using BaseClass::m_local_value;
517
518 public:
519
520 explicit HostDeviceReducer2(RunCommand& command)
521 : BaseClass(command)
522 {}
523
524 public:
525
526 DataType reducedValue()
527 {
528 return this->_reduce();
529 }
530
531 private:
532
533 // Note: les méthodes _internalReduce...() sont
534 // internes à Arcane.
535
536 void _internalReduceHost()
537 {
538 this->_finalize();
539 }
540
541#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
542 ARCCORE_HOST_DEVICE void _internalExecWorkItem(Int32)
543 {
544 this->_finalize();
545 };
546#endif
547
548#if defined(ARCANE_COMPILING_SYCL)
549 void _internalExecWorkItem(sycl::nd_item<1> id)
550 {
551 unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
552 const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
553 const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
554 const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));
555
556 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
557 DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
558 SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));
559
560 DataType v = m_local_value;
561 bool is_last = false;
562 auto sycl_functor = ReduceFunctor::syclFunctor();
563 DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);
564 if (local_id == 0) {
565 grid_buffer[group_id] = local_sum;
566
567 // TODO: En théorie il faut faire l'équivalent d'un __threadfence() ici
568 // pour garantir que les autres work-item voient bien la mise à jour de 'grid_buffer'.
569 // Mais ce mécanisme n'existe pas avec SYCL 2020.
570
571 // AdaptiveCpp 2024.2 ne supporte pas les opérations atomiques sur 'unsigned int'.
572 // Elles sont supportées avec le type 'int'. Comme on est certain de ne pas dépasser 2^31, on
573 // converti le pointeur en un 'int*'.
574#if defined(__ADAPTIVECPP__)
575 int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);
576 sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
577#else
578 sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
579#endif
580 Int32 cx = a.fetch_add(1);
581 if (cx == (nb_block - 1))
582 is_last = true;
583 }
584
585 // Je suis le dernier à faire la réduction.
586 // Calcule la réduction finale
587 if (is_last) {
588 DataType my_total = grid_buffer[0];
589 for (int x = 1; x < nb_block; ++x)
590 my_total = sycl_functor(my_total, grid_buffer[x]);
591 // Met le résultat final dans le premier élément du tableau.
592 grid_buffer[0] = my_total;
593 *m_host_or_device_memory_for_reduced_value = my_total;
594 *atomic_counter_ptr = 0;
595 }
596 }
597#endif
598};
599
600/*---------------------------------------------------------------------------*/
601/*---------------------------------------------------------------------------*/
602/*!
603 * \brief Implémentation de la réduction pour le backend SYCL.
604 *
605 * \warning Pour l'instant il n'y aucune implémentation. Cette classe permet
606 * juste la compilation.
607 */
608template <typename DataType, typename ReduceFunctor>
609class SyclReducer
610{
611 public:
612
613 explicit SyclReducer(RunCommand&) {}
614
615 public:
616
617 DataType reduce()
618 {
619 return m_local_value;
620 }
621 void setValue(DataType v) { m_local_value = v; }
622
623 protected:
624
625 mutable DataType m_local_value = {};
626};
627
628/*---------------------------------------------------------------------------*/
629/*---------------------------------------------------------------------------*/
630
631#if defined(ARCANE_COMPILING_SYCL)
632template <typename DataType, typename ReduceFunctor> using Reducer = SyclReducer<DataType, ReduceFunctor>;
633#else
634template <typename DataType, typename ReduceFunctor> using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
635#endif
636
637/*---------------------------------------------------------------------------*/
638/*---------------------------------------------------------------------------*/
639
640/*---------------------------------------------------------------------------*/
641/*---------------------------------------------------------------------------*/
642/*!
643 * \brief Classe pour effectuer une réduction 'somme'.
644 */
645template <typename DataType>
646class ReducerSum
647: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
648{
649 using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
650 using BaseClass::m_local_value;
651
652 public:
653
654 explicit ReducerSum(RunCommand& command)
655 : BaseClass(command)
656 {}
657
658 public:
659
660 ARCCORE_HOST_DEVICE DataType combine(DataType v) const
661 {
662 m_local_value += v;
663 return m_local_value;
664 }
665
666 ARCCORE_HOST_DEVICE DataType add(DataType v) const
667 {
668 return combine(v);
669 }
670};
671
672/*---------------------------------------------------------------------------*/
673/*---------------------------------------------------------------------------*/
674/*!
675 * \brief Classe pour effectuer une réduction 'max'.
676 */
677template <typename DataType>
678class ReducerMax
679: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
680{
681 using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
682 using BaseClass::m_local_value;
683
684 public:
685
686 explicit ReducerMax(RunCommand& command)
687 : BaseClass(command)
688 {}
689
690 public:
691
692 ARCCORE_HOST_DEVICE DataType combine(DataType v) const
693 {
694 m_local_value = v > m_local_value ? v : m_local_value;
695 return m_local_value;
696 }
697
698 ARCCORE_HOST_DEVICE DataType max(DataType v) const
699 {
700 return combine(v);
701 }
702};
703
704/*---------------------------------------------------------------------------*/
705/*---------------------------------------------------------------------------*/
706
707/*---------------------------------------------------------------------------*/
708/*---------------------------------------------------------------------------*/
709/*!
710 * \brief Classe pour effectuer une réduction 'min'.
711 */
712template <typename DataType>
713class ReducerMin
714: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
715{
716 using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
717 using BaseClass::m_local_value;
718
719 public:
720
721 explicit ReducerMin(RunCommand& command)
722 : BaseClass(command)
723 {}
724
725 public:
726
727 ARCCORE_HOST_DEVICE DataType combine(DataType v) const
728 {
729 m_local_value = v < m_local_value ? v : m_local_value;
730 return m_local_value;
731 }
732
733 ARCCORE_HOST_DEVICE DataType min(DataType v) const
734 {
735 return combine(v);
736 }
737};
738
739/*---------------------------------------------------------------------------*/
740/*---------------------------------------------------------------------------*/
741
742/*---------------------------------------------------------------------------*/
743/*---------------------------------------------------------------------------*/
744/*!
745 * \brief Classe pour effectuer une réduction 'somme'.
746 */
747template <typename DataType>
748class ReducerSum2
749: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
750{
751 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;
752
753 public:
754
755 explicit ReducerSum2(RunCommand& command)
756 : BaseClass(command)
757 {}
758
759 public:
760
761 ARCCORE_HOST_DEVICE void combine(DataType v)
762 {
763 this->m_local_value += v;
764 }
765};
766
767/*---------------------------------------------------------------------------*/
768/*---------------------------------------------------------------------------*/
769/*!
770 * \brief Classe pour effectuer une réduction 'max'.
771 */
772template <typename DataType>
773class ReducerMax2
774: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
775{
776 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;
777
778 public:
779
780 explicit ReducerMax2(RunCommand& command)
781 : BaseClass(command)
782 {}
783
784 public:
785
786 ARCCORE_HOST_DEVICE void combine(DataType v)
787 {
788 DataType& lv = this->m_local_value;
789 lv = v > lv ? v : lv;
790 }
791};
792
793/*---------------------------------------------------------------------------*/
794/*---------------------------------------------------------------------------*/
795/*!
796 * \brief Classe pour effectuer une réduction 'min'.
797 */
798template <typename DataType>
799class ReducerMin2
800: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
801{
802 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;
803
804 public:
805
806 explicit ReducerMin2(RunCommand& command)
807 : BaseClass(command)
808 {}
809
810 public:
811
812 ARCCORE_HOST_DEVICE void combine(DataType v)
813 {
814 DataType& lv = this->m_local_value;
815 lv = v < lv ? v : lv;
816 }
817};
818
819/*---------------------------------------------------------------------------*/
820/*---------------------------------------------------------------------------*/
821
822} // End namespace Arcane::Accelerator
823
824/*---------------------------------------------------------------------------*/
825/*---------------------------------------------------------------------------*/
826// Cette macro est définie si on souhaite rendre inline l'implémentation.
827// Dans l'idéal il ne faut pas que ce soit le cas (ce qui permettrait de
828// changer l'implémentation sans tout recompiler) mais cela ne semble pas
829// bien fonctionner pour l'instant.
830
831#define ARCANE_INLINE_REDUCE_IMPL
832
833#ifdef ARCANE_INLINE_REDUCE_IMPL
834
835# ifndef ARCANE_INLINE_REDUCE
836# define ARCANE_INLINE_REDUCE inline
837# endif
838
839#if defined(__CUDACC__) || defined(__HIP__)
840# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
841#else
842
843#endif
844
845#endif
846
847/*---------------------------------------------------------------------------*/
848/*---------------------------------------------------------------------------*/
849
850#endif
851
852/*---------------------------------------------------------------------------*/
853/*---------------------------------------------------------------------------*/
854
855#include "arcane/accelerator/GenericReducer.h"
856
857/*---------------------------------------------------------------------------*/
858/*---------------------------------------------------------------------------*/
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et macros pour gérer les boucles sur les accélérateurs.
Gestion d'une commande sur accélérateur.
DataType * m_device_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
Definition Reduce.h:100
bool m_use_grid_reduce
Indique si on utilise la réduction par grille (sinon on utilise les atomiques)
Definition Reduce.h:112
void * m_host_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
Definition Reduce.h:102
DataType m_current_value
Valeur du thread courant à réduire.
Definition Reduce.h:96
SmallSpan< DataType > m_grid_buffer
Tableau avec une valeur par bloc pour la réduction.
Definition Reduce.h:104
DataType m_identity
Valeur de l'identité pour la réduction.
Definition Reduce.h:98
Référence à une instance.
__host__ __device__ Real2 min(Real2 a, Real2 b)
Retourne le minimum de deux Real2.
Definition MathUtils.h:336
T max(const T &a, const T &b, const T &c)
Retourne le maximum de trois éléments.
Definition MathUtils.h:392
Espace de nom pour l'utilisation des accélérateurs.
@ Atomic
Utilise des opérations atomiques entre les blocs.
void add(ArrayView< T > lhs, ConstArrayView< T > copy_array)
Ajoute le tableau copy_array dans l'instance.
Definition MathUtils.h:885