#ifndef ARCANE_ACCELERATOR_REDUCE_H
#define ARCANE_ACCELERATOR_REDUCE_H

#include "arcane/utils/ArrayView.h"
#include "arcane/utils/String.h"

#include "arcane/accelerator/core/IReduceMemoryImpl.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/CommonUtils.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"

#include <atomic>
#include <limits>
namespace Arcane::impl
{
class HostReducerHelper;
}
namespace Arcane::Accelerator::impl
{
class KernelReducerHelper;
class KernelRemainingArgsHelper;

extern "C++" ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl*
internalGetOrCreateReduceMemoryImpl(RunCommand* command);
//! Device-side information needed to apply a reduction.
template <typename DataType>
class ReduceDeviceInfo
{
 public:

  //! Pointer to the reduced value (memory accessible only from the device).
  DataType* m_device_final_ptr = nullptr;

  //! Counter used to detect the last block taking part in the reduction.
  unsigned int* m_device_count = nullptr;

  //! Whether the grid-based reduction is used (otherwise atomics are used).
  bool m_use_grid_reduce = true;

  //! Pointer to the reduced value (memory accessible only from the host).
  void* m_host_final_ptr = nullptr;

  //! Value of the current thread to reduce.
  DataType m_current_value = {};

  //! Buffer with one value per block for the reduction.
  SmallSpan<DataType> m_grid_buffer;

  //! Identity value for the reduction.
  DataType m_identity = {};
};
//! Atomic addition into *vptr; apply() returns the updated value.
template <typename DataType>
class ReduceAtomicSum;

template <>
class ReduceAtomicSum<double>
{
 public:

  static double apply(double* vptr, double v)
  {
    std::atomic_ref<double> aref(*vptr);
    double old = aref.load(std::memory_order_consume);
    double wanted = old + v;
    // Classic compare-exchange loop: on failure 'old' is reloaded, so the
    // sum is recomputed until no other thread modified *vptr in between.
    while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
      wanted = old + v;
    return wanted;
  }
};

template <>
class ReduceAtomicSum<Int64>
{
 public:

  static Int64 apply(Int64* vptr, Int64 v)
  {
    std::atomic_ref<Int64> aref(*vptr);
    Int64 x = aref.fetch_add(v);
    return x + v;
  }
};

template <>
class ReduceAtomicSum<Int32>
{
 public:

  static Int32 apply(Int32* vptr, Int32 v)
  {
    std::atomic_ref<Int32> aref(*vptr);
    Int32 x = aref.fetch_add(v);
    return x + v;
  }
};
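/*!
 * Usage sketch (not part of the original file): apply() lets several host
 * threads accumulate into the same location without a mutex. Assumes C++20
 * std::atomic_ref and the <thread> and <vector> headers.
 *
 * \code
 * double total = 0.0;
 * std::vector<std::thread> pool;
 * for (int t = 0; t < 4; ++t)
 *   pool.emplace_back([&total] { ReduceAtomicSum<double>::apply(&total, 1.5); });
 * for (auto& th : pool)
 *   th.join();
 * // total == 6.0
 * \endcode
 */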
//! Functor performing a sum reduction.
template <typename DataType>
class ReduceFunctorSum
{
 public:

  static ARCCORE_DEVICE DataType applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    return ReduceAtomicSum<DataType>::apply(vptr, v);
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::plus<DataType> syclFunctor() { return {}; }
#endif
  //! Identity element of the sum.
  static constexpr DataType identity() { return DataType(0); }

 private:

  // Implemented in CommonCudaHipReduceImpl.h.
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
//! Functor performing a max reduction.
template <typename DataType>
class ReduceFunctorMax
{
 public:

  static ARCCORE_DEVICE DataType applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* ptr, DataType v)
  {
    std::atomic_ref<DataType> aref(*ptr);
    DataType old = aref.load(std::memory_order_consume);
    // Retry until *ptr holds a value that is not smaller than v.
    while (old < v && !aref.compare_exchange_weak(old, v, std::memory_order_release, std::memory_order_consume))
      ;
    return (old < v) ? v : old;
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::maximum<DataType> syclFunctor() { return {}; }
#endif
  //! Identity element of the max.
  static constexpr DataType identity() { return std::numeric_limits<DataType>::lowest(); }

 private:

  // Implemented in CommonCudaHipReduceImpl.h.
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
//! Functor performing a min reduction.
template <typename DataType>
class ReduceFunctorMin
{
 public:

  static ARCCORE_DEVICE DataType applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    std::atomic_ref<DataType> aref(*vptr);
    DataType old = aref.load(std::memory_order_consume);
    // Retry until *vptr holds a value that is not greater than v.
    while (v < old && !aref.compare_exchange_weak(old, v, std::memory_order_release, std::memory_order_consume))
      ;
    return (v < old) ? v : old;
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::minimum<DataType> syclFunctor() { return {}; }
#endif
  //! Identity element of the min.
  static constexpr DataType identity() { return std::numeric_limits<DataType>::max(); }

 private:

  // Implemented in CommonCudaHipReduceImpl.h.
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
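/*!
 * The three functors above share the static interface expected by the
 * reducers below: identity(), apply() for the host/atomic path, applyDevice()
 * for the device grid path and syclFunctor() for SYCL. As a sketch (not part
 * of Arcane), a host-only product functor would look like this:
 *
 * \code
 * template <typename DataType>
 * class ReduceFunctorProd
 * {
 *  public:
 *   static constexpr DataType identity() { return DataType(1); }
 *   static DataType apply(DataType* vptr, DataType v)
 *   {
 *     std::atomic_ref<DataType> aref(*vptr);
 *     DataType old = aref.load(std::memory_order_consume);
 *     DataType wanted = old * v;
 *     while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
 *       wanted = old * v;
 *     return wanted;
 *   }
 *   // applyDevice() and syclFunctor() would also be needed for device execution.
 * };
 * \endcode
 */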
} // namespace Arcane::Accelerator::impl

namespace Arcane::Accelerator
{

/*!
 * \brief Base class for reductions usable on both host and device.
 *
 * The instance built from a RunCommand is the "master" instance; the copies
 * made when it is captured by a kernel lambda reduce into it.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducerBase
{
 public:

  explicit HostDeviceReducerBase(RunCommand& command)
  : m_host_or_device_memory_for_reduced_value(&m_local_value)
  , m_command(&command)
  {
    m_is_master_instance = true;
    m_identity = ReduceFunctor::identity();
    m_local_value = m_identity;
    m_atomic_value = m_identity;
    m_atomic_parent_value = &m_atomic_value;

    m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
    if (m_memory_impl) {
      m_host_or_device_memory_for_reduced_value = impl::allocateReduceDataMemory<DataType>(m_memory_impl, m_identity);
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
    }
  }
#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
  // With the DPC++ device compiler, only the trivial copy constructor is usable.
  HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;
#else
  ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)
  : m_host_or_device_memory_for_reduced_value(rhs.m_host_or_device_memory_for_reduced_value)
  , m_local_value(rhs.m_local_value)
  , m_identity(rhs.m_identity)
  {
#ifdef ARCCORE_DEVICE_CODE
    m_grid_memory_info = rhs.m_grid_memory_info;
#else
    m_memory_impl = rhs.m_memory_impl;
    if (m_memory_impl)
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
    // On the host a copy is a slave instance: it starts from the identity
    // value and reduces into its parent.
    m_atomic_parent_value = rhs.m_atomic_parent_value;
    m_local_value = rhs.m_identity;
    m_atomic_value = m_identity;
#endif
  }
#endif
  ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;
  HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;
 public:

  ARCCORE_HOST_DEVICE void setValue(DataType v)
  {
    m_local_value = v;
  }
  ARCCORE_HOST_DEVICE DataType localValue() const
  {
    return m_local_value;
  }
 protected:

  impl::IReduceMemoryImpl* m_memory_impl = nullptr;

  //! Memory (host or device) holding the final reduced value.
  DataType* m_host_or_device_memory_for_reduced_value = nullptr;

  impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;

  RunCommand* m_command = nullptr;

  mutable DataType m_local_value;
  DataType* m_atomic_parent_value = nullptr;
  mutable DataType m_atomic_value;

  //! True only for the instance created directly from the RunCommand.
  bool m_is_master_instance = false;
 protected:

  //! Performs the final reduction and returns the reduced value.
  DataType _reduce()
  {
    if (!m_is_master_instance)
      ARCANE_FATAL("Final reduce operation is only valid on master instance");
    DataType* final_ptr = m_host_or_device_memory_for_reduced_value;
    if (m_memory_impl) {
      m_memory_impl->copyReduceValueFromDevice();
      final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);
      m_memory_impl->release();
      m_memory_impl = nullptr;
    }
    // Merge the values accumulated by the host (slave) instances.
    if (m_atomic_parent_value) {
      ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
      *final_ptr = *m_atomic_parent_value;
    }
    return *final_ptr;
  }
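  /*!
   * Only the master instance (the one built from the RunCommand) may call
   * _reduce(); copies captured by a kernel lambda only feed their parent.
   * Sketch (hypothetical variable names):
   *
   * \code
   * ReducerSum<Real> acc(command); // master instance
   * // ... copies of 'acc' call combine() inside the kernel ...
   * Real r = acc.reducedValue();   // valid: 'acc' is the master instance
   * // calling reducedValue() on a copy raises a FatalErrorException
   * \endcode
   */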
  /*!
   * Reduces the value of the current work item (the method name is elided
   * in this extract; '_internalReduce' is used here as a placeholder).
   */
  ARCCORE_HOST_DEVICE void _internalReduce()
  {
#ifdef ARCCORE_DEVICE_CODE
    // Device path: grid-level reduction through a buffer holding one
    // partial value per block.
    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));

    impl::ReduceDeviceInfo<DataType> dvi;
    dvi.m_grid_buffer = grid_buffer;
    dvi.m_device_count = m_grid_memory_info.m_grid_device_count;
    dvi.m_device_final_ptr = m_host_or_device_memory_for_reduced_value;
    dvi.m_host_final_ptr = m_grid_memory_info.m_host_memory_for_reduced_value;
    dvi.m_current_value = m_local_value;
    dvi.m_identity = m_identity;
    ReduceFunctor::applyDevice(dvi);
#else
    // Host path: slave instances atomically merge their local value into
    // their parent.
    if (!m_is_master_instance)
      ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
#endif
  }
};
/*!
 * \brief Reducer usable on host and device (version 1 API).
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
 public:

  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;

 public:

  explicit HostDeviceReducer(RunCommand& command)
  : BaseClass(command)
  {}
  HostDeviceReducer(const HostDeviceReducer& rhs) = default;
  ARCCORE_HOST_DEVICE ~HostDeviceReducer()
  {
    // Each instance reduces its work-item value when it goes out of scope.
    this->_internalReduce();
  }

 public:

  DataType reduce()
  {
    return this->_reduce();
  }
  DataType reducedValue()
  {
    return this->_reduce();
  }
};
/*!
 * \brief Reducer usable on host and device (version 2 API).
 *
 * Instances of this class are passed as additional arguments to the
 * RUNCOMMAND_LOOP macros instead of being captured by copy.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer2
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
  friend impl::KernelRemainingArgsHelper;
  friend ::Arcane::impl::HostReducerHelper;

 public:

  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
  using BaseClass::m_grid_memory_info;
  using BaseClass::m_host_or_device_memory_for_reduced_value;
  using BaseClass::m_local_value;

 public:

  explicit HostDeviceReducer2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  DataType reducedValue()
  {
    return this->_reduce();
  }

 private:

  void _internalReduceHost()
  {
    // Host-side part of the per-work-item reduction.
    this->_internalReduce();
  }
#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
 public:

  ARCCORE_HOST_DEVICE void _internalExecWorkItem(Int32)
  {
    this->_internalReduce();
  }
#endif
#if defined(ARCANE_COMPILING_SYCL)
 public:

  void _internalExecWorkItem(sycl::nd_item<1> id)
  {
    unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
    const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
    const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
    const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));

    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));

    DataType v = m_local_value;
    bool is_last = false;
    auto sycl_functor = ReduceFunctor::syclFunctor();
    // Reduce the values of the work-group, then publish one partial result
    // per group.
    DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);
    if (local_id == 0) {
      grid_buffer[group_id] = local_sum;
#if defined(__ADAPTIVECPP__)
      // AdaptiveCpp: use 'int' for the atomic counter.
      int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);
      sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
#else
      sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
#endif
      Int32 cx = a.fetch_add(1);
      if (cx == (nb_block - 1))
        is_last = true;
    }

    // The last group to publish its value combines all the partial results.
    if (is_last) {
      DataType my_total = grid_buffer[0];
      for (int x = 1; x < nb_block; ++x)
        my_total = sycl_functor(my_total, grid_buffer[x]);
      // Keep the total in grid_buffer[0] and reset the counter for the
      // next launch.
      grid_buffer[0] = my_total;
      *m_host_or_device_memory_for_reduced_value = my_total;
      *atomic_counter_ptr = 0;
    }
  }
#endif
};
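/*!
 * Note: the SYCL work-item reduction above uses the classic "last block"
 * scheme: each work-group publishes its partial value in grid_buffer, then
 * atomically increments a shared counter. The group that observes the value
 * nb_block - 1 knows every partial result is available, combines them
 * serially and resets the counter for the next launch.
 */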
/*!
 * \brief Reducer used when compiling for SYCL (version 1 API).
 */
template <typename DataType, typename ReduceFunctor>
class SyclReducer
{
 public:

  explicit SyclReducer(RunCommand&) {}

 public:

  DataType reducedValue()
  {
    return m_local_value;
  }
  void setValue(DataType v) { m_local_value = v; }

 protected:

  mutable DataType m_local_value = {};
};
#if defined(ARCANE_COMPILING_SYCL)
template <typename DataType, typename ReduceFunctor>
using Reducer = SyclReducer<DataType, ReduceFunctor>;
#else
template <typename DataType, typename ReduceFunctor>
using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
#endif
/*!
 * \brief Class to perform a sum reduction (version 1 API).
 */
template <typename DataType>
class ReducerSum
: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
{
  using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerSum(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value += v;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType add(DataType v) const
  {
    return combine(v);
  }
};
/*!
 * \brief Class to perform a max reduction (version 1 API).
 */
template <typename DataType>
class ReducerMax
: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
{
  using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerMax(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v > m_local_value ? v : m_local_value;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType max(DataType v) const
  {
    return combine(v);
  }
};
/*!
 * \brief Class to perform a min reduction (version 1 API).
 */
template <typename DataType>
class ReducerMin
: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
{
  using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerMin(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v < m_local_value ? v : m_local_value;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType min(DataType v) const
  {
    return combine(v);
  }
};
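/*!
 * Usage sketch for the version 1 reducers (assumes RunCommandLoop.h and a
 * NumArray 'values' are available; the macro and accessor names follow the
 * usual Arcane accelerator API):
 *
 * \code
 * using namespace Arcane::Accelerator;
 * auto command = makeCommand(queue);
 * ReducerSum<Real> total(command);
 * command << RUNCOMMAND_LOOP1(iter, values.extent0())
 * {
 *   auto [i] = iter();
 *   total.combine(values[i]);
 * };
 * Real s = total.reducedValue();
 * \endcode
 */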
/*!
 * \brief Class to perform a sum reduction (version 2 API).
 */
template <typename DataType>
class ReducerSum2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;

 public:

  explicit ReducerSum2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    this->m_local_value += v;
  }
};
/*!
 * \brief Class to perform a max reduction (version 2 API).
 */
template <typename DataType>
class ReducerMax2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;

 public:

  explicit ReducerMax2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v > lv ? v : lv;
  }
};
/*!
 * \brief Class to perform a min reduction (version 2 API).
 */
template <typename DataType>
class ReducerMin2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;

 public:

  explicit ReducerMin2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v < lv ? v : lv;
  }
};
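/*!
 * Usage sketch for the version 2 reducers: unlike the version 1 API, the
 * reducer is passed as an additional argument of the loop macro instead of
 * being captured by copy (assumes RunCommandLoop.h; names follow the usual
 * Arcane accelerator API):
 *
 * \code
 * using namespace Arcane::Accelerator;
 * auto command = makeCommand(queue);
 * ReducerSum2<Real> total(command);
 * command << RUNCOMMAND_LOOP1(iter, values.extent0(), total)
 * {
 *   auto [i] = iter();
 *   total.combine(values[i]);
 * };
 * Real s = total.reducedValue();
 * \endcode
 */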
} // namespace Arcane::Accelerator

#define ARCANE_INLINE_REDUCE_IMPL

#ifdef ARCANE_INLINE_REDUCE_IMPL

# ifndef ARCANE_INLINE_REDUCE
#  define ARCANE_INLINE_REDUCE inline
# endif

#if defined(__CUDACC__) || defined(__HIP__)
# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
#endif

#endif

#include "arcane/accelerator/GenericReducer.h"

#endif // ARCANE_ACCELERATOR_REDUCE_H