#ifndef ARCANE_ACCELERATOR_REDUCE_H
#define ARCANE_ACCELERATOR_REDUCE_H

#include "arcane/utils/ArrayView.h"
#include "arcane/utils/String.h"

#include "arcane/accelerator/core/IReduceMemoryImpl.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/CommonUtils.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"

#include <atomic>
#include <cfloat>
#include <cstdint>

namespace Arcane::impl
{
class HostReducerHelper;
}

namespace Arcane::Accelerator::impl
{

class KernelReducerHelper;

extern "C++" ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl*
internalGetOrCreateReduceMemoryImpl(RunCommand* command);
//! Identity values for reductions, per data type.
//! (The primary template's name is elided in this extract; 'ReduceIdentity'
//! is a reconstruction.)
template <typename DataType>
class ReduceIdentity;

template <>
class ReduceIdentity<double>
{
 public:

  ARCCORE_HOST_DEVICE static constexpr double sumValue() { return 0.0; }
  ARCCORE_HOST_DEVICE static constexpr double minValue() { return DBL_MAX; }
  ARCCORE_HOST_DEVICE static constexpr double maxValue() { return -DBL_MAX; }
};

template <>
class ReduceIdentity<Int32>
{
 public:

  ARCCORE_HOST_DEVICE static constexpr Int32 sumValue() { return 0; }
  ARCCORE_HOST_DEVICE static constexpr Int32 minValue() { return INT32_MAX; }
  ARCCORE_HOST_DEVICE static constexpr Int32 maxValue() { return -INT32_MAX; }
};

template <>
class ReduceIdentity<Int64>
{
 public:

  ARCCORE_HOST_DEVICE static constexpr Int64 sumValue() { return 0; }
  ARCCORE_HOST_DEVICE static constexpr Int64 minValue() { return INT64_MAX; }
  ARCCORE_HOST_DEVICE static constexpr Int64 maxValue() { return -INT64_MAX; }
};
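/*
 * Note on the identity values above: for a 'max' reduction the identity must
 * compare lower than every representable value, hence -DBL_MAX rather than
 * DBL_MIN (which is the smallest *positive* normalized double, not the most
 * negative one). A minimal host-side sketch of how such an identity seeds a
 * fold (illustrative only, not part of this header; 'values' is hypothetical):
 *
 *   double acc = ReduceIdentity<double>::maxValue(); // -DBL_MAX
 *   for (double x : values)
 *     acc = (x > acc) ? x : acc;
 */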
//! Information needed to perform a reduction on the device.
template <typename DataType>
class ReduceDeviceInfo
{
 public:

  //! Array with one value per block for the grid reduction
  SmallSpan<DataType> m_grid_buffer;
  unsigned int* m_device_count = nullptr;
  //! Pointer to the reduced value (memory accessible only from the device)
  DataType* m_device_final_ptr = nullptr;
  //! Pointer to the reduced value (memory accessible only from the host)
  void* m_host_final_ptr = nullptr;
  //! Value of the current thread to reduce
  DataType m_current_value = {};
  //! Identity value for the reduction
  DataType m_identity = {};
  //! True if the grid-level reduction is used (otherwise atomics are used)
  bool m_use_grid_reduce = true;
};
//! Atomic sum helper. (The primary template's name is elided in this
//! extract; 'ReduceAtomic' is a reconstruction.)
template <typename DataType>
class ReduceAtomic;

template <>
class ReduceAtomic<double>
{
 public:

  //! No portable atomic add for 'double': emulate it with a compare-exchange loop.
  static double apply(double* vptr, double v)
  {
    std::atomic_ref<double> aref(*vptr);
    double old = aref.load(std::memory_order_consume);
    double wanted = old + v;
    // On failure, compare_exchange_weak() reloads 'old', so 'wanted' must be
    // recomputed from the freshest value.
    while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
      wanted = old + v;
    return wanted;
  }
};

template <>
class ReduceAtomic<Int64>
{
 public:

  static Int64 apply(Int64* vptr, Int64 v)
  {
    std::atomic_ref<Int64> aref(*vptr);
    Int64 x = aref.fetch_add(v);
    return x + v;
  }
};

template <>
class ReduceAtomic<Int32>
{
 public:

  static Int32 apply(Int32* vptr, Int32 v)
  {
    std::atomic_ref<Int32> aref(*vptr);
    Int32 x = aref.fetch_add(v);
    return x + v;
  }
};
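/*
 * The compare-exchange loop above is the standard way to obtain an atomic
 * floating-point add from C++20 std::atomic_ref. A self-contained sketch of
 * the same pattern (illustrative only; 'atomicAddDouble' is a hypothetical
 * name, not part of this header):
 *
 *   #include <atomic>
 *
 *   void atomicAddDouble(double& shared_sum, double v)
 *   {
 *     std::atomic_ref<double> aref(shared_sum);
 *     double old = aref.load();
 *     // On failure, compare_exchange_weak() refreshes 'old', so the desired
 *     // value 'old + v' is recomputed on every iteration until it succeeds.
 *     while (!aref.compare_exchange_weak(old, old + v)) {
 *     }
 *   }
 */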
//! Functor for 'Sum' reductions.
template <typename DataType>
class ReduceFunctorSum
{
 public:

  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    return ReduceAtomic<DataType>::apply(vptr, v);
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::plus<DataType> syclFunctor() { return {}; }
#endif
  static constexpr DataType identity() { return ReduceIdentity<DataType>::sumValue(); }
};
//! Functor for 'Max' reductions.
template <typename DataType>
class ReduceFunctorMax
{
 public:

  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* ptr, DataType v)
  {
    std::atomic_ref<DataType> aref(*ptr);
    DataType prev_value = aref.load();
    // Loop until 'v' is no longer greater or the exchange succeeds;
    // compare_exchange_weak() refreshes 'prev_value' on failure.
    while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
    }
    return aref.load();
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::maximum<DataType> syclFunctor() { return {}; }
#endif
  static constexpr DataType identity() { return ReduceIdentity<DataType>::maxValue(); }
};
//! Functor for 'Min' reductions.
template <typename DataType>
class ReduceFunctorMin
{
 public:

  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    std::atomic_ref<DataType> aref(*vptr);
    DataType prev_value = aref.load();
    // Loop until 'v' is no longer smaller or the exchange succeeds.
    while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
    }
    return aref.load();
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::minimum<DataType> syclFunctor() { return {}; }
#endif
  static constexpr DataType identity() { return ReduceIdentity<DataType>::minValue(); }
};
} // namespace Arcane::Accelerator::impl

namespace Arcane::Accelerator
{

//! Base class for reductions usable on host and device.
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducerBase
{
 public:

  explicit HostDeviceReducerBase(RunCommand& command)
  : m_host_or_device_memory_for_reduced_value(&m_local_value)
  , m_command(&command)
  {
    m_is_master_instance = true;
    m_identity = ReduceFunctor::identity();
    m_local_value = m_identity;
    m_atomic_value = m_identity;
    m_atomic_parent_value = &m_atomic_value;

    m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
    if (m_memory_impl) {
      m_host_or_device_memory_for_reduced_value = impl::allocateReduceDataMemory<DataType>(m_memory_impl, m_identity);
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
    }
  }

#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
  HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;
#else
  ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)
  : m_host_or_device_memory_for_reduced_value(rhs.m_host_or_device_memory_for_reduced_value)
  , m_local_value(rhs.m_local_value)
  , m_identity(rhs.m_identity)
  {
#ifdef ARCCORE_DEVICE_CODE
    m_grid_memory_info = rhs.m_grid_memory_info;
#else
    m_memory_impl = rhs.m_memory_impl;
    if (m_memory_impl)
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
    m_atomic_parent_value = rhs.m_atomic_parent_value;
    m_local_value = rhs.m_identity;
    m_atomic_value = m_identity;
#endif
  }
#endif

  ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;
  HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;

 public:

  ARCCORE_HOST_DEVICE void setValue(DataType v) { m_local_value = v; }
  ARCCORE_HOST_DEVICE DataType localValue() const
  {
    return m_local_value;
  }

 protected:

  impl::IReduceMemoryImpl* m_memory_impl = nullptr;
  //! Memory (host or device) holding the final reduced value
  DataType* m_host_or_device_memory_for_reduced_value = nullptr;
  impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;
  RunCommand* m_command = nullptr;
  mutable DataType m_local_value;
  DataType* m_atomic_parent_value = nullptr;
  mutable DataType m_atomic_value;
  DataType m_identity;
  bool m_is_master_instance = false;

 protected:

  DataType _reduce()
  {
    if (!m_is_master_instance)
      ARCANE_FATAL("Final reduce operation is only valid on master instance");
    DataType* final_ptr = m_host_or_device_memory_for_reduced_value;
    if (m_memory_impl) {
      m_memory_impl->copyReduceValueFromDevice();
      final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);
      m_memory_impl->release();
      m_memory_impl = nullptr;
    }
    if (m_atomic_parent_value) {
      ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
      *final_ptr = *m_atomic_parent_value;
    }
    return *final_ptr;
  }
  //! Performs the reduction for the current work item.
  //! (The method name is elided in this extract; '_internalReduce' is a
  //! reconstruction.)
  ARCCORE_HOST_DEVICE void
  _internalReduce()
  {
#ifdef ARCCORE_DEVICE_CODE
    // Device path: grid-level reduction through a per-block buffer.
    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size() / sizeof(DataType)));

    impl::ReduceDeviceInfo<DataType> dvi;
    dvi.m_grid_buffer = grid_buffer;
    dvi.m_device_count = m_grid_memory_info.m_grid_device_count;
    dvi.m_device_final_ptr = m_host_or_device_memory_for_reduced_value;
    dvi.m_host_final_ptr = m_grid_memory_info.m_host_memory_for_reduced_value;
    dvi.m_current_value = m_local_value;
    dvi.m_identity = m_identity;
    ReduceFunctor::applyDevice(dvi);
#else
    // Host path: fold the local value of each copied instance into the
    // master instance atomically.
    if (!m_is_master_instance)
      ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
#endif
  }
};
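/*
 * Reduction flow of HostDeviceReducerBase, as reconstructed from the code
 * above: on the device, every work item contributes its local value through
 * ReduceFunctor::applyDevice() using the per-block grid buffer; on the host,
 * every copied (non-master) instance folds its local value into the master's
 * atomic accumulator with ReduceFunctor::apply(). _reduce() then reads the
 * device result copied back to host memory (when device memory was used) and
 * merges the host atomic accumulator into it.
 */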
//! Reducer usable on host and device, finalized by its destructor.
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
 public:

  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;

 public:

  explicit HostDeviceReducer(RunCommand& command)
  : BaseClass(command)
  {}
  HostDeviceReducer(const HostDeviceReducer& rhs) = default;
  ARCCORE_HOST_DEVICE ~HostDeviceReducer()
  {
    // ... (per-work-item finalization elided in this extract)
  }

 public:

  // (method name reconstructed)
  DataType reduce()
  {
    return this->_reduce();
  }
  DataType reducedValue()
  {
    return this->_reduce();
  }
};
//! Version-2 reducer, passed explicitly to the kernel as a loop argument.
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer2
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
  friend impl::KernelReducerHelper;
  friend ::Arcane::impl::HostReducerHelper;

 public:

  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
  using BaseClass::m_grid_memory_info;
  using BaseClass::m_host_or_device_memory_for_reduced_value;
  using BaseClass::m_local_value;

 public:

  explicit HostDeviceReducer2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  DataType reducedValue()
  {
    return this->_reduce();
  }

 private:

  void _internalReduceHost()
  {
    // ... (host finalization elided in this extract)
  }

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
  ARCCORE_HOST_DEVICE void _internalExecWorkItem(Int32)
  {
    // ... (CUDA/HIP work-item reduction elided in this extract)
  }
#endif

#if defined(ARCANE_COMPILING_SYCL)
  void _internalExecWorkItem(sycl::nd_item<1> id)
  {
    unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
    const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
    const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
    const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));

    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size() / sizeof(DataType)));

    DataType v = m_local_value;
    bool is_last = false;
    auto sycl_functor = ReduceFunctor::syclFunctor();
    // Reduce the values of the current work-group.
    DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);
    if (local_id == 0) {
      grid_buffer[group_id] = local_sum;
      // Count the blocks that have published their partial value; the last
      // one performs the final combine.
#if defined(__ADAPTIVECPP__)
      // AdaptiveCpp: atomic_ref on 'unsigned int' is not available, so view
      // the counter as 'int'.
      int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);
      sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
#else
      sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
#endif
      Int32 cx = a.fetch_add(1);
      if (cx == (nb_block - 1))
        is_last = true;
    }
    if (is_last) {
      DataType my_total = grid_buffer[0];
      for (int x = 1; x < nb_block; ++x)
        my_total = sycl_functor(my_total, grid_buffer[x]);
      grid_buffer[0] = my_total;
      *m_host_or_device_memory_for_reduced_value = my_total;
      // Reset the counter for the next kernel launch.
      *atomic_counter_ptr = 0;
    }
  }
#endif
};
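/*
 * The SYCL work-item code above implements a classic two-level reduction:
 * each work-group first reduces its values with sycl::reduce_over_group()
 * and publishes one partial result per block in 'grid_buffer'; an atomic
 * counter then identifies the last block to finish, and that block alone
 * combines the per-block partial results into the final value and resets the
 * counter for the next launch.
 */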
//! Reducer implementation used with the SYCL backend.
template <typename DataType, typename ReduceFunctor>
class SyclReducer
{
 public:

  explicit SyclReducer(RunCommand&) {}

 public:

  // (method name reconstructed)
  DataType reducedValue()
  {
    return m_local_value;
  }
  void setValue(DataType v) { m_local_value = v; }

 protected:

  mutable DataType m_local_value = {};
};
#if defined(ARCANE_COMPILING_SYCL)
template <typename DataType, typename ReduceFunctor>
using Reducer = SyclReducer<DataType, ReduceFunctor>;
#else
template <typename DataType, typename ReduceFunctor>
using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
#endif
//! 'Sum' reducer (version 1).
template <typename DataType>
class ReducerSum
: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
{
 public:

  using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerSum(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value += v;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType add(DataType v) const
  {
    return combine(v);
  }
};
//! 'Max' reducer (version 1).
template <typename DataType>
class ReducerMax
: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
{
 public:

  using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerMax(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v > m_local_value ? v : m_local_value;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType max(DataType v) const
  {
    return combine(v);
  }
};
//! 'Min' reducer (version 1).
template <typename DataType>
class ReducerMin
: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
{
 public:

  using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerMin(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v < m_local_value ? v : m_local_value;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType min(DataType v) const
  {
    return combine(v);
  }
};
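/*
 * Usage sketch for the version-1 reducers, which are captured by copy inside
 * the kernel lambda (illustrative only; assumes a RunQueue 'queue' and a
 * device-accessible view 'values' of size 'n'):
 *
 *   auto command = makeCommand(queue);
 *   ReducerSum<double> acc(command);
 *   command << RUNCOMMAND_LOOP1(iter, n)
 *   {
 *     auto [i] = iter();
 *     acc.combine(values[i]);
 *   };
 *   double total = acc.reducedValue();
 */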
//! 'Sum' reducer (version 2), passed as an additional RUNCOMMAND_LOOP argument.
template <typename DataType>
class ReducerSum2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;

 public:

  explicit ReducerSum2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    this->m_local_value += v;
  }
};
//! 'Max' reducer (version 2), passed as an additional RUNCOMMAND_LOOP argument.
template <typename DataType>
class ReducerMax2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;

 public:

  explicit ReducerMax2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v > lv ? v : lv;
  }
};
//! 'Min' reducer (version 2), passed as an additional RUNCOMMAND_LOOP argument.
template <typename DataType>
class ReducerMin2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;

 public:

  explicit ReducerMin2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v < lv ? v : lv;
  }
};
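/*
 * Usage sketch for the version-2 reducers, which are passed as additional
 * RUNCOMMAND_LOOP arguments for reductions, per the macro's documentation
 * (illustrative only; assumes a RunQueue 'queue' and a device-accessible
 * view 'values' of size 'n'):
 *
 *   auto command = makeCommand(queue);
 *   ReducerSum2<double> reducer_sum(command);
 *   command << RUNCOMMAND_LOOP1(iter, n, reducer_sum)
 *   {
 *     auto [i] = iter();
 *     reducer_sum.combine(values[i]);
 *   };
 *   double total = reducer_sum.reducedValue();
 */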
} // namespace Arcane::Accelerator

#define ARCANE_INLINE_REDUCE_IMPL

#ifdef ARCANE_INLINE_REDUCE_IMPL

# ifndef ARCANE_INLINE_REDUCE
#  define ARCANE_INLINE_REDUCE inline
# endif

# if defined(__CUDACC__) || defined(__HIP__)
#  include "arcane/accelerator/CommonCudaHipReduceImpl.h"
# endif

#endif

#include "arcane/utils/NumArray.h"
#include "arcane/utils/FatalErrorException.h"
#include "arcane/accelerator/core/RunQueue.h"
namespace Arcane::Accelerator::impl
{

template <typename DataType>
class GenericReducerIf;

//! Maps a scan/reduce operator to the corresponding version-2 reducer.
//! (The traits name is elided in this extract; 'OperatorToReducerType' is a
//! reconstruction.)
template <typename DataType, typename Operator>
class OperatorToReducerType;

template <typename DataType>
class OperatorToReducerType<DataType, MaxOperator<DataType>>
{
 public:
  using ReducerType = ReducerMax2<DataType>;
};

template <typename DataType>
class OperatorToReducerType<DataType, MinOperator<DataType>>
{
 public:
  using ReducerType = ReducerMin2<DataType>;
};

template <typename DataType>
class OperatorToReducerType<DataType, SumOperator<DataType>>
{
 public:
  using ReducerType = ReducerSum2<DataType>;
};

//! Base class holding the storage used by the generic reduction.
template <typename DataType>
class GenericReducerBase
{
  friend class GenericReducerIf<DataType>;

 protected:

  DataType _reducedValue() const
  {
    // Wait for all pending commands (including the asynchronous copy of the
    // reduced value) before reading the result.
    m_queue.barrier();
    return m_host_reduce_storage[0];
  }

  void _allocate()
  {
    eMemoryRessource r = eMemoryRessource::HostPinned;
    if (m_host_reduce_storage.memoryRessource() != r)
      m_host_reduce_storage = NumArray<DataType, MDDim1>(r);
    m_host_reduce_storage.resize(1);
  }

 protected:

  // (member types reconstructed from their usage below)
  RunQueue m_queue;
  GenericDeviceStorage m_algo_storage;
  DeviceStorage<DataType> m_device_reduce_storage;
  NumArray<DataType, MDDim1> m_host_reduce_storage;
};
//! Implementation of the generic reduction for each execution policy.
template <typename DataType>
class GenericReducerIf
{
 public:

  template <typename InputIterator, typename ReduceOperator>
  void apply(GenericReducerBase<DataType>& s, Int32 nb_item, DataType init_value,
             InputIterator input_iter, ReduceOperator reduce_op,
             const TraceInfo& trace_info)
  {
    RunQueue queue = s.m_queue;
    RunCommand command = makeCommand(queue);
    command << trace_info;
    impl::RunCommandLaunchInfo launch_info(command, nb_item);
    launch_info.beginExecute();
    eExecutionPolicy exec_policy = queue.executionPolicy();
    switch (exec_policy) {
#if defined(ARCANE_COMPILING_CUDA)
    case eExecutionPolicy::CUDA: {
      size_t temp_storage_size = 0;
      cudaStream_t stream = impl::CudaUtils::toNativeStream(queue);
      DataType* reduced_value_ptr = nullptr;
      // First call: only computes the required temporary storage size.
      ARCANE_CHECK_CUDA(::cub::DeviceReduce::Reduce(nullptr, temp_storage_size, input_iter, reduced_value_ptr,
                                                    nb_item, reduce_op, init_value, stream));
      s.m_algo_storage.allocate(temp_storage_size);
      reduced_value_ptr = s.m_device_reduce_storage.allocate();
      ARCANE_CHECK_CUDA(::cub::DeviceReduce::Reduce(s.m_algo_storage.address(), temp_storage_size,
                                                    input_iter, reduced_value_ptr, nb_item,
                                                    reduce_op, init_value, stream));
      s.m_device_reduce_storage.copyToAsync(s.m_host_reduce_storage, queue);
    } break;
#endif
#if defined(ARCANE_COMPILING_HIP)
    case eExecutionPolicy::HIP: {
      size_t temp_storage_size = 0;
      hipStream_t stream = impl::HipUtils::toNativeStream(queue);
      DataType* reduced_value_ptr = nullptr;
      // First call: only computes the required temporary storage size.
      ARCANE_CHECK_HIP(rocprim::reduce(nullptr, temp_storage_size, input_iter, reduced_value_ptr, init_value,
                                       nb_item, reduce_op, stream));
      s.m_algo_storage.allocate(temp_storage_size);
      reduced_value_ptr = s.m_device_reduce_storage.allocate();
      ARCANE_CHECK_HIP(rocprim::reduce(s.m_algo_storage.address(), temp_storage_size, input_iter, reduced_value_ptr, init_value,
                                       nb_item, reduce_op, stream));
      s.m_device_reduce_storage.copyToAsync(s.m_host_reduce_storage, queue);
    } break;
#endif
#if defined(ARCANE_COMPILING_SYCL)
    case eExecutionPolicy::SYCL: {
      using ReducerType = typename OperatorToReducerType<DataType, ReduceOperator>::ReducerType;
      RunCommand command2 = makeCommand(queue);
      ReducerType reducer(command2);
      command2 << RUNCOMMAND_LOOP1(iter, nb_item, reducer)
      {
        auto [i] = iter();
        reducer.combine(input_iter[i]);
      };
      s.m_host_reduce_storage[0] = reducer.reducedValue();
    } break;
#endif
    case eExecutionPolicy::Sequential:
    case eExecutionPolicy::Thread: {
      DataType reduced_value = init_value;
      for (Int32 i = 0; i < nb_item; ++i) {
        reduced_value = reduce_op(reduced_value, *input_iter);
        ++input_iter;
      }
      s.m_host_reduce_storage[0] = reduced_value;
    } break;
    default:
      ARCANE_FATAL("Invalid execution policy");
    }
    launch_info.endExecute();
  }
};
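/*
 * Note on the CUDA/HIP branches above: both cub::DeviceReduce::Reduce() and
 * rocprim::reduce() follow a two-phase protocol. The first call, with a null
 * temporary-storage pointer, only computes 'temp_storage_size'; the second
 * call, after allocating that storage, performs the actual reduction on the
 * stream. The reduced value lands in device memory and is copied back to host
 * memory asynchronously, which is why readers of the result must synchronize
 * with the queue first.
 */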
} // namespace Arcane::Accelerator::impl

namespace Arcane::Accelerator
{

//! Generic reduction algorithm on accelerator.
//! (The bodies of the public apply* methods are reconstructions; only their
//! signatures are documented in this extract.)
template <typename DataType>
class GenericReducer
: private impl::GenericReducerBase<DataType>
{
 public:

  explicit GenericReducer(const RunQueue& queue)
  {
    this->m_queue = queue;
    this->_allocate();
  }

 public:

  //! Applies a 'Sum' reduction on the values \a values
  void applySum(SmallSpan<const DataType> values, const TraceInfo& trace_info = TraceInfo())
  {
    _apply(values.size(), values.data(), impl::SumOperator<DataType>{}, trace_info);
  }

  //! Applies a 'Min' reduction on the values \a values
  void applyMin(SmallSpan<const DataType> values, const TraceInfo& trace_info = TraceInfo())
  {
    _apply(values.size(), values.data(), impl::MinOperator<DataType>{}, trace_info);
  }

  //! Applies a 'Max' reduction on the values \a values
  void applyMax(SmallSpan<const DataType> values, const TraceInfo& trace_info = TraceInfo())
  {
    _apply(values.size(), values.data(), impl::MaxOperator<DataType>{}, trace_info);
  }

  //! Applies a 'Sum' reduction on the values selected by \a select_lambda
  template <typename SelectLambda>
  void applySumWithIndex(Int32 nb_value, const SelectLambda& select_lambda, const TraceInfo& trace_info = TraceInfo())
  {
    _applyWithIndex(nb_value, select_lambda, impl::SumOperator<DataType>{}, trace_info);
  }

  //! Applies a 'Min' reduction on the values selected by \a select_lambda
  template <typename SelectLambda>
  void applyMinWithIndex(Int32 nb_value, const SelectLambda& select_lambda, const TraceInfo& trace_info = TraceInfo())
  {
    _applyWithIndex(nb_value, select_lambda, impl::MinOperator<DataType>{}, trace_info);
  }

  //! Applies a 'Max' reduction on the values selected by \a select_lambda
  template <typename SelectLambda>
  void applyMaxWithIndex(Int32 nb_value, const SelectLambda& select_lambda, const TraceInfo& trace_info = TraceInfo())
  {
    _applyWithIndex(nb_value, select_lambda, impl::MaxOperator<DataType>{}, trace_info);
  }

  //! Value of the reduction
  DataType reducedValue()
  {
    m_is_already_called = false;
    return this->_reducedValue();
  }

 private:

  bool m_is_already_called = false;

 private:

  template <typename InputIterator, typename ReduceOperator>
  void _apply(Int32 nb_value, InputIterator input_iter, ReduceOperator reduce_op,
              const TraceInfo& trace_info)
  {
    _setCalled();
    impl::GenericReducerBase<DataType>* base_ptr = this;
    impl::GenericReducerIf<DataType> gf;
    DataType init_value = reduce_op.defaultValue();
    gf.apply(*base_ptr, nb_value, init_value, input_iter, reduce_op, trace_info);
  }

  template <typename GetterLambda, typename ReduceOperator>
  void _applyWithIndex(Int32 nb_value, const GetterLambda& getter_lambda,
                       ReduceOperator reduce_op, const TraceInfo& trace_info)
  {
    _setCalled();
    impl::GenericReducerBase<DataType>* base_ptr = this;
    impl::GenericReducerIf<DataType> gf;
    impl::GetterLambdaIterator<DataType, GetterLambda> input_iter(getter_lambda);
    DataType init_value = reduce_op.defaultValue();
    gf.apply(*base_ptr, nb_value, init_value, input_iter, reduce_op, trace_info);
  }
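  /*
   * Usage sketch for GenericReducer (illustrative only; assumes a RunQueue
   * 'queue' and a device-accessible view 'values' of size 'n'):
   *
   *   GenericReducer<double> reducer(queue);
   *   reducer.applySum(values);
   *   double total = reducer.reducedValue();
   *
   *   // Index-based variant: reduce values produced by a lambda.
   *   reducer.applyMaxWithIndex(n, [=] ARCCORE_HOST_DEVICE (Int32 i) { return values[i]; });
   *   double vmax = reducer.reducedValue();
   */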
  // (method name reconstructed)
  void _setCalled()
  {
    if (m_is_already_called)
      ARCANE_FATAL("apply() has already been called for this instance");
    m_is_already_called = true;
  }
};

} // namespace Arcane::Accelerator

#endif