12#ifndef ARCANE_ACCELERATOR_REDUCE_H
13#define ARCANE_ACCELERATOR_REDUCE_H
17#include "arcane/utils/ArrayView.h"
18#include "arcane/utils/String.h"
20#include "arcane/accelerator/core/IReduceMemoryImpl.h"
21#include "arcane/accelerator/AcceleratorGlobal.h"
22#include "arcane/accelerator/CommonUtils.h"
23#include "arcane/accelerator/RunCommandLaunchInfo.h"
36class HostReducerHelper;
39namespace Arcane::Accelerator::impl
41class KernelReducerHelper;
51extern "C++" ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl*
52internalGetOrCreateReduceMemoryImpl(RunCommand* command);
54template <
typename DataType>
62 ARCCORE_HOST_DEVICE
static constexpr double sumValue() {
return 0.0; }
63 ARCCORE_HOST_DEVICE
static constexpr double minValue() {
return DBL_MAX; }
64 ARCCORE_HOST_DEVICE
static constexpr double maxValue() {
return -DBL_MAX; }
71 ARCCORE_HOST_DEVICE
static constexpr Int32 sumValue() {
return 0; }
72 ARCCORE_HOST_DEVICE
static constexpr Int32 minValue() {
return INT32_MAX; }
73 ARCCORE_HOST_DEVICE
static constexpr Int32 maxValue() {
return -INT32_MAX; }
80 ARCCORE_HOST_DEVICE
static constexpr Int64 sumValue() {
return 0; }
81 ARCCORE_HOST_DEVICE
static constexpr Int64 minValue() {
return INT64_MAX; }
82 ARCCORE_HOST_DEVICE
static constexpr Int64 maxValue() {
return -INT64_MAX; }
95template <
typename DataType>
123template <
typename DataType>
131 static double apply(
double* vptr,
double v)
133 std::atomic_ref<double> aref(*vptr);
134 double old = aref.load(std::memory_order_consume);
135 double wanted = old + v;
136 while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
148 std::atomic_ref<Int64> aref(*vptr);
149 Int64 x = aref.fetch_add(v);
160 std::atomic_ref<Int32> aref(*vptr);
161 Int32 x = aref.fetch_add(v);
169template <
typename DataType>
174 static ARCCORE_DEVICE DataType
177 _applyDevice(dev_info);
180 static DataType apply(DataType* vptr, DataType v)
184#if defined(ARCANE_COMPILING_SYCL)
185 static sycl::plus<DataType> syclFunctor() {
return {}; }
200template <
typename DataType>
205 static ARCCORE_DEVICE DataType
208 _applyDevice(dev_info);
211 static DataType apply(DataType* ptr, DataType v)
213 std::atomic_ref<DataType> aref(*ptr);
214 DataType prev_value = aref.load();
215 while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
219#if defined(ARCANE_COMPILING_SYCL)
220 static sycl::maximum<DataType> syclFunctor() {
return {}; }
235template <
typename DataType>
240 static ARCCORE_DEVICE DataType
243 _applyDevice(dev_info);
246 static DataType apply(DataType* vptr, DataType v)
248 std::atomic_ref<DataType> aref(*vptr);
249 DataType prev_value = aref.load();
250 while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
254#if defined(ARCANE_COMPILING_SYCL)
255 static sycl::minimum<DataType> syclFunctor() {
return {}; }
301template <
typename DataType,
typename ReduceFunctor>
302class HostDeviceReducerBase
307 : m_host_or_device_memory_for_reduced_value(&m_local_value)
308 , m_command(&command)
311 m_is_master_instance =
true;
312 m_identity = ReduceFunctor::identity();
313 m_local_value = m_identity;
314 m_atomic_value = m_identity;
315 m_atomic_parent_value = &m_atomic_value;
317 m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
319 m_host_or_device_memory_for_reduced_value = impl::allocateReduceDataMemory<DataType>(m_memory_impl, m_identity);
320 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
324#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
325 HostDeviceReducerBase(
const HostDeviceReducerBase& rhs) =
default;
327 ARCCORE_HOST_DEVICE HostDeviceReducerBase(
const HostDeviceReducerBase& rhs)
328 : m_host_or_device_memory_for_reduced_value(rhs.m_host_or_device_memory_for_reduced_value)
329 , m_local_value(rhs.m_local_value)
330 , m_identity(rhs.m_identity)
332#ifdef ARCCORE_DEVICE_CODE
333 m_grid_memory_info = rhs.m_grid_memory_info;
338 m_memory_impl = rhs.m_memory_impl;
340 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
343 m_atomic_parent_value = rhs.m_atomic_parent_value;
344 m_local_value = rhs.m_identity;
345 m_atomic_value = m_identity;
354 ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) =
delete;
355 HostDeviceReducerBase& operator=(
const HostDeviceReducerBase& rhs) =
delete;
359 ARCCORE_HOST_DEVICE
void setValue(DataType v)
363 ARCCORE_HOST_DEVICE DataType localValue()
const
365 return m_local_value;
370 impl::IReduceMemoryImpl* m_memory_impl =
nullptr;
377 DataType* m_host_or_device_memory_for_reduced_value =
nullptr;
378 impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;
382 RunCommand* m_command =
nullptr;
386 mutable DataType m_local_value;
387 DataType* m_atomic_parent_value =
nullptr;
388 mutable DataType m_atomic_value;
394 bool m_is_master_instance =
false;
401 if (!m_is_master_instance)
402 ARCANE_FATAL(
"Final reduce operation is only valid on master instance");
404 DataType* final_ptr = m_host_or_device_memory_for_reduced_value;
406 m_memory_impl->copyReduceValueFromDevice();
407 final_ptr =
reinterpret_cast<DataType*
>(m_grid_memory_info.m_host_memory_for_reduced_value);
408 m_memory_impl->release();
409 m_memory_impl =
nullptr;
412 if (m_atomic_parent_value) {
416 ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
417 *final_ptr = *m_atomic_parent_value;
429 ARCCORE_HOST_DEVICE
void
432#ifdef ARCCORE_DEVICE_CODE
436 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
437 DataType* buf =
reinterpret_cast<DataType*
>(buf_span.data());
440 impl::ReduceDeviceInfo<DataType> dvi;
441 dvi.m_grid_buffer = grid_buffer;
442 dvi.m_device_count = m_grid_memory_info.m_grid_device_count;
443 dvi.m_device_final_ptr = m_host_or_device_memory_for_reduced_value;
444 dvi.m_host_final_ptr = m_grid_memory_info.m_host_memory_for_reduced_value;
445 dvi.m_current_value = m_local_value;
446 dvi.m_identity = m_identity;
448 ReduceFunctor::applyDevice(dvi);
456 if (!m_is_master_instance)
457 ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
472template <
typename DataType,
typename ReduceFunctor>
473class HostDeviceReducer
474:
public HostDeviceReducerBase<DataType, ReduceFunctor>
478 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
482 explicit HostDeviceReducer(RunCommand& command)
485 HostDeviceReducer(
const HostDeviceReducer& rhs) =
default;
486 ARCCORE_HOST_DEVICE ~HostDeviceReducer()
495 return this->_reduce();
498 DataType reducedValue()
500 return this->_reduce();
509template <
typename DataType,
typename ReduceFunctor>
510class HostDeviceReducer2
511:
public HostDeviceReducerBase<DataType, ReduceFunctor>
513 friend impl::KernelRemainingArgsHelper;
514 friend ::Arcane::impl::HostReducerHelper;
518 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
519 using BaseClass::m_grid_memory_info;
520 using BaseClass::m_host_or_device_memory_for_reduced_value;
521 using BaseClass::m_local_value;
525 explicit HostDeviceReducer2(RunCommand& command)
531 DataType reducedValue()
533 return this->_reduce();
541 void _internalReduceHost()
546#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
547 ARCCORE_HOST_DEVICE
void _internalExecWorkItem(Int32)
553#if defined(ARCANE_COMPILING_SYCL)
554 void _internalExecWorkItem(sycl::nd_item<1>
id)
556 unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
557 const Int32 local_id =
static_cast<Int32
>(
id.get_local_id(0));
558 const Int32 group_id =
static_cast<Int32
>(
id.get_group_linear_id());
559 const Int32 nb_block =
static_cast<Int32
>(
id.get_group_range(0));
561 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
562 DataType* buf =
reinterpret_cast<DataType*
>(buf_span.data());
565 DataType v = m_local_value;
566 bool is_last =
false;
567 auto sycl_functor = ReduceFunctor::syclFunctor();
568 DataType local_sum = sycl::reduce_over_group(
id.get_group(), v, sycl_functor);
570 grid_buffer[group_id] = local_sum;
579#if defined(__ADAPTIVECPP__)
580 int* atomic_counter_ptr_as_int =
reinterpret_cast<int*
>(atomic_counter_ptr);
581 sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
583 sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
585 Int32 cx = a.fetch_add(1);
586 if (cx == (nb_block - 1))
593 DataType my_total = grid_buffer[0];
594 for (
int x = 1; x < nb_block; ++x)
595 my_total = sycl_functor(my_total, grid_buffer[x]);
597 grid_buffer[0] = my_total;
598 *m_host_or_device_memory_for_reduced_value = my_total;
599 *atomic_counter_ptr = 0;
613template <
typename DataType,
typename ReduceFunctor>
618 explicit SyclReducer(RunCommand&) {}
624 return m_local_value;
626 void setValue(DataType v) { m_local_value = v; }
630 mutable DataType m_local_value = {};
636#if defined(ARCANE_COMPILING_SYCL)
637template <
typename DataType,
typename ReduceFunctor>
using Reducer = SyclReducer<DataType, ReduceFunctor>;
639template <
typename DataType,
typename ReduceFunctor>
using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
650template <
typename DataType>
652:
public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
654 using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
655 using BaseClass::m_local_value;
659 explicit ReducerSum(RunCommand& command)
665 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
668 return m_local_value;
671 ARCCORE_HOST_DEVICE DataType
add(DataType v)
const
682template <
typename DataType>
684:
public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
686 using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
687 using BaseClass::m_local_value;
691 explicit ReducerMax(RunCommand& command)
697 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
699 m_local_value = v > m_local_value ? v : m_local_value;
700 return m_local_value;
703 ARCCORE_HOST_DEVICE DataType
max(DataType v)
const
717template <
typename DataType>
719:
public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
721 using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
722 using BaseClass::m_local_value;
726 explicit ReducerMin(RunCommand& command)
732 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
734 m_local_value = v < m_local_value ? v : m_local_value;
735 return m_local_value;
738 ARCCORE_HOST_DEVICE DataType
min(DataType v)
const
752template <
typename DataType>
754:
public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
756 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;
760 explicit ReducerSum2(RunCommand& command)
766 ARCCORE_HOST_DEVICE
void combine(DataType v)
768 this->m_local_value += v;
777template <
typename DataType>
779:
public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
781 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;
785 explicit ReducerMax2(RunCommand& command)
791 ARCCORE_HOST_DEVICE
void combine(DataType v)
793 DataType& lv = this->m_local_value;
794 lv = v > lv ? v : lv;
803template <
typename DataType>
805:
public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
807 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;
811 explicit ReducerMin2(RunCommand& command)
817 ARCCORE_HOST_DEVICE
void combine(DataType v)
819 DataType& lv = this->m_local_value;
820 lv = v < lv ? v : lv;
836#define ARCANE_INLINE_REDUCE_IMPL
838#ifdef ARCANE_INLINE_REDUCE_IMPL
840# ifndef ARCANE_INLINE_REDUCE
841# define ARCANE_INLINE_REDUCE inline
844#if defined(__CUDACC__) || defined(__HIP__)
845# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
860#include "arcane/accelerator/GenericReducer.h"
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et macros pour gérer les boucles sur les accélérateurs.
Gestion d'une commande sur accélérateur.
DataType * m_device_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
unsigned int * m_device_count
bool m_use_grid_reduce
Indique si on utilise la réduction par grille (sinon on utilise les atomiques)
void * m_host_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
DataType m_current_value
Valeur du thread courant à réduire.
SmallSpan< DataType > m_grid_buffer
Tableau avec une valeur par bloc pour la réduction.
DataType m_identity
Valeur de l'identité pour la réduction.
Vue d'un tableau d'éléments de type T.
__host__ __device__ Real2 min(Real2 a, Real2 b)
Retourne le minimum de deux Real2.
T max(const T &a, const T &b, const T &c)
Retourne le maximum de trois éléments.
Espace de nom pour l'utilisation des accélérateurs.
@ Atomic
Utilise des opérations atomiques entre les blocs.
void add(ArrayView< T > lhs, ConstArrayView< T > copy_array)
Ajoute le tableau copy_array dans l'instance.
Espace de nom de Arccore.
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.