12#ifndef ARCANE_ACCELERATOR_REDUCE_H
13#define ARCANE_ACCELERATOR_REDUCE_H
17#include "arcane/utils/ArrayView.h"
18#include "arcane/utils/String.h"
20#include "arcane/accelerator/core/IReduceMemoryImpl.h"
21#include "arcane/accelerator/AcceleratorGlobal.h"
22#include "arcane/accelerator/CommonUtils.h"
23#include "arcane/accelerator/RunCommandLaunchInfo.h"
39namespace Arcane::Accelerator::impl
41class KernelReducerHelper;
47internalGetOrCreateReduceMemoryImpl(RunCommand* command);
49template <
typename DataType>
57 ARCCORE_HOST_DEVICE
static constexpr double sumValue() {
return 0.0; }
58 ARCCORE_HOST_DEVICE
static constexpr double minValue() {
return DBL_MAX; }
59 ARCCORE_HOST_DEVICE
static constexpr double maxValue() {
return -DBL_MAX; }
66 ARCCORE_HOST_DEVICE
static constexpr Int32 sumValue() {
return 0; }
67 ARCCORE_HOST_DEVICE
static constexpr Int32 minValue() {
return INT32_MAX; }
68 ARCCORE_HOST_DEVICE
static constexpr Int32 maxValue() {
return -INT32_MAX; }
75 ARCCORE_HOST_DEVICE
static constexpr Int64 sumValue() {
return 0; }
76 ARCCORE_HOST_DEVICE
static constexpr Int64 minValue() {
return INT64_MAX; }
77 ARCCORE_HOST_DEVICE
static constexpr Int64 maxValue() {
return -INT64_MAX; }
90template <
typename DataType>
118template <
typename DataType>
126 static double apply(
double* vptr,
double v)
128 std::atomic_ref<double> aref(*vptr);
129 double old = aref.load(std::memory_order_consume);
130 double wanted = old + v;
131 while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
143 std::atomic_ref<Int64> aref(*vptr);
144 Int64 x = aref.fetch_add(v);
155 std::atomic_ref<Int32> aref(*vptr);
156 Int32 x = aref.fetch_add(v);
164template <
typename DataType>
169 static ARCCORE_DEVICE DataType
172 _applyDevice(dev_info);
175 static DataType apply(DataType* vptr, DataType v)
179#if defined(ARCANE_COMPILING_SYCL)
180 static sycl::plus<DataType> syclFunctor() {
return {}; }
195template <
typename DataType>
200 static ARCCORE_DEVICE DataType
203 _applyDevice(dev_info);
206 static DataType apply(DataType* ptr, DataType v)
208 std::atomic_ref<DataType> aref(*ptr);
209 DataType prev_value = aref.load();
210 while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
214#if defined(ARCANE_COMPILING_SYCL)
215 static sycl::maximum<DataType> syclFunctor() {
return {}; }
230template <
typename DataType>
235 static ARCCORE_DEVICE DataType
238 _applyDevice(dev_info);
241 static DataType apply(DataType* vptr, DataType v)
243 std::atomic_ref<DataType> aref(*vptr);
244 DataType prev_value = aref.load();
245 while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
249#if defined(ARCANE_COMPILING_SYCL)
250 static sycl::minimum<DataType> syclFunctor() {
return {}; }
296template <
typename DataType,
typename ReduceFunctor>
297class HostDeviceReducerBase
303 , m_command(&command)
306 m_is_master_instance =
true;
307 m_identity = ReduceFunctor::identity();
308 m_local_value = m_identity;
309 m_atomic_value = m_identity;
310 m_atomic_parent_value = &m_atomic_value;
312 m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
315 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
319#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
320 HostDeviceReducerBase(
const HostDeviceReducerBase& rhs) =
default;
322 ARCCORE_HOST_DEVICE HostDeviceReducerBase(
const HostDeviceReducerBase& rhs)
324 , m_local_value(rhs.m_local_value)
325 , m_identity(rhs.m_identity)
327#ifdef ARCCORE_DEVICE_CODE
328 m_grid_memory_info = rhs.m_grid_memory_info;
333 m_memory_impl = rhs.m_memory_impl;
335 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
338 m_atomic_parent_value = rhs.m_atomic_parent_value;
339 m_local_value = rhs.m_identity;
340 m_atomic_value = m_identity;
349 ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) =
delete;
350 HostDeviceReducerBase& operator=(
const HostDeviceReducerBase& rhs) =
delete;
354 ARCCORE_HOST_DEVICE
void setValue(DataType v)
358 ARCCORE_HOST_DEVICE DataType localValue()
const
360 return m_local_value;
381 mutable DataType m_local_value;
382 DataType* m_atomic_parent_value =
nullptr;
383 mutable DataType m_atomic_value;
389 bool m_is_master_instance =
false;
396 if (!m_is_master_instance)
397 ARCANE_FATAL(
"Final reduce operation is only valid on master instance");
401 m_memory_impl->copyReduceValueFromDevice();
402 final_ptr =
reinterpret_cast<DataType*
>(m_grid_memory_info.m_host_memory_for_reduced_value);
403 m_memory_impl->release();
404 m_memory_impl =
nullptr;
407 if (m_atomic_parent_value) {
411 ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
412 *final_ptr = *m_atomic_parent_value;
424 ARCCORE_HOST_DEVICE
void
427#ifdef ARCCORE_DEVICE_CODE
432 DataType* buf =
reinterpret_cast<DataType*
>(buf_span.data());
443 ReduceFunctor::applyDevice(dvi);
451 if (!m_is_master_instance)
452 ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
467template <
typename DataType,
typename ReduceFunctor>
468class HostDeviceReducer
469:
public HostDeviceReducerBase<DataType, ReduceFunctor>
473 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
477 explicit HostDeviceReducer(
RunCommand& command)
480 HostDeviceReducer(
const HostDeviceReducer& rhs) =
default;
481 ARCCORE_HOST_DEVICE ~HostDeviceReducer()
493 DataType reducedValue()
504template <
typename DataType,
typename ReduceFunctor>
505class HostDeviceReducer2
506:
public HostDeviceReducerBase<DataType, ReduceFunctor>
509 friend ::Arcane::impl::HostReducerHelper;
513 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
514 using BaseClass::m_grid_memory_info;
516 using BaseClass::m_local_value;
520 explicit HostDeviceReducer2(
RunCommand& command)
526 DataType reducedValue()
536 void _internalReduceHost()
541#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
542 ARCCORE_HOST_DEVICE
void _internalExecWorkItemAtEnd(Int32)
546 ARCCORE_HOST_DEVICE
void _internalExecWorkItemAtBegin(
Int32){}
549#if defined(ARCANE_COMPILING_SYCL)
550 void _internalExecWorkItemAtBegin(sycl::nd_item<1>){}
551 void _internalExecWorkItemAtEnd(sycl::nd_item<1>
id)
553 unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
554 const Int32 local_id =
static_cast<Int32>(
id.get_local_id(0));
555 const Int32 group_id =
static_cast<Int32>(
id.get_group_linear_id());
556 const Int32 nb_block =
static_cast<Int32>(
id.get_group_range(0));
558 auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
559 DataType* buf =
reinterpret_cast<DataType*
>(buf_span.data());
560 SmallSpan<DataType> grid_buffer(buf,
static_cast<Int32>(buf_span.size()));
562 DataType v = m_local_value;
563 bool is_last =
false;
564 auto sycl_functor = ReduceFunctor::syclFunctor();
565 DataType local_sum = sycl::reduce_over_group(
id.get_group(), v, sycl_functor);
567 grid_buffer[group_id] = local_sum;
576#if defined(__ADAPTIVECPP__)
577 int* atomic_counter_ptr_as_int =
reinterpret_cast<int*
>(atomic_counter_ptr);
578 sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
580 sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
582 Int32 cx = a.fetch_add(1);
583 if (cx == (nb_block - 1))
590 DataType my_total = grid_buffer[0];
591 for (
int x = 1; x < nb_block; ++x)
592 my_total = sycl_functor(my_total, grid_buffer[x]);
594 grid_buffer[0] = my_total;
596 *atomic_counter_ptr = 0;
610template <
typename DataType,
typename ReduceFunctor>
621 return m_local_value;
623 void setValue(DataType v) { m_local_value = v; }
627 mutable DataType m_local_value = {};
633#if defined(ARCANE_COMPILING_SYCL)
647template <
typename DataType>
649:
public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
651 using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
652 using BaseClass::m_local_value;
662 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
665 return m_local_value;
668 ARCCORE_HOST_DEVICE DataType add(DataType v)
const
679template <
typename DataType>
681:
public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
683 using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
684 using BaseClass::m_local_value;
694 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
696 m_local_value = v > m_local_value ? v : m_local_value;
697 return m_local_value;
700 ARCCORE_HOST_DEVICE DataType max(DataType v)
const
714template <
typename DataType>
716:
public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
718 using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
719 using BaseClass::m_local_value;
729 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
731 m_local_value = v < m_local_value ? v : m_local_value;
732 return m_local_value;
735 ARCCORE_HOST_DEVICE DataType min(DataType v)
const
749template <
typename DataType>
751:
public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
753 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;
763 ARCCORE_HOST_DEVICE
void combine(DataType v)
765 this->m_local_value += v;
774template <
typename DataType>
776:
public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
778 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;
788 ARCCORE_HOST_DEVICE
void combine(DataType v)
790 DataType& lv = this->m_local_value;
791 lv = v > lv ? v : lv;
800template <
typename DataType>
802:
public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
804 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;
814 ARCCORE_HOST_DEVICE
void combine(DataType v)
816 DataType& lv = this->m_local_value;
817 lv = v < lv ? v : lv;
833#define ARCANE_INLINE_REDUCE_IMPL
835#ifdef ARCANE_INLINE_REDUCE_IMPL
837# ifndef ARCANE_INLINE_REDUCE
838# define ARCANE_INLINE_REDUCE inline
841#if defined(__CUDACC__) || defined(__HIP__)
842# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
857#include "arcane/accelerator/GenericReducer.h"
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et macros pour gérer les boucles sur les accélérateurs.
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
DataType _reduce()
Effectue la réduction et récupère la valeur. ATTENTION: ne faire qu'une seule fois.
Version 1 de la réduction.
Gestion d'une commande sur accélérateur.
Implémentation de la réduction pour le backend SYCL.
Interface de la gestion mémoire pour les réductions.
Classe pour appliquer la finalisation pour les arguments supplémentaires.
Informations pour effectuer une réduction sur un device.
DataType * m_device_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
unsigned int * m_device_count
void * m_host_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
DataType m_current_value
Valeur du thread courant à réduire.
Int32 m_warp_size
Taille d'un warp.
SmallSpan< DataType > m_grid_buffer
Tableau avec une valeur par bloc pour la réduction.
DataType m_identity
Valeur de l'identité pour la réduction.
constexpr SpanType bytes() const
Vue sous forme d'octets.
Vue d'un tableau d'éléments de type T.
Classe pour appliquer la finalisation des réductions.
Espace de nom pour l'utilisation des accélérateurs.
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.
Informations mémoire pour la réduction sur les accélérateurs.
void * m_host_memory_for_reduced_value
Pointeur vers la mémoire sur l'hôte contenant la valeur réduite.
MutableMemoryView m_grid_memory_values
Mémoire allouée pour la réduction sur une grille (de taille nb_bloc * sizeof(T))
Int32 m_warp_size
Taille d'un warp.
unsigned int * m_grid_device_count
Entier utilisé pour compter le nombre de blocs ayant déjà fait leur partie de la réduction.