12#ifndef ARCANE_ACCELERATOR_REDUCE_H
13#define ARCANE_ACCELERATOR_REDUCE_H
17#include "arcane/utils/ArrayView.h"
18#include "arcane/utils/String.h"
20#include "arcane/accelerator/core/IReduceMemoryImpl.h"
21#include "arcane/accelerator/AcceleratorGlobal.h"
22#include "arcane/accelerator/CommonUtils.h"
23#include "arcane/accelerator/RunCommandLaunchInfo.h"
// NOTE(review): this excerpt is a rendered-source extraction; the numerals
// fused to some lines ("34", "41", ...) are display line numbers, not code,
// and several intermediate source lines are missing from view.
34namespace Arcane::Accelerator::Impl
// Forward declaration of the host-side reduction helper.
41class HostReducerHelper;
44namespace Arcane::Accelerator::impl
// Forward declaration of the kernel-side reduction helper.
46class KernelReducerHelper;
// Gets (or lazily creates) the reduction-memory implementation associated
// with 'command'; the return-type line is not visible in this excerpt.
52internalGetOrCreateReduceMemoryImpl(RunCommand* command);
// Template header whose associated declaration lies outside this excerpt.
54template <
typename DataType>
62 ARCCORE_HOST_DEVICE
static constexpr double sumValue() {
return 0.0; }
63 ARCCORE_HOST_DEVICE
static constexpr double minValue() {
return DBL_MAX; }
64 ARCCORE_HOST_DEVICE
static constexpr double maxValue() {
return -DBL_MAX; }
71 ARCCORE_HOST_DEVICE
static constexpr Int32 sumValue() {
return 0; }
72 ARCCORE_HOST_DEVICE
static constexpr Int32 minValue() {
return INT32_MAX; }
73 ARCCORE_HOST_DEVICE
static constexpr Int32 maxValue() {
return -INT32_MAX; }
80 ARCCORE_HOST_DEVICE
static constexpr Int64 sumValue() {
return 0; }
81 ARCCORE_HOST_DEVICE
static constexpr Int64 minValue() {
return INT64_MAX; }
82 ARCCORE_HOST_DEVICE
static constexpr Int64 maxValue() {
return -INT64_MAX; }
// Two template headers whose associated declarations are not visible here.
95template <
typename DataType>
123template <
typename DataType>
// Host-side atomic '+=' on a double implemented with a CAS loop on
// std::atomic_ref (presumably because floating-point fetch_add support is
// uneven across targeted toolchains — TODO confirm).
131 static double apply(
double* vptr,
double v)
133 std::atomic_ref<double> aref(*vptr);
// NOTE(review): std::memory_order_consume is deprecated in practice and
// typically promoted to acquire — confirm against the full source.
134 double old = aref.load(std::memory_order_consume);
135 double wanted = old + v;
// compare_exchange_weak refreshes 'old' on failure; the loop body (on a
// line not visible here) presumably recomputes 'wanted = old + v' — confirm.
136 while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
// Integer sums use the native atomic fetch_add (fragments of the Int64 and
// Int32 overloads; enclosing signatures are not visible here).
148 std::atomic_ref<Int64> aref(*vptr);
149 Int64 x = aref.fetch_add(v);
160 std::atomic_ref<Int32> aref(*vptr);
161 Int32 x = aref.fetch_add(v);
// Sum reduction functor: the device path delegates to _applyDevice, the
// host path (body not fully visible here) performs the atomic apply above.
169template <
typename DataType>
174 static ARCCORE_DEVICE DataType
177 _applyDevice(dev_info);
180 static DataType apply(DataType* vptr, DataType v)
// Under SYCL, the native sycl::plus functor is handed to reduce_over_group.
184#if defined(ARCANE_COMPILING_SYCL)
185 static sycl::plus<DataType> syclFunctor() {
return {}; }
// Max reduction functor.
200template <
typename DataType>
205 static ARCCORE_DEVICE DataType
208 _applyDevice(dev_info);
// Host-side atomic max via a CAS loop: compare_exchange_weak refreshes
// 'prev_value' on failure, so the '<' guard is re-evaluated each iteration
// and the loop exits as soon as the stored value is already >= v.
211 static DataType apply(DataType* ptr, DataType v)
213 std::atomic_ref<DataType> aref(*ptr);
214 DataType prev_value = aref.load();
215 while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
// Under SYCL, the native sycl::maximum functor drives the group reduction.
219#if defined(ARCANE_COMPILING_SYCL)
220 static sycl::maximum<DataType> syclFunctor() {
return {}; }
// Min reduction functor (mirror of the max functor above).
235template <
typename DataType>
240 static ARCCORE_DEVICE DataType
243 _applyDevice(dev_info);
// Host-side atomic min via a CAS loop; exits once the stored value is <= v.
246 static DataType apply(DataType* vptr, DataType v)
248 std::atomic_ref<DataType> aref(*vptr);
249 DataType prev_value = aref.load();
250 while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
// Under SYCL, the native sycl::minimum functor drives the group reduction.
254#if defined(ARCANE_COMPILING_SYCL)
255 static sycl::minimum<DataType> syclFunctor() {
return {}; }
// Base class shared by the v1 and v2 host/device reducers.
301template <
typename DataType,
typename ReduceFunctor>
302class HostDeviceReducerBase
// Master-instance constructor: records the command, seeds the local and
// atomic slots with the functor's identity, and acquires the grid memory
// used for the device-side reduction.
308 , m_command(&command)
311 m_is_master_instance =
true;
312 m_identity = ReduceFunctor::identity();
313 m_local_value = m_identity;
314 m_atomic_value = m_identity;
// The master owns the atomic slot; copies point back at it (see copy ctor).
315 m_atomic_parent_value = &m_atomic_value;
317 m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
320 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
// Intel SYCL device compilation requires a trivially copyable functor, so
// the defaulted copy constructor is used there...
326#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
327 HostDeviceReducerBase(
const HostDeviceReducerBase& rhs) =
default;
// ...otherwise each copy re-seeds its local value with the identity and
// chains its result into the master instance's atomic slot.
329 ARCCORE_HOST_DEVICE HostDeviceReducerBase(
const HostDeviceReducerBase& rhs)
331 , m_local_value(rhs.m_local_value)
332 , m_identity(rhs.m_identity)
334#ifdef ARCCORE_DEVICE_CODE
335 m_grid_memory_info = rhs.m_grid_memory_info;
340 m_memory_impl = rhs.m_memory_impl;
342 m_grid_memory_info = m_memory_impl->gridMemoryInfo();
345 m_atomic_parent_value = rhs.m_atomic_parent_value;
// NOTE(review): m_local_value is copy-initialized from rhs above (line 331)
// and then reset to the identity here — confirm the double write is intended.
346 m_local_value = rhs.m_identity;
347 m_atomic_value = m_identity;
// Move construction and copy assignment are forbidden: instances chain to
// the master by address, so relocation would break the protocol.
356 ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) =
delete;
357 HostDeviceReducerBase& operator=(
const HostDeviceReducerBase& rhs) =
delete;
// Sets / reads the value local to this instance (one per thread/work-item).
361 ARCCORE_HOST_DEVICE
void setValue(DataType v)
365 ARCCORE_HOST_DEVICE DataType localValue()
const
367 return m_local_value;
// Per-instance partial value ('mutable' so const instances can accumulate).
388 mutable DataType m_local_value;
// Points to the master instance's m_atomic_value.
389 DataType* m_atomic_parent_value =
nullptr;
390 mutable DataType m_atomic_value;
// True only for the instance constructed directly from the RunCommand.
396 bool m_is_master_instance =
false;
// Final reduction: only legal on the master instance.
403 if (!m_is_master_instance)
404 ARCANE_FATAL(
"Final reduce operation is only valid on master instance");
// Copy the device-side result back to host memory, then release the
// reduction buffers back to the pool.
408 m_memory_impl->copyReduceValueFromDevice();
409 final_ptr =
reinterpret_cast<DataType*
>(m_grid_memory_info.m_host_memory_for_reduced_value);
410 m_memory_impl->release();
411 m_memory_impl =
nullptr;
// Merge host-side contributions accumulated through the atomic slot into
// the final value.
414 if (m_atomic_parent_value) {
418 ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
419 *final_ptr = *m_atomic_parent_value;
// Device path: stage this work-item's partial value into the grid buffer
// and let the functor's device implementation combine across blocks.
431 ARCCORE_HOST_DEVICE
void
434#ifdef ARCCORE_DEVICE_CODE
439 DataType* buf =
reinterpret_cast<DataType*
>(buf_span.data());
450 ReduceFunctor::applyDevice(dvi);
// Host path: non-master instances fold their local value into the master.
458 if (!m_is_master_instance)
459 ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
// Reducer, version 1 of the API: the final combine is tied to the object's
// lifetime (destructor) and the result is fetched via reducedValue().
474template <
typename DataType,
typename ReduceFunctor>
475class HostDeviceReducer
476:
public HostDeviceReducerBase<DataType, ReduceFunctor>
480 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
484 explicit HostDeviceReducer(
RunCommand& command)
487 HostDeviceReducer(
const HostDeviceReducer& rhs) =
default;
488 ARCCORE_HOST_DEVICE ~HostDeviceReducer()
500 DataType reducedValue()
// Reducer, version 2 of the API: the kernel-launch machinery
// (KernelReducerHelper) drives the final combine instead of the destructor.
511template <
typename DataType,
typename ReduceFunctor>
512class HostDeviceReducer2
513:
public HostDeviceReducerBase<DataType, ReduceFunctor>
519 using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
520 using BaseClass::m_grid_memory_info;
522 using BaseClass::m_local_value;
528 explicit HostDeviceReducer2(
RunCommand& command)
534 DataType reducedValue()
// SYCL back-end: per-work-group reduction followed by a cross-group combine
// performed by the last group to finish (counter-based protocol).
542#if defined(ARCANE_COMPILING_SYCL)
543 void _internalExecWorkItemAtEnd(sycl::nd_item<1>
id)
546 const Int32 local_id =
static_cast<Int32>(
id.get_local_id(0));
547 const Int32 group_id =
static_cast<Int32>(
id.get_group_linear_id());
548 const Int32 nb_block =
static_cast<Int32>(
id.get_group_range(0));
// One slot per work-group in the grid buffer.
551 DataType* buf =
reinterpret_cast<DataType*
>(buf_span.data());
554 DataType v = m_local_value;
555 bool is_last =
false;
// Intra-group reduction using the functor's native SYCL operation.
556 auto sycl_functor = ReduceFunctor::syclFunctor();
557 DataType local_sum = sycl::reduce_over_group(
id.get_group(), v, sycl_functor);
559 grid_buffer[group_id] = local_sum;
// NOTE(review): the AdaptiveCpp path reinterprets the unsigned counter as
// int — presumably a workaround for its atomic_ref support; confirm.
568#if defined(__ADAPTIVECPP__)
569 int* atomic_counter_ptr_as_int =
reinterpret_cast<int*
>(atomic_counter_ptr);
570 sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
572 sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
// The group that bumps the counter to nb_block-1 is the last to finish and
// serially combines the per-group slots.
574 Int32 cx = a.fetch_add(1);
575 if (cx == (nb_block - 1))
582 DataType my_total = grid_buffer[0];
583 for (
int x = 1; x < nb_block; ++x)
584 my_total = sycl_functor(my_total, grid_buffer[x]);
// Final value lands in slot 0; the counter is reset for the next launch.
586 grid_buffer[0] = my_total;
588 *atomic_counter_ptr = 0;
// Reducer (v1): host-side wrapper keeping only a local value.
602template <
typename DataType,
typename ReduceFunctor>
613 return m_local_value;
615 void setValue(DataType v) { m_local_value = v; }
// Value-initialized partial result; 'mutable' allows const combine paths.
619 mutable DataType m_local_value = {};
625#if defined(ARCANE_COMPILING_SYCL)
// Sum reducer (v1): combine()/add() accumulate into m_local_value (the
// '+=' statement itself is on a line not visible in this excerpt).
639template <
typename DataType>
641:
public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
643 using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
644 using BaseClass::m_local_value;
654 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
657 return m_local_value;
660 ARCCORE_HOST_DEVICE DataType add(DataType v)
const
// Max reducer (v1).
671template <
typename DataType>
673:
public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
675 using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
676 using BaseClass::m_local_value;
// Keeps the running maximum and returns the value after combining.
686 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
688 m_local_value = v > m_local_value ? v : m_local_value;
689 return m_local_value;
// Legacy alias of combine() (body not visible in this excerpt).
692 ARCCORE_HOST_DEVICE DataType max(DataType v)
const
// Min reducer (v1).
706template <
typename DataType>
708:
public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
710 using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
711 using BaseClass::m_local_value;
// Keeps the running minimum and returns the value after combining.
721 ARCCORE_HOST_DEVICE DataType combine(DataType v)
const
723 m_local_value = v < m_local_value ? v : m_local_value;
724 return m_local_value;
// Legacy alias of combine() (body not visible in this excerpt).
727 ARCCORE_HOST_DEVICE DataType min(DataType v)
const
// Sum reducer (v2): combine() folds into the inherited local value.
741template <
typename DataType>
743:
public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
745 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;
755 ARCCORE_HOST_DEVICE
void combine(DataType v)
757 this->m_local_value += v;
// Max reducer (v2).
766template <
typename DataType>
768:
public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
770 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;
780 ARCCORE_HOST_DEVICE
void combine(DataType v)
782 DataType& lv = this->m_local_value;
783 lv = v > lv ? v : lv;
// Min reducer (v2).
792template <
typename DataType>
794:
public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
796 using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;
806 ARCCORE_HOST_DEVICE
void combine(DataType v)
808 DataType& lv = this->m_local_value;
809 lv = v < lv ? v : lv;
// Helper invoked by the kernel-launch machinery at the start and end of a
// kernel to initialize/finalize HostDeviceReducer2 arguments (mostly only
// the template headers survive in this excerpt).
823 template <
typename DataType,
typename ReduceFunctor>
828 template <
typename DataType,
typename ReduceFunctor>
835 template <
typename DataType,
typename ReduceFunctor>
836 static ARCCORE_DEVICE
void
841 template <
typename DataType,
typename ReduceFunctor>
842 static ARCCORE_DEVICE
void
// SYCL variants delegate to the reducer's per-work-item end hook.
848#if defined(ARCANE_COMPILING_SYCL)
849 template <
typename DataType,
typename ReduceFunctor>
854 template <
typename DataType,
typename ReduceFunctor>
858 reducer._internalExecWorkItemAtEnd(
id);
875#define ARCANE_INLINE_REDUCE_IMPL
877#ifdef ARCANE_INLINE_REDUCE_IMPL
879# ifndef ARCANE_INLINE_REDUCE
880# define ARCANE_INLINE_REDUCE inline
883#if defined(__CUDACC__) || defined(__HIP__)
884# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
899#include "arcane/accelerator/GenericReducer.h"
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Types et macros pour gérer les boucles sur les accélérateurs.
Version 2 de la réduction.
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
DataType _reduce()
Effectue la réduction et récupère la valeur. ATTENTION: ne faire qu'une seule fois.
Version 1 de la réduction.
Classe pour gérer les arguments de type HostDeviceReducer2 en début et fin d'exécution des noyaux.
Gestion d'une commande sur accélérateur.
Implémentation de la réduction pour le backend SYCL.
Interface de la gestion mémoire pour les réductions.
Informations pour effectuer une réduction sur un device.
DataType * m_device_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
unsigned int * m_device_count
void * m_host_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
DataType m_current_value
Valeur du thread courant à réduire.
Int32 m_warp_size
Taille d'un warp.
SmallSpan< DataType > m_grid_buffer
Tableau avec une valeur par bloc pour la réduction.
DataType m_identity
Valeur de l'identité pour la réduction.
constexpr SpanType bytes() const
Vue sous forme d'octets.
Vue d'un tableau d'éléments de type T.
Espace de nom pour l'utilisation des accélérateurs.
std::int64_t Int64
Type entier signé sur 64 bits.
std::int32_t Int32
Type entier signé sur 32 bits.
Informations mémoire pour la réduction sur les accélérateurs.
void * m_host_memory_for_reduced_value
Pointeur vers la mémoire sur l'hôte contenant la valeur réduite.
MutableMemoryView m_grid_memory_values
Mémoire allouée pour la réduction sur une grille (de taille nb_bloc * sizeof(T))
Int32 m_warp_size
Taille d'un warp.
unsigned int * m_grid_device_count
Entier utilisé pour compter le nombre de blocs ayant déjà fait leur partie de la réduction.