#ifndef ARCANE_ACCELERATOR_REDUCE_H
#define ARCANE_ACCELERATOR_REDUCE_H

#include "arcane/utils/ArrayView.h"
#include "arcane/utils/String.h"

#include "arcane/accelerator/core/IReduceMemoryImpl.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/CommonUtils.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"

#include <atomic> // std::atomic_ref
#include <cfloat> // DBL_MAX
#include <cstdint> // INT32_MAX, INT64_MAX

namespace Arcane::Accelerator::impl
{
class KernelReducerHelper;

//! Returns (or creates) the reduction memory manager associated with \a command.
ARCANE_ACCELERATOR_EXPORT IReduceMemoryImpl*
internalGetOrCreateReduceMemoryImpl(RunCommand* command);

/*!
 * \brief Identity (neutral) values for the reductions, per data type.
 */
template <typename DataType>
class ReduceIdentity;

template <>
class ReduceIdentity<double>
{
 public:
  ARCCORE_HOST_DEVICE static constexpr double sumValue() { return 0.0; }
  ARCCORE_HOST_DEVICE static constexpr double minValue() { return DBL_MAX; }
  ARCCORE_HOST_DEVICE static constexpr double maxValue() { return -DBL_MAX; }
};

template <>
class ReduceIdentity<Int32>
{
 public:
  ARCCORE_HOST_DEVICE static constexpr Int32 sumValue() { return 0; }
  ARCCORE_HOST_DEVICE static constexpr Int32 minValue() { return INT32_MAX; }
  ARCCORE_HOST_DEVICE static constexpr Int32 maxValue() { return -INT32_MAX; }
};

template <>
class ReduceIdentity<Int64>
{
 public:
  ARCCORE_HOST_DEVICE static constexpr Int64 sumValue() { return 0; }
  ARCCORE_HOST_DEVICE static constexpr Int64 minValue() { return INT64_MAX; }
  ARCCORE_HOST_DEVICE static constexpr Int64 maxValue() { return -INT64_MAX; }
};
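
// These are the reduction identities: each reducer instance is seeded with
// the identity so that merging any number of partial results leaves the
// final value unchanged (for example, a max reduction starts from -DBL_MAX,
// so any value actually encountered replaces it).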

/*!
 * \brief Per-kernel information used to apply a reduction on the device.
 *
 * The member list below is reconstructed from the documentation of the
 * original file; layout details may differ.
 */
template <typename DataType>
class ReduceDeviceInfo
{
 public:
  //! Buffer with one value per block for the grid reduction
  SmallSpan<DataType> m_grid_buffer;
  //! Counter of the blocks that have already done their part of the reduction
  unsigned int* m_device_count = nullptr;
  //! Pointer to the reduced value (memory accessible only from the device)
  DataType* m_device_final_ptr = nullptr;
  //! Pointer to the reduced value (memory accessible only from the host)
  void* m_host_final_ptr = nullptr;
  //! Identity value for the reduction
  DataType m_identity = {};
  //! Value of the current thread to reduce
  DataType m_current_value = {};
  //! True when using the grid reduction (otherwise atomics are used)
  bool m_use_grid_reduce = true;
};

/*!
 * \brief Atomic sum, specialized per data type.
 */
template <typename DataType>
class ReduceAtomicSum;

template <>
class ReduceAtomicSum<double>
{
 public:
  static double apply(double* vptr, double v)
  {
    // Atomic add through a CAS loop: compare_exchange_weak() refreshes
    // 'old' on failure, so the sum is recomputed until the exchange
    // succeeds.
    std::atomic_ref<double> aref(*vptr);
    double old = aref.load(std::memory_order_consume);
    double wanted = old + v;
    while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
      wanted = old + v;
    return wanted;
  }
};

template <>
class ReduceAtomicSum<Int64>
{
 public:
  static Int64 apply(Int64* vptr, Int64 v)
  {
    std::atomic_ref<Int64> aref(*vptr);
    Int64 x = aref.fetch_add(v);
    return x + v;
  }
};

template <>
class ReduceAtomicSum<Int32>
{
 public:
  static Int32 apply(Int32* vptr, Int32 v)
  {
    std::atomic_ref<Int32> aref(*vptr);
    Int32 x = aref.fetch_add(v);
    return x + v;
  }
};
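
/*
 * Integral types can use fetch_add() directly, as the Int32/Int64
 * specializations above do; for 'double' the CAS loop is needed. A minimal
 * standalone sketch of the same pattern (illustrative only; the helper name
 * is hypothetical and not part of the Arcane API):
 *
 * \code
 * #include <atomic>
 *
 * double atomic_add_double(double* location, double increment)
 * {
 *   std::atomic_ref<double> ref(*location); // C++20
 *   double observed = ref.load();
 *   double desired = observed + increment;
 *   // On failure 'observed' is reloaded, so the sum is recomputed until
 *   // no other thread raced us between the load and the exchange.
 *   while (!ref.compare_exchange_weak(observed, desired))
 *     desired = observed + increment;
 *   return desired;
 * }
 * \endcode
 */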

/*!
 * \brief Functor for a sum reduction.
 */
template <typename DataType>
class ReduceFunctorSum
{
 public:
  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    return ReduceAtomicSum<DataType>::apply(vptr, v);
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::plus<DataType> syclFunctor() { return {}; }
#endif
  ARCCORE_HOST_DEVICE static constexpr DataType identity() { return ReduceIdentity<DataType>::sumValue(); }

 private:
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};

/*!
 * \brief Functor for a max reduction.
 */
template <typename DataType>
class ReduceFunctorMax
{
 public:
  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* ptr, DataType v)
  {
    // CAS loop: stores 'v' only while it is strictly greater than the
    // current value; exits as soon as the stored value already wins.
    std::atomic_ref<DataType> aref(*ptr);
    DataType prev_value = aref.load();
    while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
    }
    return aref.load();
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::maximum<DataType> syclFunctor() { return {}; }
#endif
  ARCCORE_HOST_DEVICE static constexpr DataType identity() { return ReduceIdentity<DataType>::maxValue(); }

 private:
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};

/*!
 * \brief Functor for a min reduction.
 */
template <typename DataType>
class ReduceFunctorMin
{
 public:
  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return *(dev_info.m_device_final_ptr);
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    // CAS loop: stores 'v' only while it is strictly smaller than the
    // current value.
    std::atomic_ref<DataType> aref(*vptr);
    DataType prev_value = aref.load();
    while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
    }
    return aref.load();
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::minimum<DataType> syclFunctor() { return {}; }
#endif
  ARCCORE_HOST_DEVICE static constexpr DataType identity() { return ReduceIdentity<DataType>::minValue(); }

 private:
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
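
/*
 * The min/max 'apply' methods above rely on compare_exchange_weak()
 * refreshing its expected argument on failure: the loop terminates either
 * because the stored value already beats the candidate or because the store
 * succeeded. A standalone sketch of the max variant (the helper name is
 * hypothetical, not part of the Arcane API):
 *
 * \code
 * #include <atomic>
 *
 * template <typename T>
 * T atomic_store_max(T* location, T candidate)
 * {
 *   std::atomic_ref<T> ref(*location);
 *   T observed = ref.load();
 *   while (observed < candidate && !ref.compare_exchange_weak(observed, candidate)) {
 *     // 'observed' now holds the freshly loaded value; retry or stop.
 *   }
 *   return ref.load();
 * }
 * \endcode
 */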

} // namespace Arcane::Accelerator::impl

namespace Arcane::impl
{
class HostReducerHelper;
}

namespace Arcane::Accelerator
{

/*!
 * \brief Common implementation of a reduction usable both on the host and
 * on a device.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducerBase
{
 public:
  HostDeviceReducerBase(RunCommand& command)
  : m_command(&command)
  {
    m_is_master_instance = true;
    m_identity = ReduceFunctor::identity();
    m_local_value = m_identity;
    m_atomic_value = m_identity;
    m_atomic_parent_value = &m_atomic_value;
    m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
    if (m_memory_impl)
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
  }

#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
  // With the Intel SYCL compiler in device code, the defaulted copy
  // constructor is used.
  HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;
#else
  ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)
  : m_command(rhs.m_command)
  , m_local_value(rhs.m_local_value)
  , m_identity(rhs.m_identity)
  {
#ifdef ARCCORE_DEVICE_CODE
    m_grid_memory_info = rhs.m_grid_memory_info;
#else
    m_memory_impl = rhs.m_memory_impl;
    if (m_memory_impl)
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
    m_atomic_parent_value = rhs.m_atomic_parent_value;
    m_local_value = rhs.m_identity;
    m_atomic_value = m_identity;
#endif
  }
#endif
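
  // Copy semantics drive the reduction: copies made on the host start from
  // the identity and keep a pointer to the parent's atomic value, into
  // which they fold their local value when finalized (see the host branch
  // of _finalize() below), while copies made in device code only carry the
  // grid memory information used by the device-side reduction.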

  ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;
  HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;

 public:
  ARCCORE_HOST_DEVICE void setValue(DataType v) { m_local_value = v; }
  ARCCORE_HOST_DEVICE DataType localValue() const { return m_local_value; }

 protected:
  RunCommand* m_command = nullptr;
  impl::IReduceMemoryImpl* m_memory_impl = nullptr;
  impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;
  mutable DataType m_local_value;
  DataType* m_atomic_parent_value = nullptr;
  mutable DataType m_atomic_value;
  DataType m_identity;
  bool m_is_master_instance = false;

 protected:
  //! Performs the reduction and returns the value. WARNING: call only once.
  DataType _reduce()
  {
    if (!m_is_master_instance)
      ARCANE_FATAL("Final reduce operation is only valid on master instance");
    DataType* final_ptr = nullptr;
    if (m_memory_impl) {
      m_memory_impl->copyReduceValueFromDevice();
      final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);
      m_memory_impl->release();
      m_memory_impl = nullptr;
    }
    // ... (path without a device memory implementation elided in the
    // captured source)
    if (m_atomic_parent_value) {
      // Fold the host-side atomic value into the device result.
      ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
      *final_ptr = *m_atomic_parent_value;
    }
    return *final_ptr;
  }

  //! Finalizes the contribution of this instance (method name reconstructed;
  //! the original name was lost in the captured source).
  ARCCORE_HOST_DEVICE void
  _finalize()
  {
#ifdef ARCCORE_DEVICE_CODE
    // Device code: delegate to the functor's device-side reduction.
    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));
    impl::ReduceDeviceInfo<DataType> dvi;
    dvi.m_grid_buffer = grid_buffer;
    // ... (population of the remaining fields of 'dvi' elided)
    ReduceFunctor::applyDevice(dvi);
#else
    // Host code: non-master copies fold their local value into the parent's
    // atomic value.
    if (!m_is_master_instance)
      ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
#endif
  }
};

/*!
 * \brief Version 1 of the reduction.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
 public:
  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;

 public:
  explicit HostDeviceReducer(RunCommand& command)
  : BaseClass(command)
  {}
  HostDeviceReducer(const HostDeviceReducer& rhs) = default;
  ARCCORE_HOST_DEVICE ~HostDeviceReducer()
  {
    // ... (finalization of this copy's contribution; body elided in the
    // captured source)
  }

 public:
  DataType reducedValue()
  {
    return this->_reduce();
  }
};
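
// In this version-1 reducer the reduction step is driven by destruction:
// every copy of the lambda folds its contribution back when it goes out of
// scope, and reducedValue() then performs the final merge on the master
// instance (it must be called only once, see _reduce()).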

/*!
 * \brief Version 2 of the reduction.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer2
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
  friend ::Arcane::impl::HostReducerHelper;
  friend impl::KernelReducerHelper;

 public:
  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
  using BaseClass::m_grid_memory_info;
  using BaseClass::m_local_value;
  // ... (other using-declarations elided in the captured source)

 public:
  explicit HostDeviceReducer2(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  DataType reducedValue()
  {
    return this->_reduce();
  }

 private:
  void _internalReduceHost()
  {
    // ... (body elided in the captured source)
  }

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
  ARCCORE_HOST_DEVICE void _internalExecWorkItem(Int32)
  {
    // ... (body elided in the captured source)
  }
#endif

#if defined(ARCANE_COMPILING_SYCL)
  void _internalExecWorkItem(sycl::nd_item<1> id)
  {
    unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
    const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
    const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
    const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));

    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));

    // Each work-group reduces its values, then the first work-item of the
    // group publishes the partial result into the grid buffer.
    DataType v = m_local_value;
    bool is_last = false;
    auto sycl_functor = ReduceFunctor::syclFunctor();
    DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);
    if (local_id == 0) {
      grid_buffer[group_id] = local_sum;
      // ... (memory-ordering details elided in the captured source)
      // Count the groups that have published their partial value; with
      // AdaptiveCpp the atomic goes through a signed 'int'.
#if defined(__ADAPTIVECPP__)
      int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);
      sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
#else
      sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
#endif
      Int32 cx = a.fetch_add(1);
      if (cx == (nb_block - 1))
        is_last = true;
    }

    // The last group to publish combines all the partial results.
    if (is_last) {
      DataType my_total = grid_buffer[0];
      for (int x = 1; x < nb_block; ++x)
        my_total = sycl_functor(my_total, grid_buffer[x]);
      grid_buffer[0] = my_total;
      // Reset the counter for the next kernel launch.
      *atomic_counter_ptr = 0;
    }
  }
#endif
};
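
/*
 * The SYCL work-item code above uses a common two-phase pattern: reduce
 * within each work-group, publish one partial value per group, and let the
 * last group to increment the shared counter combine the partials. A
 * standalone C++ sketch of the counter handshake, with one thread standing
 * in for one work-group (hypothetical names, not part of the Arcane API):
 *
 * \code
 * #include <atomic>
 * #include <vector>
 *
 * // Called once per "group"; returns true for the caller that finalized.
 * bool publish_and_maybe_finalize(std::vector<double>& partials, int group_id,
 *                                 double my_partial, unsigned int* counter,
 *                                 double* result)
 * {
 *   int nb_group = static_cast<int>(partials.size());
 *   partials[group_id] = my_partial;
 *   std::atomic_ref<unsigned int> a(*counter);
 *   // The acq_rel read-modify-write makes every published partial visible
 *   // to the last arriving caller.
 *   unsigned int arrived = a.fetch_add(1, std::memory_order_acq_rel);
 *   if (arrived != static_cast<unsigned int>(nb_group) - 1)
 *     return false; // not the last one: nothing more to do
 *   double total = partials[0];
 *   for (int i = 1; i < nb_group; ++i)
 *     total += partials[i];
 *   *result = total;
 *   *counter = 0; // reset for the next round
 *   return true;
 * }
 * \endcode
 */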

/*!
 * \brief Reduction implementation for the SYCL backend. (Class name and
 * several members reconstructed; details were lost in the captured source.)
 */
template <typename DataType, typename ReduceFunctor>
class SyclReducer
{
 public:
  explicit SyclReducer(RunCommand&) {}
  // ... (reduction entry points elided in the captured source)
  DataType localValue() const { return m_local_value; }
  void setValue(DataType v) { m_local_value = v; }

 protected:
  mutable DataType m_local_value = {};
};

#if defined(ARCANE_COMPILING_SYCL)
template <typename DataType, typename ReduceFunctor>
using Reducer = SyclReducer<DataType, ReduceFunctor>;
#else
template <typename DataType, typename ReduceFunctor>
using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
#endif

/*!
 * \brief Class to perform a sum reduction (version 1).
 */
template <typename DataType>
class ReducerSum
: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
{
  using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
  using BaseClass::m_local_value;

 public:
  explicit ReducerSum(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value += v;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType add(DataType v) const
  {
    return combine(v);
  }
};
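
/*
 * Typical use of a version-1 reducer (a sketch; 'queue' is an existing
 * RunQueue and 'values' an accessible view, both assumed for the example):
 *
 * \code
 * using namespace Arcane::Accelerator;
 *
 * auto command = makeCommand(queue);
 * ReducerSum<double> total(command);
 * command << RUNCOMMAND_LOOP1(iter, nb_value)
 * {
 *   auto [i] = iter();
 *   total.combine(values[i]); // each copy accumulates into its local value
 * };
 * double s = total.reducedValue(); // final merge; call only once
 * \endcode
 */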

/*!
 * \brief Class to perform a max reduction (version 1).
 */
template <typename DataType>
class ReducerMax
: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
{
  using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
  using BaseClass::m_local_value;

 public:
  explicit ReducerMax(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v > m_local_value ? v : m_local_value;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType max(DataType v) const
  {
    return combine(v);
  }
};

/*!
 * \brief Class to perform a min reduction (version 1).
 */
template <typename DataType>
class ReducerMin
: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
{
  using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
  using BaseClass::m_local_value;

 public:
  explicit ReducerMin(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v < m_local_value ? v : m_local_value;
    return m_local_value;
  }
  ARCCORE_HOST_DEVICE DataType min(DataType v) const
  {
    return combine(v);
  }
};

/*!
 * \brief Class to perform a sum reduction (version 2).
 */
template <typename DataType>
class ReducerSum2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
{
  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;

 public:
  explicit ReducerSum2(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    this->m_local_value += v;
  }
};
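
/*
 * With the version-2 reducers the reducer object is also passed as an extra
 * argument to the RUNCOMMAND macro so the runtime can register it (a sketch,
 * assuming a RunQueue 'queue' and a view 'values'):
 *
 * \code
 * using namespace Arcane::Accelerator;
 *
 * auto command = makeCommand(queue);
 * ReducerSum2<double> total(command);
 * command << RUNCOMMAND_LOOP1(iter, nb_value, total)
 * {
 *   auto [i] = iter();
 *   total.combine(values[i]);
 * };
 * double s = total.reducedValue();
 * \endcode
 */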

/*!
 * \brief Class to perform a max reduction (version 2).
 */
template <typename DataType>
class ReducerMax2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
{
  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;

 public:
  explicit ReducerMax2(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v > lv ? v : lv;
  }
};

/*!
 * \brief Class to perform a min reduction (version 2).
 */
template <typename DataType>
class ReducerMin2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
{
  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;

 public:
  explicit ReducerMin2(RunCommand& command)
  : BaseClass(command)
  {}

 public:
  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v < lv ? v : lv;
  }
};
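
// ReducerMax2 and ReducerMin2 follow the same pattern as ReducerSum2: pass
// the reducer as an extra RUNCOMMAND argument, call combine() in the loop
// body, and read reducedValue() once afterwards. Several reducers (for
// example a min and a max) can be registered on the same command.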

} // namespace Arcane::Accelerator

#define ARCANE_INLINE_REDUCE_IMPL

#ifdef ARCANE_INLINE_REDUCE_IMPL

# ifndef ARCANE_INLINE_REDUCE
# define ARCANE_INLINE_REDUCE inline
# endif

#if defined(__CUDACC__) || defined(__HIP__)
# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
#endif

#endif

#include "arcane/accelerator/GenericReducer.h"

#endif // ARCANE_ACCELERATOR_REDUCE_H