#ifndef ARCANE_ACCELERATOR_REDUCE_H
#define ARCANE_ACCELERATOR_REDUCE_H

#include "arcane/utils/ArrayView.h"
#include "arcane/utils/String.h"

#include "arcane/accelerator/core/IReduceMemoryImpl.h"
#include "arcane/accelerator/AcceleratorGlobal.h"
#include "arcane/accelerator/CommonUtils.h"
#include "arcane/accelerator/RunCommandLaunchInfo.h"
namespace Arcane::Accelerator::impl
{

class KernelReducerHelper;

// Creates or retrieves the reduction memory manager for 'command'.
// (The return type is elided in this extract; IReduceMemoryImpl* is assumed
// from the later use 'm_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(...)'.)
IReduceMemoryImpl*
internalGetOrCreateReduceMemoryImpl(RunCommand* command);
// Identity (neutral) element of each reduction operation, per data type.
// (The primary template name is elided in this extract; 'ReduceIdentity' is assumed.)
template <typename DataType>
class ReduceIdentity;

template <>
class ReduceIdentity<double>
{
 public:

  ARCCORE_HOST_DEVICE static constexpr double sumValue() { return 0.0; }
  ARCCORE_HOST_DEVICE static constexpr double minValue() { return DBL_MAX; }
  ARCCORE_HOST_DEVICE static constexpr double maxValue() { return -DBL_MAX; }
};
template <>
class ReduceIdentity<Int32>
{
 public:

  ARCCORE_HOST_DEVICE static constexpr Int32 sumValue() { return 0; }
  ARCCORE_HOST_DEVICE static constexpr Int32 minValue() { return INT32_MAX; }
  ARCCORE_HOST_DEVICE static constexpr Int32 maxValue() { return -INT32_MAX; }
};
template <>
class ReduceIdentity<Int64>
{
 public:

  ARCCORE_HOST_DEVICE static constexpr Int64 sumValue() { return 0; }
  ARCCORE_HOST_DEVICE static constexpr Int64 minValue() { return INT64_MAX; }
  ARCCORE_HOST_DEVICE static constexpr Int64 maxValue() { return -INT64_MAX; }
};
// Information needed to perform a reduction on a device. The body is elided
// in this extract; the class name and members are reconstructed from the
// accompanying member documentation.
template <typename DataType>
class ReduceDeviceInfo
{
 public:

  //! One value per block for the grid reduction
  SmallSpan<DataType> m_grid_buffer;
  //! Pointer to the reduced value (memory accessible only from the device)
  DataType* m_device_final_ptr = nullptr;
  //! Pointer to the reduced value (memory accessible only from the host)
  void* m_host_final_ptr = nullptr;
  //! Value of the current thread to reduce
  DataType m_current_value = {};
  //! Identity value for the reduction
  DataType m_identity = {};
  //! Counter of the blocks that have already done their part of the reduction
  unsigned int* m_device_count = nullptr;
  //! True to use the grid reduction, false to use atomics between blocks
  bool m_use_grid_reduce = true;
};
// Atomic sum, specialized per type: a compare-exchange loop for 'double',
// a single fetch_add for the integer types. (The enclosing class name is
// elided in this extract; 'ReduceAtomicSum' is assumed.)
template <typename DataType>
class ReduceAtomicSum;

template <>
class ReduceAtomicSum<double>
{
 public:
  static double apply(double* vptr, double v)
  {
    std::atomic_ref<double> aref(*vptr);
    double old = aref.load(std::memory_order_consume);
    double wanted = old + v;
    // On failure 'old' is reloaded with the stored value, so the sum is
    // recomputed until the exchange succeeds.
    while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))
      wanted = old + v;
    return wanted;
  }
};

template <>
class ReduceAtomicSum<Int64>
{
 public:
  static Int64 apply(Int64* vptr, Int64 v)
  {
    std::atomic_ref<Int64> aref(*vptr);
    Int64 x = aref.fetch_add(v);
    return x + v;
  }
};

template <>
class ReduceAtomicSum<Int32>
{
 public:
  static Int32 apply(Int32* vptr, Int32 v)
  {
    std::atomic_ref<Int32> aref(*vptr);
    Int32 x = aref.fetch_add(v);
    return x + v;
  }
};
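/*
 * The compare-exchange loop above is standard C++20 and can be exercised on
 * the host alone. A minimal self-contained sketch of the same accumulation
 * pattern (not part of Arcane; plain std types only):
 *
 *   #include <atomic>
 *   #include <cstdio>
 *   #include <thread>
 *   #include <vector>
 *
 *   double atomic_add(double* vptr, double v)
 *   {
 *     std::atomic_ref<double> aref(*vptr);
 *     double old = aref.load();
 *     double wanted = old + v;
 *     // On failure, compare_exchange_weak reloads 'old'; recompute and retry.
 *     while (!aref.compare_exchange_weak(old, wanted))
 *       wanted = old + v;
 *     return wanted;
 *   }
 *
 *   int main()
 *   {
 *     double total = 0.0;
 *     std::vector<std::thread> pool;
 *     for (int t = 0; t < 4; ++t)
 *       pool.emplace_back([&] {
 *         for (int i = 0; i < 1000; ++i)
 *           atomic_add(&total, 1.0);
 *       });
 *     for (auto& th : pool)
 *       th.join();
 *     std::printf("%.1f\n", total); // always 4000.0
 *   }
 */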
// Functor applying a sum reduction. (Signatures are partly reconstructed
// from the surrounding uses; parameter lists are elided in this extract.)
template <typename DataType>
class ReduceFunctorSum
{
 public:

  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return dev_info.m_current_value; // return value assumed; elided in this extract
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    return ReduceAtomicSum<DataType>::apply(vptr, v); // body assumed from ReduceAtomicSum above
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::plus<DataType> syclFunctor() { return {}; }
#endif
  static constexpr DataType identity() { return ReduceIdentity<DataType>::sumValue(); }

 private:

  // Device-side implementation; defined when compiling for CUDA/HIP (see
  // CommonCudaHipReduceImpl.h included at the end of this file).
  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
// Functor applying a max reduction. (Signatures partly reconstructed, as above.)
template <typename DataType>
class ReduceFunctorMax
{
 public:

  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return dev_info.m_current_value; // return value assumed; elided in this extract
  }
  static DataType apply(DataType* ptr, DataType v)
  {
    std::atomic_ref<DataType> aref(*ptr);
    DataType prev_value = aref.load();
    // compare_exchange_weak reloads 'prev_value' on failure, so the loop
    // stops as soon as the stored value is already >= v.
    while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {
    }
    return aref.load();
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::maximum<DataType> syclFunctor() { return {}; }
#endif
  static constexpr DataType identity() { return ReduceIdentity<DataType>::maxValue(); }

 private:

  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
// Functor applying a min reduction. (Signatures partly reconstructed, as above.)
template <typename DataType>
class ReduceFunctorMin
{
 public:

  static ARCCORE_DEVICE DataType
  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
  {
    _applyDevice(dev_info);
    return dev_info.m_current_value; // return value assumed; elided in this extract
  }
  static DataType apply(DataType* vptr, DataType v)
  {
    std::atomic_ref<DataType> aref(*vptr);
    DataType prev_value = aref.load();
    // Stops as soon as the stored value is already <= v.
    while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {
    }
    return aref.load();
  }
#if defined(ARCANE_COMPILING_SYCL)
  static sycl::minimum<DataType> syclFunctor() { return {}; }
#endif
  static constexpr DataType identity() { return ReduceIdentity<DataType>::minValue(); }

 private:

  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);
};
} // namespace Arcane::Accelerator::impl

namespace Arcane::Accelerator
{

/*!
 * \brief Common base class of the host/device reducers.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducerBase
{
 public:

  explicit HostDeviceReducerBase(RunCommand& command)
  : m_command(&command)
  {
    m_is_master_instance = true;
    m_identity = ReduceFunctor::identity();
    m_local_value = m_identity;
    m_atomic_value = m_identity;
    m_atomic_parent_value = &m_atomic_value;
    m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);
    if (m_memory_impl)
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
  }
  // The Intel SYCL device compiler requires the default copy constructor.
#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)
  HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;
#else
  ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)
  : m_command(rhs.m_command)
  , m_local_value(rhs.m_local_value)
  , m_identity(rhs.m_identity)
  {
#ifdef ARCCORE_DEVICE_CODE
    m_grid_memory_info = rhs.m_grid_memory_info;
#else
    m_memory_impl = rhs.m_memory_impl;
    if (m_memory_impl)
      m_grid_memory_info = m_memory_impl->gridMemoryInfo();
    m_atomic_parent_value = rhs.m_atomic_parent_value;
    m_local_value = rhs.m_identity;
    m_atomic_value = m_identity;
#endif
  }
#endif
  ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;
  HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;
 public:

  ARCCORE_HOST_DEVICE void setValue(DataType v) { m_local_value = v; }
  ARCCORE_HOST_DEVICE DataType localValue() const
  {
    return m_local_value;
  }
 protected:

  mutable DataType m_local_value;
  DataType* m_atomic_parent_value = nullptr;
  mutable DataType m_atomic_value;
  // The following members are referenced elsewhere in this class; their
  // declarations are elided in this extract and reconstructed from the uses.
  RunCommand* m_command = nullptr;
  impl::IReduceMemoryImpl* m_memory_impl = nullptr;
  impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;
  DataType m_identity = {};
  bool m_is_master_instance = false;
  //! Performs the reduction and returns its value. WARNING: call it only once.
  DataType _reduce()
  {
    if (!m_is_master_instance)
      ARCANE_FATAL("Final reduce operation is only valid on master instance");
    DataType* final_ptr = &m_local_value;
    if (m_memory_impl) {
      m_memory_impl->copyReduceValueFromDevice();
      final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);
      m_memory_impl->release();
      m_memory_impl = nullptr;
    }
    // Merge the values accumulated on the host through the atomics.
    if (m_atomic_parent_value) {
      ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);
      *final_ptr = *m_atomic_parent_value;
    }
    return *final_ptr;
  }
  // Combines the current work-item value: on the device through the grid
  // buffer, on the host through the parent atomic value.
  // (The method name is elided in this extract; '_finalize' is assumed.)
  ARCCORE_HOST_DEVICE void
  _finalize()
  {
#ifdef ARCCORE_DEVICE_CODE
    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));
    impl::ReduceDeviceInfo<DataType> dvi;
    dvi.m_grid_buffer = grid_buffer;
    dvi.m_device_count = m_grid_memory_info.m_grid_device_count;
    dvi.m_current_value = m_local_value;
    dvi.m_identity = m_identity;
    // ... (remaining field assignments elided in this extract)
    ReduceFunctor::applyDevice(dvi);
#else
    if (!m_is_master_instance)
      ReduceFunctor::apply(m_atomic_parent_value, m_local_value);
#endif
  }
};
/*!
 * \brief Version 1 of the reduction.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
 public:

  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;

 public:

  explicit HostDeviceReducer(RunCommand& command)
  : BaseClass(command)
  {}
  HostDeviceReducer(const HostDeviceReducer& rhs) = default;
  ARCCORE_HOST_DEVICE ~HostDeviceReducer()
  {
    // ... (body elided in this extract; it triggers the final combination
    // of the work-item value, see _finalize() in the base class)
  }

 public:

  DataType reducedValue()
  {
    return this->_reduce();
  }
};
/*!
 * \brief Version 2 of the reduction, where the reducer is also passed as an
 * additional argument of the RUNCOMMAND macros.
 */
template <typename DataType, typename ReduceFunctor>
class HostDeviceReducer2
: public HostDeviceReducerBase<DataType, ReduceFunctor>
{
  friend ::Arcane::impl::HostReducerHelper;
  friend impl::KernelReducerHelper;

 public:

  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;
  using BaseClass::m_grid_memory_info;
  using BaseClass::m_local_value;

 public:

  explicit HostDeviceReducer2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  DataType reducedValue()
  {
    return this->_reduce();
  }
 private:

  void _internalReduceHost()
  {
    // ... (body elided in this extract)
  }

#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)
  ARCCORE_HOST_DEVICE void _internalExecWorkItem(Int32)
  {
    // ... (body elided in this extract; per-work-item combination for the
    // CUDA and HIP backends)
  }
#endif
#if defined(ARCANE_COMPILING_SYCL)
  //! Performs the grid-level reduction for the SYCL backend.
  void _internalExecWorkItem(sycl::nd_item<1> id)
  {
    unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;
    const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
    const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
    const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));

    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();
    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());
    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));

    DataType v = m_local_value;
    bool is_last = false;
    auto sycl_functor = ReduceFunctor::syclFunctor();
    // Reduce within the work-group, then publish one partial value per group.
    DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);
    // (Leader-only gating assumed; the surrounding lines are elided in this extract.)
    if (local_id == 0) {
      grid_buffer[group_id] = local_sum;
      // Count this group as done; the last group to increment the counter
      // performs the final combination. The AdaptiveCpp path goes through a
      // signed-int atomic, hence the reinterpretation of the counter.
#if defined(__ADAPTIVECPP__)
      int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);
      sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);
#else
      sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
#endif
      Int32 cx = a.fetch_add(1);
      if (cx == (nb_block - 1))
        is_last = true;
    }

    // The last group combines the per-group partial values and resets the
    // counter for the next launch.
    if (is_last) {
      DataType my_total = grid_buffer[0];
      for (int x = 1; x < nb_block; ++x)
        my_total = sycl_functor(my_total, grid_buffer[x]);
      grid_buffer[0] = my_total;
      *atomic_counter_ptr = 0;
    }
  }
#endif
};
/*!
 * \brief Implementation of the reduction for the SYCL backend.
 *
 * (Class name assumed from the accompanying documentation; most of the body
 * is elided in this extract.)
 */
template <typename DataType, typename ReduceFunctor>
class SyclReducer
{
 public:

  ARCCORE_HOST_DEVICE DataType localValue() const
  {
    return m_local_value;
  }
  void setValue(DataType v) { m_local_value = v; }

 protected:

  mutable DataType m_local_value = {};
};
// Class to perform a reduction on an accelerator. (Only the preprocessor
// guard survives in this extract; the alias bodies are assumed.)
#if defined(ARCANE_COMPILING_SYCL)
template <typename DataType, typename ReduceFunctor>
using Reducer = SyclReducer<DataType, ReduceFunctor>;
#else
template <typename DataType, typename ReduceFunctor>
using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;
#endif
/*!
 * \brief Class to perform a sum reduction.
 */
template <typename DataType>
class ReducerSum
: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>
{
 public:

  using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerSum(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value += v;
    return m_local_value;
  }
  //! Equivalent to combine().
  ARCCORE_HOST_DEVICE DataType add(DataType v) const
  {
    return combine(v);
  }
};
/*!
 * \brief Class to perform a max reduction.
 */
template <typename DataType>
class ReducerMax
: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>
{
 public:

  using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerMax(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v > m_local_value ? v : m_local_value;
    return m_local_value;
  }
  //! Equivalent to combine().
  ARCCORE_HOST_DEVICE DataType max(DataType v) const
  {
    return combine(v);
  }
};
/*!
 * \brief Class to perform a min reduction.
 */
template <typename DataType>
class ReducerMin
: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>
{
 public:

  using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;
  using BaseClass::m_local_value;

 public:

  explicit ReducerMin(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE DataType combine(DataType v) const
  {
    m_local_value = v < m_local_value ? v : m_local_value;
    return m_local_value;
  }
  //! Equivalent to combine().
  ARCCORE_HOST_DEVICE DataType min(DataType v) const
  {
    return combine(v);
  }
};
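/*
 * Minimal usage sketch for the version-1 reducers above. This is not part of
 * the header; it assumes a RunQueue named 'queue', the makeCommand() helper
 * and the RUNCOMMAND_LOOP1 macro from "arcane/accelerator/RunCommandLoop.h".
 *
 *   using namespace Arcane::Accelerator;
 *   auto command = makeCommand(queue);
 *   ReducerSum<double> my_sum(command);
 *   ReducerMax<double> my_max(command);
 *   command << RUNCOMMAND_LOOP1(iter, 1000)
 *   {
 *     auto [i] = iter();
 *     double v = static_cast<double>(i);
 *     my_sum.combine(v);
 *     my_max.combine(v);
 *   };
 *   double total = my_sum.reducedValue();   // 499500.0
 *   double biggest = my_max.reducedValue(); // 999.0
 */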
/*!
 * \brief Class to perform a sum reduction (version 2).
 */
template <typename DataType>
class ReducerSum2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;

 public:

  explicit ReducerSum2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    this->m_local_value += v;
  }
};
/*!
 * \brief Class to perform a max reduction (version 2).
 */
template <typename DataType>
class ReducerMax2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;

 public:

  explicit ReducerMax2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v > lv ? v : lv;
  }
};
/*!
 * \brief Class to perform a min reduction (version 2).
 */
template <typename DataType>
class ReducerMin2
: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>
{
 public:

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;

 public:

  explicit ReducerMin2(RunCommand& command)
  : BaseClass(command)
  {}

 public:

  ARCCORE_HOST_DEVICE void combine(DataType v)
  {
    DataType& lv = this->m_local_value;
    lv = v < lv ? v : lv;
  }
};

} // namespace Arcane::Accelerator
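/*
 * Minimal usage sketch for the version-2 reducers. Unlike version 1, the
 * reducer is also passed as an extra argument of the RUNCOMMAND macro so the
 * runtime can finalize it. Assumes a RunQueue named 'queue' and the
 * RUNCOMMAND_LOOP1 macro from "arcane/accelerator/RunCommandLoop.h".
 *
 *   using namespace Arcane::Accelerator;
 *   auto command = makeCommand(queue);
 *   ReducerSum2<Int64> my_sum(command);
 *   ReducerMin2<Int64> my_min(command);
 *   command << RUNCOMMAND_LOOP1(iter, 1000, my_sum, my_min)
 *   {
 *     auto [i] = iter();
 *     my_sum.combine(static_cast<Int64>(i));
 *     my_min.combine(static_cast<Int64>(i));
 *   };
 *   Int64 total = my_sum.reducedValue();    // 499500
 *   Int64 smallest = my_min.reducedValue(); // 0
 */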
#define ARCANE_INLINE_REDUCE_IMPL

#ifdef ARCANE_INLINE_REDUCE_IMPL

# ifndef ARCANE_INLINE_REDUCE
#  define ARCANE_INLINE_REDUCE inline
# endif

#if defined(__CUDACC__) || defined(__HIP__)
# include "arcane/accelerator/CommonCudaHipReduceImpl.h"
#endif

#endif

#include "arcane/accelerator/GenericReducer.h"

#endif // ARCANE_ACCELERATOR_REDUCE_H