df/dd2/Reduce_8h_source.html

// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-

//-----------------------------------------------------------------------------

// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)

// See the top-level COPYRIGHT file for details.

// SPDX-License-Identifier: Apache-2.0

//-----------------------------------------------------------------------------

/*---------------------------------------------------------------------------*/

/* Reduce.h                                                    (C) 2000-2025 */

/*                                                                           */

/* Gestion des réductions pour les accélérateurs.                            */

/*---------------------------------------------------------------------------*/

#ifndef ARCANE_ACCELERATOR_REDUCE_H

#define ARCANE_ACCELERATOR_REDUCE_H

/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#include "arcane/utils/ArrayView.h"

#include "arcane/utils/String.h"


#include "arcane/accelerator/core/IReduceMemoryImpl.h"

#include "arcane/accelerator/AcceleratorGlobal.h"

#include "arcane/accelerator/CommonUtils.h"

#include "arcane/accelerator/RunCommandLaunchInfo.h"

#include "arcane/accelerator/RunCommandLoop.h"


#include <limits.h>

#include <float.h>

#include <atomic>

#include <iostream>


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


namespace Arcane::impl

{

class HostReducerHelper;

}


namespace Arcane::Accelerator::impl

{

class KernelReducerHelper;


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


extern "C++" ARCANE_ACCELERATOR_CORE_EXPORT IReduceMemoryImpl*

internalGetOrCreateReduceMemoryImpl(RunCommand* command);


template <typename DataType>

class ReduceIdentity;

template <>

// TODO: utiliser numeric_limits.


class ReduceIdentity<double>

{

 public:


  ARCCORE_HOST_DEVICE static constexpr double sumValue() { return 0.0; }

  ARCCORE_HOST_DEVICE static constexpr double minValue() { return DBL_MAX; }

  ARCCORE_HOST_DEVICE static constexpr double maxValue() { return -DBL_MAX; }

};


template <>


class ReduceIdentity<Int32>

{

 public:


  ARCCORE_HOST_DEVICE static constexpr Int32 sumValue() { return 0; }

  ARCCORE_HOST_DEVICE static constexpr Int32 minValue() { return INT32_MAX; }

  ARCCORE_HOST_DEVICE static constexpr Int32 maxValue() { return -INT32_MAX; }

};


template <>


class ReduceIdentity<Int64>

{

 public:


  ARCCORE_HOST_DEVICE static constexpr Int64 sumValue() { return 0; }

  ARCCORE_HOST_DEVICE static constexpr Int64 minValue() { return INT64_MAX; }

  ARCCORE_HOST_DEVICE static constexpr Int64 maxValue() { return -INT64_MAX; }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

// L'implémentation utilisée est définie dans 'CommonCudaHipReduceImpl.h'


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReduceDeviceInfo

{

 public:


  DataType m_current_value;

  DataType m_identity;

  DataType* m_device_final_ptr = nullptr;

  void* m_host_final_ptr = nullptr;

  SmallSpan<DataType> m_grid_buffer;

  unsigned int* m_device_count = nullptr;


  Int32 m_warp_size = 0;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


template <typename DataType>

class ReduceAtomicSum;


template <>


class ReduceAtomicSum<double>

{

 public:


  static double apply(double* vptr, double v)

  {

    std::atomic_ref<double> aref(*vptr);

    double old = aref.load(std::memory_order_consume);

    double wanted = old + v;

    while (!aref.compare_exchange_weak(old, wanted, std::memory_order_release, std::memory_order_consume))

      wanted = old + v;

    return wanted;

  }

};


template <>


class ReduceAtomicSum<Int64>

{

 public:


  static Int64 apply(Int64* vptr, Int64 v)

  {

    std::atomic_ref<Int64> aref(*vptr);

    Int64 x = aref.fetch_add(v);

    return x + v;

  }

};


template <>


class ReduceAtomicSum<Int32>

{

 public:


  static Int32 apply(Int32* vptr, Int32 v)

  {

    std::atomic_ref<Int32> aref(*vptr);

    Int32 x = aref.fetch_add(v);

    return x + v;

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


template <typename DataType>


class ReduceFunctorSum

{

 public:


  static ARCCORE_DEVICE DataType

  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)

  {

    _applyDevice(dev_info);

    return *(dev_info.m_device_final_ptr);

  }

  static DataType apply(DataType* vptr, DataType v)

  {

    return ReduceAtomicSum<DataType>::apply(vptr, v);

  }

#if defined(ARCANE_COMPILING_SYCL)

  static sycl::plus<DataType> syclFunctor() { return {}; }

#endif


 public:


  ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::sumValue(); }


 private:


  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


template <typename DataType>


class ReduceFunctorMax

{

 public:


  static ARCCORE_DEVICE DataType

  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)

  {

    _applyDevice(dev_info);

    return *(dev_info.m_device_final_ptr);

  }

  static DataType apply(DataType* ptr, DataType v)

  {

    std::atomic_ref<DataType> aref(*ptr);

    DataType prev_value = aref.load();

    while (prev_value < v && !aref.compare_exchange_weak(prev_value, v)) {

    }

    return aref.load();

  }

#if defined(ARCANE_COMPILING_SYCL)

  static sycl::maximum<DataType> syclFunctor() { return {}; }

#endif


 public:


  ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::maxValue(); }


 private:


  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


template <typename DataType>


class ReduceFunctorMin

{

 public:


  static ARCCORE_DEVICE DataType

  applyDevice(const ReduceDeviceInfo<DataType>& dev_info)

  {

    _applyDevice(dev_info);

    return *(dev_info.m_device_final_ptr);

  }

  static DataType apply(DataType* vptr, DataType v)

  {

    std::atomic_ref<DataType> aref(*vptr);

    DataType prev_value = aref.load();

    while (prev_value > v && !aref.compare_exchange_weak(prev_value, v)) {

    }

    return aref.load();

  }

#if defined(ARCANE_COMPILING_SYCL)

  static sycl::minimum<DataType> syclFunctor() { return {}; }

#endif


 public:


  ARCCORE_HOST_DEVICE static constexpr DataType identity() { return impl::ReduceIdentity<DataType>::minValue(); }


 private:


  static ARCCORE_DEVICE void _applyDevice(const ReduceDeviceInfo<DataType>& dev_info);

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


} // namespace Arcane::Accelerator::impl


namespace Arcane::Accelerator

{


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


template <typename DataType, typename ReduceFunctor>


class HostDeviceReducerBase

{

 public:


  HostDeviceReducerBase(RunCommand& command)

  : m_host_or_device_memory_for_reduced_value(&m_local_value)

  , m_command(&command)

  {

    //std::cout << String::format("Reduce main host this={0}\n",this); std::cout.flush();

    m_is_master_instance = true;

    m_identity = ReduceFunctor::identity();

    m_local_value = m_identity;

    m_atomic_value = m_identity;

    m_atomic_parent_value = &m_atomic_value;

    //printf("Create null host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);

    m_memory_impl = impl::internalGetOrCreateReduceMemoryImpl(&command);

    if (m_memory_impl) {

      m_host_or_device_memory_for_reduced_value = impl::allocateReduceDataMemory<DataType>(m_memory_impl, m_identity);

      m_grid_memory_info = m_memory_impl->gridMemoryInfo();

    }

  }


#if defined(__INTEL_LLVM_COMPILER) && defined(__SYCL_DEVICE_ONLY__)

  HostDeviceReducerBase(const HostDeviceReducerBase& rhs) = default;

#else

  ARCCORE_HOST_DEVICE HostDeviceReducerBase(const HostDeviceReducerBase& rhs)

  : m_host_or_device_memory_for_reduced_value(rhs.m_host_or_device_memory_for_reduced_value)

  , m_local_value(rhs.m_local_value)

  , m_identity(rhs.m_identity)

  {

#ifdef ARCCORE_DEVICE_CODE

    m_grid_memory_info = rhs.m_grid_memory_info;

    //int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;

    //if (threadId==0)

    //printf("Create ref device Id=%d parent=%p\n",threadId,&rhs);

#else

    m_memory_impl = rhs.m_memory_impl;

    if (m_memory_impl) {

      m_grid_memory_info = m_memory_impl->gridMemoryInfo();

    }

    //std::cout << String::format("Reduce: host copy this={0} rhs={1} mem={2} device_count={3}\n",this,&rhs,m_memory_impl,(void*)m_grid_device_count);

    m_atomic_parent_value = rhs.m_atomic_parent_value;

    m_local_value = rhs.m_identity;

    m_atomic_value = m_identity;

    //std::cout << String::format("Reduce copy host  this={0} parent_value={1} rhs={2}\n",this,(void*)m_parent_value,&rhs); std::cout.flush();

    //if (!rhs.m_is_master_instance)

    //ARCANE_FATAL("Only copy from master instance is allowed");

    //printf("Create ref host parent_value=%p this=%p rhs=%p\n",(void*)m_parent_value,(void*)this,(void*)&rhs);

#endif

  }

#endif


  ARCCORE_HOST_DEVICE HostDeviceReducerBase(HostDeviceReducerBase&& rhs) = delete;

  HostDeviceReducerBase& operator=(const HostDeviceReducerBase& rhs) = delete;


 public:


  ARCCORE_HOST_DEVICE void setValue(DataType v)

  {

    m_local_value = v;

  }

  ARCCORE_HOST_DEVICE DataType localValue() const

  {

    return m_local_value;

  }


 protected:


  impl::IReduceMemoryImpl* m_memory_impl = nullptr;

  DataType* m_host_or_device_memory_for_reduced_value = nullptr;

  impl::IReduceMemoryImpl::GridMemoryInfo m_grid_memory_info;


 private:


  RunCommand* m_command = nullptr;


 protected:


  mutable DataType m_local_value;

  DataType* m_atomic_parent_value = nullptr;

  mutable DataType m_atomic_value;


 private:


  DataType m_identity;

  //bool m_is_allocated = false;

  bool m_is_master_instance = false;


 protected:


  DataType _reduce()

  {

    if (!m_is_master_instance)

      ARCANE_FATAL("Final reduce operation is only valid on master instance");

    // Si la réduction est faite sur accélérateur, il faut recopier la valeur du device sur l'hôte.

    DataType* final_ptr = m_host_or_device_memory_for_reduced_value;

    if (m_memory_impl) {

      m_memory_impl->copyReduceValueFromDevice();

      final_ptr = reinterpret_cast<DataType*>(m_grid_memory_info.m_host_memory_for_reduced_value);

      m_memory_impl->release();

      m_memory_impl = nullptr;

    }


    if (m_atomic_parent_value) {

      //std::cout << String::format("Reduce host has parent this={0} local_value={1} parent_value={2}\n",

      //                            this,m_local_value,*m_parent_value);

      //std::cout.flush();

      ReduceFunctor::apply(m_atomic_parent_value, *final_ptr);

      *final_ptr = *m_atomic_parent_value;

    }

    else {

      //std::cout << String::format("Reduce host no parent this={0} local_value={1} managed={2}\n",

      //                            this,m_local_value,*m_host_or_device_memory_for_reduced_value);

      //std::cout.flush();

    }

    return *final_ptr;

  }


  // NOTE: Lorsqu'il n'y aura plus la version V1 de la réduction, cette méthode ne sera

  // appelée que depuis le device.

  ARCCORE_HOST_DEVICE void

  _finalize()

  {

#ifdef ARCCORE_DEVICE_CODE

    //int threadId = threadIdx.x + blockDim.x * threadIdx.y + (blockDim.x * blockDim.y) * threadIdx.z;

    //if ((threadId%16)==0)

    //printf("Destroy device Id=%d\n",threadId);

    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();

    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());

    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));


    impl::ReduceDeviceInfo<DataType> dvi;

    dvi.m_grid_buffer = grid_buffer;

    dvi.m_device_count = m_grid_memory_info.m_grid_device_count;

    dvi.m_device_final_ptr = m_host_or_device_memory_for_reduced_value;

    dvi.m_host_final_ptr = m_grid_memory_info.m_host_memory_for_reduced_value;

    dvi.m_current_value = m_local_value;

    dvi.m_identity = m_identity;

    dvi.m_warp_size = m_grid_memory_info.m_warp_size;

    ReduceFunctor::applyDevice(dvi); //grid_buffer,m_grid_device_count,m_host_or_device_memory_for_reduced_value,m_local_value,m_identity);

#else

    //      printf("Destroy host parent_value=%p this=%p\n",(void*)m_parent_value,(void*)this);

    // Code hôte

    //std::cout << String::format("Reduce destructor this={0} parent_value={1} v={2} memory_impl={3}\n",this,(void*)m_parent_value,m_local_value,m_memory_impl);

    //std::cout << String::format("Reduce destructor this={0} grid_data={1} grid_size={2}\n",

    //                            this,(void*)m_grid_memory_value_as_bytes,m_grid_memory_size);

    //std::cout.flush();

    if (!m_is_master_instance)

      ReduceFunctor::apply(m_atomic_parent_value, m_local_value);


    //printf("Destroy host %p %p\n",m_host_or_device_memory_for_reduced_value,this);

#endif

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType, typename ReduceFunctor>


class HostDeviceReducer

: public HostDeviceReducerBase<DataType, ReduceFunctor>

{

 public:


  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;


 public:


  explicit HostDeviceReducer(RunCommand& command)

  : BaseClass(command)

  {}

  HostDeviceReducer(const HostDeviceReducer& rhs) = default;

  ARCCORE_HOST_DEVICE ~HostDeviceReducer()

  {

    this->_finalize();

  }


 public:


  DataType reduce()

  {

    return this->_reduce();

  }


  DataType reducedValue()

  {

    return this->_reduce();

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType, typename ReduceFunctor>


class HostDeviceReducer2

: public HostDeviceReducerBase<DataType, ReduceFunctor>

{

  friend impl::KernelRemainingArgsHelper;

  friend ::Arcane::impl::HostReducerHelper;


 public:


  using BaseClass = HostDeviceReducerBase<DataType, ReduceFunctor>;

  using BaseClass::m_grid_memory_info;

  using BaseClass::m_host_or_device_memory_for_reduced_value;

  using BaseClass::m_local_value;


 public:


  explicit HostDeviceReducer2(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  DataType reducedValue()

  {

    return this->_reduce();

  }


 private:


  // Note: les méthodes _internalReduce...() sont

  // internes à Arcane.


  void _internalReduceHost()

  {

    this->_finalize();

  }


#if defined(ARCANE_COMPILING_CUDA) || defined(ARCANE_COMPILING_HIP)

  ARCCORE_HOST_DEVICE void _internalExecWorkItem(Int32)

  {

    this->_finalize();

  };

#endif


#if defined(ARCANE_COMPILING_SYCL)

  void _internalExecWorkItem(sycl::nd_item<1> id)

  {

    unsigned int* atomic_counter_ptr = m_grid_memory_info.m_grid_device_count;

    const Int32 local_id = static_cast<Int32>(id.get_local_id(0));

    const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());

    const Int32 nb_block = static_cast<Int32>(id.get_group_range(0));


    auto buf_span = m_grid_memory_info.m_grid_memory_values.bytes();

    DataType* buf = reinterpret_cast<DataType*>(buf_span.data());

    SmallSpan<DataType> grid_buffer(buf, static_cast<Int32>(buf_span.size()));


    DataType v = m_local_value;

    bool is_last = false;

    auto sycl_functor = ReduceFunctor::syclFunctor();

    DataType local_sum = sycl::reduce_over_group(id.get_group(), v, sycl_functor);

    if (local_id == 0) {

      grid_buffer[group_id] = local_sum;


      // TODO: En théorie il faut faire l'équivalent d'un __threadfence() ici

      // pour garantir que les autres work-item voient bien la mise à jour de 'grid_buffer'.

      // Mais ce mécanisme n'existe pas avec SYCL 2020.


      // AdaptiveCpp 2024.2 ne supporte pas les opérations atomiques sur 'unsigned int'.

      // Elles sont supportées avec le type 'int'. Comme on est certain de ne pas dépasser 2^31, on

      // converti le pointeur en un 'int*'.

#if defined(__ADAPTIVECPP__)

      int* atomic_counter_ptr_as_int = reinterpret_cast<int*>(atomic_counter_ptr);

      sycl::atomic_ref<int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr_as_int);

#else

      sycl::atomic_ref<unsigned int, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);

#endif

      Int32 cx = a.fetch_add(1);

      if (cx == (nb_block - 1))

        is_last = true;

    }


    // Je suis le dernier à faire la réduction.

    // Calcule la réduction finale

    if (is_last) {

      DataType my_total = grid_buffer[0];

      for (int x = 1; x < nb_block; ++x)

        my_total = sycl_functor(my_total, grid_buffer[x]);

      // Met le résultat final dans le premier élément du tableau.

      grid_buffer[0] = my_total;

      *m_host_or_device_memory_for_reduced_value = my_total;

      *atomic_counter_ptr = 0;

    }

  }

#endif

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType, typename ReduceFunctor>


class SyclReducer

{

 public:


  explicit SyclReducer(RunCommand&) {}


 public:


  DataType reduce()

  {

    return m_local_value;

  }

  void setValue(DataType v) { m_local_value = v; }


 protected:


  mutable DataType m_local_value = {};

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#if defined(ARCANE_COMPILING_SYCL)

template <typename DataType, typename ReduceFunctor> using Reducer = SyclReducer<DataType, ReduceFunctor>;

#else

template <typename DataType, typename ReduceFunctor> using Reducer = HostDeviceReducer<DataType, ReduceFunctor>;

#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReducerSum

: public Reducer<DataType, impl::ReduceFunctorSum<DataType>>

{

  using BaseClass = Reducer<DataType, impl::ReduceFunctorSum<DataType>>;

  using BaseClass::m_local_value;


 public:


  explicit ReducerSum(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  ARCCORE_HOST_DEVICE DataType combine(DataType v) const

  {

    m_local_value += v;

    return m_local_value;

  }


  ARCCORE_HOST_DEVICE DataType add(DataType v) const

  {

    return combine(v);

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReducerMax

: public Reducer<DataType, impl::ReduceFunctorMax<DataType>>

{

  using BaseClass = Reducer<DataType, impl::ReduceFunctorMax<DataType>>;

  using BaseClass::m_local_value;


 public:


  explicit ReducerMax(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  ARCCORE_HOST_DEVICE DataType combine(DataType v) const

  {

    m_local_value = v > m_local_value ? v : m_local_value;

    return m_local_value;

  }


  ARCCORE_HOST_DEVICE DataType max(DataType v) const

  {

    return combine(v);

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReducerMin

: public Reducer<DataType, impl::ReduceFunctorMin<DataType>>

{

  using BaseClass = Reducer<DataType, impl::ReduceFunctorMin<DataType>>;

  using BaseClass::m_local_value;


 public:


  explicit ReducerMin(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  ARCCORE_HOST_DEVICE DataType combine(DataType v) const

  {

    m_local_value = v < m_local_value ? v : m_local_value;

    return m_local_value;

  }


  ARCCORE_HOST_DEVICE DataType min(DataType v) const

  {

    return combine(v);

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReducerSum2

: public HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>

{

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorSum<DataType>>;


 public:


  explicit ReducerSum2(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  ARCCORE_HOST_DEVICE void combine(DataType v)

  {

    this->m_local_value += v;

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReducerMax2

: public HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>

{

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMax<DataType>>;


 public:


  explicit ReducerMax2(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  ARCCORE_HOST_DEVICE void combine(DataType v)

  {

    DataType& lv = this->m_local_value;

    lv = v > lv ? v : lv;

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <typename DataType>


class ReducerMin2

: public HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>

{

  using BaseClass = HostDeviceReducer2<DataType, impl::ReduceFunctorMin<DataType>>;


 public:


  explicit ReducerMin2(RunCommand& command)

  : BaseClass(command)

  {}


 public:


  ARCCORE_HOST_DEVICE void combine(DataType v)

  {

    DataType& lv = this->m_local_value;

    lv = v < lv ? v : lv;

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


} // End namespace Arcane::Accelerator


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

// Cette macro est définie si on souhaite rendre inline l'implémentation.

// Dans l'idéal il ne faut pas que ce soit le cas (ce qui permettrait de

// changer l'implémentation sans tout recompiler) mais cela ne semble pas

// bien fonctionner pour l'instant.


#define ARCANE_INLINE_REDUCE_IMPL


#ifdef ARCANE_INLINE_REDUCE_IMPL


#  ifndef ARCANE_INLINE_REDUCE

#    define ARCANE_INLINE_REDUCE inline

#  endif


#if defined(__CUDACC__) || defined(__HIP__)

#  include "arcane/accelerator/CommonCudaHipReduceImpl.h"

#else


#endif


#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#include "arcane/accelerator/GenericReducer.h"


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

ARCANE_FATAL
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Definition ArcaneGlobal.h:768

RunCommandLoop.h
Types et macros pour gérer les boucles sur les accélérateurs.

Arcane::Accelerator::HostDeviceReducer2::m_host_or_device_memory_for_reduced_value
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
Definition Reduce.h:372

Arcane::Accelerator::HostDeviceReducerBase::m_host_or_device_memory_for_reduced_value
DataType * m_host_or_device_memory_for_reduced_value
Pointeur vers la donnée qui contiendra la valeur réduite.
Definition Reduce.h:372

Arcane::Accelerator::HostDeviceReducerBase::_reduce
DataType _reduce()
Effectue la réduction et récupère la valeur. ATTENTION: ne faire qu'une seule fois.
Definition Reduce.h:394

Arcane::Accelerator::HostDeviceReducer
Version 1 de la réduction.
Definition Reduce.h:470

Arcane::Accelerator::RunCommand
Gestion d'une commande sur accélérateur.
Definition core/RunCommand.h:46

Arcane::Accelerator::SyclReducer
Implémentation de la réduction pour le backend SYCL.
Definition Reduce.h:610

Arcane::Accelerator::impl::IReduceMemoryImpl
Interface de la gestion mémoire pour les réductions.
Definition core/IReduceMemoryImpl.h:35

Arcane::Accelerator::impl::KernelRemainingArgsHelper
Classe pour appliquer la finalisation pour les arguments supplémentaires.
Definition KernelLauncher.h:75

Arcane::Accelerator::impl::ReduceAtomicSum
Definition Reduce.h:119

Arcane::Accelerator::impl::ReduceDeviceInfo
Informations pour effectuer une réduction sur un device.
Definition Reduce.h:92

Arcane::Accelerator::impl::ReduceDeviceInfo::m_device_final_ptr
DataType * m_device_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis le device)
Definition Reduce.h:100

Arcane::Accelerator::impl::ReduceDeviceInfo::m_device_count
unsigned int * m_device_count
Definition Reduce.h:109

Arcane::Accelerator::impl::ReduceDeviceInfo::m_host_final_ptr
void * m_host_final_ptr
Pointeur vers la donnée réduite (mémoire uniquement accessible depuis l'hôte)
Definition Reduce.h:102

Arcane::Accelerator::impl::ReduceDeviceInfo::m_current_value
DataType m_current_value
Valeur du thread courant à réduire.
Definition Reduce.h:96

Arcane::Accelerator::impl::ReduceDeviceInfo::m_warp_size
Int32 m_warp_size
Taille d'un warp.
Definition Reduce.h:112

Arcane::Accelerator::impl::ReduceDeviceInfo::m_grid_buffer
SmallSpan< DataType > m_grid_buffer
Tableau avec une valeur par bloc pour la réduction.
Definition Reduce.h:104

Arcane::Accelerator::impl::ReduceDeviceInfo::m_identity
DataType m_identity
Valeur de l'identité pour la réduction.
Definition Reduce.h:98

Arcane::Accelerator::impl::ReduceFunctorMax
Definition Reduce.h:197

Arcane::Accelerator::impl::ReduceFunctorMin
Definition Reduce.h:232

Arcane::Accelerator::impl::ReduceFunctorSum
Definition Reduce.h:166

Arcane::Accelerator::impl::ReduceIdentity
Definition Reduce.h:50

Arcane::MutableMemoryView::bytes
constexpr SpanType bytes() const
Vue sous forme d'octets.
Definition MemoryView.h:215

Arcane::SmallSpan
Vue d'un tableau d'éléments de type T.
Definition Span.h:775

Arcane::impl::HostReducerHelper
Classe pour appliquer la finalisation des réductions.
Definition ForLoopRanges.h:28

Arcane::Accelerator
Espace de nom pour l'utilisation des accélérateurs.
Definition AcceleratorGlobal.h:36

Arcane::Int64
std::int64_t Int64
Type entier signé sur 64 bits.
Definition ArccoreGlobal.h:186

Arcane::Int32
std::int32_t Int32
Type entier signé sur 32 bits.
Definition ArccoreGlobal.h:184

Arcane::Accelerator::impl::IReduceMemoryImpl::GridMemoryInfo
Informations mémoire pour la réduction sur les accélérateurs.
Definition core/IReduceMemoryImpl.h:40

Arcane::Accelerator::impl::IReduceMemoryImpl::GridMemoryInfo::m_host_memory_for_reduced_value
void * m_host_memory_for_reduced_value
Pointeur vers la mémoire sur l'hôte contenant la valeur réduite.
Definition core/IReduceMemoryImpl.h:46

Arcane::Accelerator::impl::IReduceMemoryImpl::GridMemoryInfo::m_grid_memory_values
MutableMemoryView m_grid_memory_values
Mémoire allouée pour la réduction sur une grille (de taille nb_bloc * sizeof(T))
Definition core/IReduceMemoryImpl.h:42

Arcane::Accelerator::impl::IReduceMemoryImpl::GridMemoryInfo::m_warp_size
Int32 m_warp_size
Taille d'un warp.
Definition core/IReduceMemoryImpl.h:48

Arcane::Accelerator::impl::IReduceMemoryImpl::GridMemoryInfo::m_grid_device_count
unsigned int * m_grid_device_count
Entier utilisé pour compter le nombre de blocs ayant déjà fait leur partie de la réduction.
Definition core/IReduceMemoryImpl.h:44