#ifndef ARCANE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
#define ARCANE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H

#include "arcane/accelerator/CommonCudaHipAtomicImpl.h"

namespace Arcane::Accelerator::impl
{
 
// Flattened index of the current thread within its block.
__device__ __forceinline__ unsigned int
getThreadId()
{
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
  (blockDim.x * blockDim.y) * threadIdx.z;
  return threadId;
}

// Flattened index of the current block within the grid.
__device__ __forceinline__ unsigned int
getBlockId()
{
  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  return blockId;
}
constexpr const Int32 MAX_BLOCK_SIZE = 1024;
 
// Elementary reduction operator; one specialization per supported atomic operation.
template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

// Sum
template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = val + v;
  }
};

// Maximum
template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v > val ? v : val;
  }
};

// Minimum
template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v < val ? v : val;
  }
};
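
// Illustrative sketch, not part of the original header: ReduceOperator::apply(acc, x)
// folds 'x' into the accumulator 'acc', so any of the operators above can be used to
// reduce a sequence. 'example_fold' is an assumed name added for illustration.
template <typename ReduceOperator, typename T> ARCCORE_DEVICE inline T
example_fold(const T* values, int n, T identity)
{
  T acc = identity;
  for (int i = 0; i < n; ++i)
    ReduceOperator::apply(acc, values[i]);
  return acc;
}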
 
 
#if defined(__CUDACC__)

// CUDA: wrap the *_sync warp shuffle intrinsics with a full-warp mask.

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

#else

// HIP/ROCm: wrap the non-_sync shuffle intrinsics.

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

#endif
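
// Illustrative sketch, not part of the original header: shfl_xor_sync(v, i) exchanges
// 'v' with the lane whose index differs from the current lane in bit 'i'. Repeating
// this for i = 1, 2, 4, ... WARP_SIZE/2 and combining both values at every step (the
// "butterfly" pattern used by block_reduce below) leaves every lane holding the
// reduction of the whole warp. A minimal warp-wide sum, assuming all 32 lanes of a
// CUDA warp are active ('example_warp_sum' is an assumed name):
ARCCORE_DEVICE inline double
example_warp_sum(double lane_value)
{
  double acc = lane_value;
  for (int i = 1; i < 32; i *= 2) {
    double rhs = impl::shfl_xor_sync(acc, i);
    acc = acc + rhs; // after the last step every lane holds the full sum
  }
  return acc;
}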
 
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline T
block_reduce(T val, T identity)
{
  constexpr Int32 WARP_SIZE = WarpSize;
  constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;

  int threadId = getThreadId();

  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  // First stage: reduce within each warp.
  if (numThreads % WARP_SIZE == 0) {
    // All warps are full: butterfly (XOR) reduction over the whole warp.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }
  }
  else {
    // The last warp is partial: only combine values coming from existing threads.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      // Ignore lanes beyond the number of threads of the block.
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  // Second stage: combine the per-warp results through shared memory.
  if (numThreads > WARP_SIZE) {
    __shared__ T sd[MAX_WARPS];

    // Lane 0 of each warp publishes its warp's partial result.
    if (warpId == 0) {
      sd[warpNum] = temp;
    }
    __syncthreads();

    // The first warp reads the partial results (identity for unused slots)
    // and reduces them with the same butterfly pattern.
    if (warpNum == 0) {
      if (warpId * WARP_SIZE < numThreads) {
        temp = sd[warpId];
      }
      else {
        temp = identity;
      }
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }
    __syncthreads();
  }

  // Thread 0 of the block now holds the block-wide reduction.
  return temp;
}
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, T identity, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;
  int wrap_around = numBlocks - 1;

  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
  (gridDim.x * gridDim.y) * blockIdx.z;

  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
  (blockDim.x * blockDim.y) * threadIdx.z;

  // Reduce the values of this block.
  T temp = block_reduce<ReduceOperator, WarpSize>(val, identity);

  // One thread per block publishes the block result, then increments the block
  // counter. The block that sees the counter wrap around is the last one.
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // Make the write visible to the other blocks before counting this block.
    __threadfence();
    // atomicInc wraps the counter back to zero once it reaches wrap_around.
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // True in every thread of the last block, false in every other block.
  lastBlock = __syncthreads_or(lastBlock);

  // The last block reduces the per-block partial results.
  if (lastBlock) {
    temp = identity;
    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }
    temp = block_reduce<ReduceOperator, WarpSize>(temp, identity);
    if (threadId == 0)
      val = temp;
  }

  // Only thread 0 of the last block returns true; it holds the final value in 'val'.
  return lastBlock && threadId == 0;
}
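
// Illustrative sketch, not part of the original header: grid_reduce must be called by
// every thread of the grid; exactly one thread (thread 0 of the last finishing block)
// gets 'true' back and owns the final value. 'example_grid_sum', 'partials' and
// 'counter' are assumed names; 'partials' needs one slot per block and 'counter'
// must start at zero. A 1-D launch with 32-lane (CUDA) warps is assumed.
template <typename T> __global__ void
example_grid_sum(const T* in, int n, SmallSpan<T> partials,
                 unsigned int* counter, T* result)
{
  using Op = SimpleReduceOperator<T, eAtomicOperation::Add>;
  int gid = blockIdx.x * blockDim.x + threadIdx.x;
  T v = (gid < n) ? in[gid] : T(0);
  if (grid_reduce<Op, 32>(v, T(0), partials, counter))
    *result = v; // only the single "winning" thread writes the final sum
}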
 
template <typename DataType, typename ReduceOperator>
ARCCORE_DEVICE inline void
_applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  DataType identity = dev_info.m_identity;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* ptr = dev_info.m_device_final_ptr;
  DataType v = dev_info.m_current_value;
#if HIP_VERSION_MAJOR >= 7
  // Starting with ROCm 7, warpSize is no longer a compile-time constant:
  // use the warp size provided at runtime instead.
  const Int32 warp_size = dev_info.m_warp_size;
#else
#if defined(__HIP__)
  constexpr const Int32 WARP_SIZE = warpSize;
#else
  constexpr const Int32 WARP_SIZE = 32;
#endif
#endif
 
#if HIP_VERSION_MAJOR >= 7
  bool is_done = false;
  if (warp_size == 64)
    is_done = grid_reduce<ReduceOperator, 64>(v, identity, grid_buffer, device_count);
  else if (warp_size == 32)
    is_done = grid_reduce<ReduceOperator, 32>(v, identity, grid_buffer, device_count);
  else
    assert(false && "Bad warp size (should be 32 or 64)");
#else
  bool is_done = grid_reduce<ReduceOperator, WARP_SIZE>(v, identity, grid_buffer, device_count);
#endif

  // Thread 0 of the last block holds the final reduced value: publish it.
  if (is_done)
    *ptr = v;
}
 
// Device entry points for the three supported reductions: each picks the matching
// SimpleReduceOperator and forwards to _applyDeviceGeneric.

  // Sum
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);

  // Maximum
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);

  // Minimum
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
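
// Illustrative sketch, not part of the original header: whatever host code drives
// these reductions has to provide the scratch storage consumed above: a grid buffer
// with one slot per block, an unsigned counter starting at zero, and a slot for the
// final value. A plain CUDA-runtime allocation could look like the helper below;
// 'ReduceScratch' and 'make_reduce_scratch' are assumed names, and 'double' is used
// only as an example element type.
#if defined(__CUDACC__)
struct ReduceScratch
{
  double* grid_buffer = nullptr; // one slot per block, read back by the last block
  unsigned int* device_count = nullptr; // block counter, must start at zero
  double* final_value = nullptr; // written once the grid reduction is done
};

inline ReduceScratch
make_reduce_scratch(int nb_block)
{
  ReduceScratch s;
  cudaMalloc(reinterpret_cast<void**>(&s.grid_buffer), nb_block * sizeof(double));
  cudaMalloc(reinterpret_cast<void**>(&s.device_count), sizeof(unsigned int));
  cudaMemset(s.device_count, 0, sizeof(unsigned int));
  cudaMalloc(reinterpret_cast<void**>(&s.final_value), sizeof(double));
  return s;
}
#endif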
 
// Referenced types:
//  - eAtomicOperation: kind of atomic operation supported.
//  - Int64 (std::int64_t): signed 64-bit integer type.
//  - Int32 (std::int32_t): signed 32-bit integer type.

} // namespace Arcane::Accelerator::impl

#endif