#ifndef ARCANE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
#define ARCANE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H

#include "arcane/accelerator/CommonCudaHipAtomicImpl.h"

namespace Arcane::Accelerator::impl
{
// Flattened index of the thread within its (possibly 3D) block.
__device__ __forceinline__ unsigned int getThreadId()
{
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;
  return threadId;
}

// Flattened index of the block within the grid.
__device__ __forceinline__ unsigned int getBlockId()
{
  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  return blockId;
}

constexpr const Int32 MAX_BLOCK_SIZE = 1024;
// Reduction operator associated with an atomic operation (Add, Max or Min).
template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v) { val = val + v; }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v) { val = v > val ? v : val; }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v) { val = v < val ? v : val; }
};
#if defined(__CUDACC__)

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}
#else // ROCm/HIP path: use the non-_sync shuffle intrinsics.

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

#endif
// Block-level reduction of \a val: after the call, thread 0 of the block
// holds the value reduced over all threads of the block.
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline T block_reduce(T val, T identity)
{
  constexpr Int32 WARP_SIZE = WarpSize;
  constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;

  int threadId = getThreadId();

  int warpId = threadId % WARP_SIZE;   // lane index inside the warp
  int warpNum = threadId / WARP_SIZE;  // index of the warp inside the block

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {
    // Full warps: XOR butterfly, every lane ends up with the warp result.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }
  }
  else {
    // Partial last warp: only accumulate lanes that map to existing threads.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  // Combine the per-warp results through shared memory.
  if (numThreads > WARP_SIZE) {
    __shared__ T sd[MAX_WARPS];

    // Lane 0 of each warp publishes its partial result.
    if (warpId == 0)
      sd[warpNum] = temp;
    __syncthreads();

    // The first warp reduces the per-warp values.
    if (warpNum == 0) {
      if (warpId * WARP_SIZE < numThreads)
        temp = sd[warpId];
      else
        temp = identity;
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }
    __syncthreads();
  }

  return temp;
}
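/*
 * Worked example (illustration only, assuming a warp of 4 lanes holding the
 * values {a, b, c, d} and ReduceOperator = Add):
 *   i = 1: each lane exchanges with lane (threadId ^ 1) -> {a+b, a+b, c+d, c+d}
 *   i = 2: each lane exchanges with lane (threadId ^ 2) -> {a+b+c+d} in every lane
 * After log2(WARP_SIZE) steps of the XOR butterfly, every lane of the warp
 * holds the warp-wide result, so lane 0 alone can publish it to shared memory.
 */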
// Grid-level reduction of \a val. Returns true only for the single thread
// (thread 0 of the last block to finish) for which \a val holds the final value.
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, T identity, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;
  int wrap_around = numBlocks - 1;

  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                (gridDim.x * gridDim.y) * blockIdx.z;
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;

  T temp = block_reduce<ReduceOperator, WarpSize>(val, identity);

  // One thread per block publishes the block result and increments the counter.
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // Make the write visible to the other blocks before updating the counter.
    __threadfence();
    // atomicInc wraps back to 0 once the previous value reaches wrap_around.
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // True in every thread of the last block to finish, false elsewhere.
  lastBlock = __syncthreads_or(lastBlock);

  // The last block reduces the per-block partial results.
  if (lastBlock) {
    temp = identity;
    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }
    temp = block_reduce<ReduceOperator, WarpSize>(temp, identity);
    if (threadId == 0)
      val = temp;
  }

  return lastBlock && threadId == 0;
}
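/*
 * Usage sketch (hypothetical kernel, for illustration only; the kernel name
 * and launch-side setup are assumptions, not part of this header). A
 * reduction kernel calls grid_reduce() once per thread with a grid_buffer of
 * one slot per block and a device_count counter zeroed before the launch; the
 * single thread that gets back 'true' publishes the result:
 *
 * \code
 * template <typename T> __global__ void
 * hypotheticalSumKernel(SmallSpan<const T> values, SmallSpan<T> grid_buffer,
 *                       unsigned int* device_count, T* result)
 * {
 *   using namespace Arcane::Accelerator;
 *   using Op = impl::SimpleReduceOperator<T, eAtomicOperation::Add>;
 *   int tid = blockIdx.x * blockDim.x + threadIdx.x;
 *   T v = (tid < values.size()) ? values[tid] : T{};
 *   if (impl::grid_reduce<Op, 32>(v, T{}, grid_buffer, device_count))
 *     *result = v; // only thread 0 of the last finishing block reaches this
 * }
 * \endcode
 *
 * _applyDeviceGeneric() below wraps this pattern, taking all of the buffers
 * from a ReduceDeviceInfo structure.
 */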
// Generic implementation: performs the grid reduction described by \a dev_info
// and lets the last thread write the final value to the device pointer.
template <typename DataType, typename ReduceOperator>
ARCCORE_DEVICE inline void
_applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  DataType identity = dev_info.m_identity;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* ptr = dev_info.m_device_final_ptr;
  DataType v = dev_info.m_current_value;
#if HIP_VERSION_MAJOR >= 7
  // With ROCm 7 and later, the warp size is only known at run time.
  const Int32 warp_size = dev_info.m_warp_size;
#else
#if defined(__HIP__)
  constexpr const Int32 WARP_SIZE = warpSize;
#else
  constexpr const Int32 WARP_SIZE = 32;
#endif
#endif

#if HIP_VERSION_MAJOR >= 7
  bool is_done = false;
  if (warp_size == 64)
    is_done = grid_reduce<ReduceOperator, 64>(v, identity, grid_buffer, device_count);
  else if (warp_size == 32)
    is_done = grid_reduce<ReduceOperator, 32>(v, identity, grid_buffer, device_count);
  else
    assert(false && "Bad warp size (should be 32 or 64)");
#else
  bool is_done = grid_reduce<ReduceOperator, WARP_SIZE>(v, identity, grid_buffer, device_count);
#endif
  // Only the thread holding the final value writes it back.
  if (is_done)
    *ptr = v;
}
template <typename DataType> ARCCORE_DEVICE void
ReduceFunctorSum<DataType>::applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
}

template <typename DataType> ARCCORE_DEVICE void
ReduceFunctorMax<DataType>::applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
}

template <typename DataType> ARCCORE_DEVICE void
ReduceFunctorMin<DataType>::applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
}

} // namespace Arcane::Accelerator::impl

#endif