#ifndef ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
#define ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H

#include "arccore/accelerator/CommonCudaHipAtomicImpl.h"

#include <cassert>
namespace Arcane::Accelerator::impl
{
__device__ __forceinline__
unsigned int getThreadId()
{
  int threadId = threadIdx.x;
  return threadId;
}

__device__ __forceinline__
unsigned int getBlockId()
{
  int blockId = blockIdx.x;
  return blockId;
}
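// NOTE: only the '.x' components are used, so the reduction kernels are
// assumed to be launched with a 1D grid and 1D blocks
// (blockDim.y == blockDim.z == 1 and gridDim.y == gridDim.z == 1).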
constexpr const Int32 MAX_BLOCK_SIZE = 1024;
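// 1024 is the maximum number of threads per block on current CUDA and HIP
// hardware; it bounds the shared-memory scratch array used by block_reduce()
// (MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE entries).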
template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v) { val = val + v; }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v) { val = v > val ? v : val; }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v) { val = v < val ? v : val; }
};
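// Illustrative usage (not part of the implementation): device-side reduction
// code selects one specialization and folds values into an accumulator, e.g.
//   using Op = SimpleReduceOperator<double, eAtomicOperation::Add>;
//   double acc = 0.0;
//   Op::apply(acc, 3.0); // acc == 3.0
//   Op::apply(acc, 2.0); // acc == 5.0
// block_reduce() and grid_reduce() below use exactly this Op::apply()
// interface to combine per-thread values.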
#if defined(__CUDACC__)

// CUDA: use the *_sync shuffle intrinsics with a full-warp mask.

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int srcLane)
{
  return ::__shfl_sync(0xffffffffu, var, srcLane);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int srcLane)
{
  return ::__shfl_sync(0xffffffffu, var, srcLane);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int srcLane)
{
  return ::__shfl_sync(0xffffffffu, var, srcLane);
}
#else

// HIP/ROCm: use the legacy __shfl/__shfl_xor intrinsics (no mask argument).

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int srcLane)
{
  return ::__shfl(var, srcLane);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int srcLane)
{
  return ::__shfl(var, srcLane);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int srcLane)
{
  return ::__shfl(var, srcLane);
}

#endif
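// Both branches expose the same shfl_sync()/shfl_xor_sync() overloads so the
// reduction code below is backend-agnostic. shfl_xor_sync(v, i) returns the
// value of 'v' held by lane (laneId ^ i) of the same warp: with i == 1 lanes
// 0/1, 2/3, ... exchange values, with i == 2 lanes 0/2, 1/3, ... and so on.
// The CUDA versions pass 0xffffffffu as the participation mask, i.e. every
// lane of the warp is assumed to be active. block_reduce() chains these
// exchanges in a butterfly pattern (i = 1, 2, 4, ..., WARP_SIZE/2) so that
// every lane ends up with the reduction of all WARP_SIZE lane values.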
template <typename ReduceOperator, Int32 WarpSize, typename T, T identity>
ARCCORE_DEVICE inline T block_reduce(T val)
{
  constexpr Int32 WARP_SIZE = WarpSize;
  constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
  int numThreads = blockDim.x;
  int threadId = getThreadId();
  // Lane index inside the warp and warp index inside the block.
  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {
    // Block made of complete warps: butterfly reduction inside each warp.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }
  }
  else {
    // Last warp is incomplete: only combine with lanes that really exist.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  // Combine the per-warp partial results through shared memory.
  if (numThreads > WARP_SIZE) {
    __shared__ T sd[MAX_WARPS];
    // Lane 0 of each warp publishes its warp's partial result.
    if (warpId == 0)
      sd[warpNum] = temp;
    __syncthreads();
    // The first warp reduces the per-warp partial results.
    if (warpNum == 0) {
      if (warpId * WARP_SIZE < numThreads) {
        temp = sd[warpId];
      }
      else {
        temp = identity;
      }
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }
    __syncthreads();
  }

  return temp;
}
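// Contract relied upon by grid_reduce(): after block_reduce() returns, thread
// 0 of the block holds the reduction of the values passed in by all threads
// of the block; the other threads may only hold partial results.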
template <typename ReduceOperator, Int32 WarpSize, typename T, T identity>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x;
  int numThreads = blockDim.x;
  int wrap_around = numBlocks - 1;
  int blockId = blockIdx.x;
  int threadId = threadIdx.x;

  // Reduce the values of this block.
  T temp = block_reduce<ReduceOperator, WarpSize, T, identity>(val);

  // One thread per block writes the block result to 'device_mem' and
  // increments the block counter.
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // Make the write visible to the other blocks before counting.
    __threadfence();
    // atomicInc() wraps back to zero when the old value equals 'wrap_around',
    // so the counter is ready for the next reduction.
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // Returns a non-zero value if any thread passes in a non-zero value.
  lastBlock = __syncthreads_or(lastBlock);

  // The last block to finish accumulates the per-block values.
  if (lastBlock) {
    temp = identity;
    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }
    temp = block_reduce<ReduceOperator, WarpSize, T, identity>(temp);
    // Only thread 0 publishes the final value.
    if (threadId == 0) {
      val = temp;
    }
  }

  // True only for the single thread that holds the final result.
  return lastBlock && threadId == 0;
}
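// Host-side expectations (assumption, deduced from the kernel code above):
// the 'device_mem' span must provide at least gridDim.x elements and
// 'device_count' must point to a device-visible counter initialized to zero.
// Because atomicInc() wraps the counter back to zero once the last block has
// passed, the same counter can be reused by the next reduction without being
// reset.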
// NOTE: the 'ReduceDeviceInfo' parameter type name is assumed to be declared
// in the reduction front-end header.
template <typename DataType, typename ReduceOperator, DataType Identity>
ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void
_applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* host_pinned_ptr = dev_info.m_host_pinned_final_ptr;
  DataType v = dev_info.m_current_value;

#if HIP_VERSION_MAJOR >= 7
  // With ROCm >= 7 'warpSize' is no longer a compile-time constant, so the
  // warp size is taken from the runtime information.
  const Int32 warp_size = dev_info.m_warp_size;
#elif defined(__HIP_DEVICE_COMPILE__)
  // Assumption: older HIP device compilation still provides 'warpSize' as a
  // compile-time constant (64 on AMD GPUs).
  constexpr const Int32 WARP_SIZE = warpSize;
#else
  constexpr const Int32 WARP_SIZE = 32;
#endif

#if HIP_VERSION_MAJOR >= 7
  bool is_done = false;
  if (warp_size == 64)
    is_done = grid_reduce<ReduceOperator, 64, DataType, Identity>(v, grid_buffer, device_count);
  else if (warp_size == 32)
    is_done = grid_reduce<ReduceOperator, 32, DataType, Identity>(v, grid_buffer, device_count);
  else
    assert(false && "Bad warp size (should be 32 or 64)");
#else
  bool is_done = grid_reduce<ReduceOperator, WARP_SIZE, DataType, Identity>(v, grid_buffer, device_count);
#endif

  // Only the thread holding the final result gets 'is_done == true'; it
  // copies the value to host-pinned memory.
  if (is_done) {
    *host_pinned_ptr = v;
  }
}
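// The three specializations below bind _applyDeviceGeneric() to a concrete
// operation (Sum/Max/Min) by selecting the matching SimpleReduceOperator and
// passing the operation's neutral element ('identity()', e.g. 0 for the sum)
// as the Identity template argument. The functor names used here
// (ReduceFunctorSum/Max/Min and their '_applyDevice' member) are assumed to
// be declared in the reduction front-end header.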
template <typename DataType> ARCCORE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorSum<DataType>::_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator, identity()>(dev_info);
}

template <typename DataType> ARCCORE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMax<DataType>::_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator, identity()>(dev_info);
}

template <typename DataType> ARCCORE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMin<DataType>::_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator, identity()>(dev_info);
}

} // namespace Arcane::Accelerator::impl

#endif