12#ifndef ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
13#define ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
20#include "arccore/accelerator/AcceleratorGlobal.h"
30namespace Arcane::Accelerator::Impl
33__device__ __forceinline__
unsigned int getThreadId()
35 int threadId = threadIdx.x;
39__device__ __forceinline__
unsigned int getBlockId()
41 int blockId = blockIdx.x;
45constexpr const Int32 MAX_BLOCK_SIZE = 1024;
47#if defined(__CUDACC__)
48ARCCORE_DEVICE
inline double shfl_xor_sync(
double var,
int laneMask)
50 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
53ARCCORE_DEVICE
inline int shfl_xor_sync(
int var,
int laneMask)
55 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
58ARCCORE_DEVICE
inline Int64 shfl_xor_sync(
Int64 var,
int laneMask)
60 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
63ARCCORE_DEVICE
inline double shfl_sync(
double var,
int laneMask)
65 return ::__shfl_sync(0xffffffffu, var, laneMask);
68ARCCORE_DEVICE
inline int shfl_sync(
int var,
int laneMask)
70 return ::__shfl_sync(0xffffffffu, var, laneMask);
73ARCCORE_DEVICE
inline Int64 shfl_sync(
Int64 var,
int laneMask)
75 return ::__shfl_sync(0xffffffffu, var, laneMask);
79ARCCORE_DEVICE
inline double shfl_xor_sync(
double var,
int laneMask)
81 return ::__shfl_xor(var, laneMask);
84ARCCORE_DEVICE
inline int shfl_xor_sync(
int var,
int laneMask)
86 return ::__shfl_xor(var, laneMask);
89ARCCORE_DEVICE
inline Int64 shfl_xor_sync(
Int64 var,
int laneMask)
91 return ::__shfl_xor(var, laneMask);
94ARCCORE_DEVICE
inline double shfl_sync(
double var,
int laneMask)
96 return ::__shfl(var, laneMask);
99ARCCORE_DEVICE
inline int shfl_sync(
int var,
int laneMask)
101 return ::__shfl(var, laneMask);
104ARCCORE_DEVICE
inline Int64 shfl_sync(
Int64 var,
int laneMask)
106 return ::__shfl(var, laneMask);
115template <
typename ReduceOperator, Int32 WarpSize,
typename T>
116ARCCORE_DEVICE
inline T block_reduce(T val)
118 constexpr Int32 WARP_SIZE = WarpSize;
119 constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
120 int numThreads = blockDim.x;
122 int threadId = getThreadId();
124 int warpId = threadId % WARP_SIZE;
125 int warpNum = threadId / WARP_SIZE;
129 if (numThreads % WARP_SIZE == 0) {
132 for (
int i = 1; i < WARP_SIZE; i *= 2) {
133 T rhs = Impl::shfl_xor_sync(temp, i);
134 ReduceOperator::combine(temp, rhs);
140 for (
int i = 1; i < WARP_SIZE; i *= 2) {
141 int srcLane = threadId ^ i;
142 T rhs = Impl::shfl_sync(temp, srcLane);
144 if (srcLane < numThreads) {
145 ReduceOperator::combine(temp, rhs);
153 if (numThreads > WARP_SIZE) {
155 __shared__ T sd[MAX_WARPS];
167 if (warpId * WARP_SIZE < numThreads) {
171 temp = ReduceOperator::identity();
173 for (
int i = 1; i < WARP_SIZE; i *= 2) {
174 T rhs = Impl::shfl_xor_sync(temp, i);
175 ReduceOperator::combine(temp, rhs);
188template <
typename ReduceOperator, Int32 WarpSize,
typename T>
189ARCCORE_DEVICE
inline bool
190grid_reduce(T& val, SmallSpan<T> device_mem,
unsigned int* device_count)
192 int numBlocks = gridDim.x;
193 int numThreads = blockDim.x;
194 int wrap_around = numBlocks - 1;
195 int blockId = blockIdx.x;
196 int threadId = threadIdx.x;
198 T temp = block_reduce<ReduceOperator, WarpSize, T>(val);
201 bool lastBlock =
false;
203 device_mem[blockId] = temp;
211 unsigned int old_count = ::atomicInc(device_count, wrap_around);
212 lastBlock = ((int)old_count == wrap_around);
216 lastBlock = __syncthreads_or(lastBlock);
220 temp = ReduceOperator::identity();
222 for (
int i = threadId; i < numBlocks; i += numThreads) {
223 ReduceOperator::combine(temp, device_mem[i]);
226 temp = block_reduce<ReduceOperator, WarpSize, T>(temp);
234 return lastBlock && threadId == 0;
240template <
typename ReduceOperator>
241ARCCORE_INLINE_REDUCE ARCCORE_DEVICE
void
244 using DataType =
typename ReduceOperator::DataType;
245 SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
246 unsigned int* device_count = dev_info.m_device_count;
247 DataType* host_pinned_ptr = dev_info.m_host_pinned_final_ptr;
248 DataType v = dev_info.m_current_value;
253 constexpr const Int32 WARP_SIZE = 64;
255 constexpr const Int32 WARP_SIZE = 32;
263 bool is_done = grid_reduce<ReduceOperator, WARP_SIZE, DataType>(v, grid_buffer, device_count);
265 *host_pinned_ptr = v;
std::int64_t Int64
Signed integer type of 64 bits.
std::int32_t Int32
Signed integer type of 32 bits.