#ifndef ARCANE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
#define ARCANE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H

#include "arcane/accelerator/CommonCudaHipAtomicImpl.h"

namespace Arcane::Accelerator::impl
{
__device__ __forceinline__ unsigned int getThreadId()
{
  // Linear index of the thread inside its block.
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;
  return threadId;
}
__device__ __forceinline__ unsigned int getBlockId()
{
  // Linear index of the block inside the grid (x/y dimensions only).
  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  return blockId;
}
// Warp width depends on the backend: HIP exposes it via 'warpSize',
// CUDA warps are always 32 lanes wide.
#if defined(__HIP__) // assumption: HIP vs CUDA selection guard
constexpr const Int32 WARP_SIZE = warpSize;
#else
constexpr const Int32 WARP_SIZE = 32;
#endif

constexpr const Int32 MAX_BLOCK_SIZE = 1024;
constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = val + v;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v > val ? v : val;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:
  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v < val ? v : val;
  }
};
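/*
 * Illustrative sketch of how the SimpleReduceOperator specializations combine
 * two values (hypothetical variable names):
 *
 *   double acc = 1.0;
 *   SimpleReduceOperator<double, eAtomicOperation::Add>::apply(acc, 2.0); // acc == 3.0
 *   SimpleReduceOperator<double, eAtomicOperation::Max>::apply(acc, 5.0); // acc == 5.0
 *   SimpleReduceOperator<double, eAtomicOperation::Min>::apply(acc, 4.0); // acc == 4.0
 */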
#if defined(__CUDACC__)

// CUDA: warp shuffles take an explicit participation mask (all 32 lanes here).

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}
#else

// HIP/ROCm: the unmasked shuffle intrinsics are used.

ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

#endif
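/*
 * Note on the shuffle wrappers above: shfl_xor_sync(x, i) exchanges 'x' with
 * the lane whose index differs in bit 'i' (a butterfly exchange), while
 * shfl_sync(x, lane) reads 'x' from an explicit source lane. On CUDA the
 * *_sync intrinsics take a full-warp participation mask (0xffffffffu); on
 * HIP/ROCm the unmasked __shfl_xor/__shfl forms are used. A minimal sketch of
 * the butterfly reduction over one full warp (hypothetical names, sum only):
 *
 *   int lane = threadIdx.x % WARP_SIZE;
 *   double x = input[lane];
 *   for (int i = 1; i < WARP_SIZE; i *= 2)
 *     x += shfl_xor_sync(x, i);   // afterwards every lane holds the warp sum
 */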
template <typename ReduceOperator, typename T>
ARCCORE_DEVICE inline T block_reduce(T val, T identity)
{
  int numThreads = blockDim.x * blockDim.y * blockDim.z;
  int threadId = getThreadId();
  // Lane index inside the warp and warp index inside the block.
  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {
    // Full warps: butterfly (xor) shuffle reduction inside each warp.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }
  }
  else {
    // Partial last warp: only combine values coming from existing threads.
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  // Several warps: combine the per-warp results through shared memory,
  // then reduce them in the first warp.
  if (numThreads > WARP_SIZE) {
    __shared__ T sd[MAX_WARPS];
    // Lane 0 of each warp publishes its warp's result.
    if (warpId == 0)
      sd[warpNum] = temp;
    __syncthreads();
    if (warpNum == 0) {
      // Read the per-warp values; missing warps contribute the identity.
      if (warpId * WARP_SIZE < numThreads)
        temp = sd[warpId];
      else
        temp = identity;
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }
    __syncthreads();
  }

  return temp;
}
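/*
 * Usage sketch for block_reduce() (hypothetical kernel, namespace
 * qualifications omitted; the real entry points are the ReduceFunctor*
 * classes below). Every thread of the block must call block_reduce();
 * the thread with getThreadId()==0 is guaranteed to hold the block-wide
 * result afterwards.
 *
 *   __global__ void partialSum(const double* input, double* partial_sums, int n)
 *   {
 *     int tid = blockIdx.x * blockDim.x + threadIdx.x;
 *     double v = (tid < n) ? input[tid] : 0.0;
 *     using Op = SimpleReduceOperator<double, eAtomicOperation::Add>;
 *     double block_sum = block_reduce<Op>(v, 0.0);
 *     if (getThreadId() == 0)
 *       partial_sums[blockIdx.x] = block_sum;
 *   }
 */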
template <typename ReduceOperator, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, T identity, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;
  int wrap_around = numBlocks - 1;

  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                (gridDim.x * gridDim.y) * blockIdx.z;
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;

  T temp = block_reduce<ReduceOperator>(val, identity);

  // One thread per block publishes its partial result and increments the
  // block counter; the block that sees the final count does the last pass.
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    __threadfence();
    // atomicInc wraps back to zero once old_count reaches wrap_around.
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // True for the whole block if any of its threads saw 'lastBlock'.
  lastBlock = __syncthreads_or(lastBlock);

  if (lastBlock) {
    // Last block: accumulate every per-block partial result.
    temp = identity;
    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }
    temp = block_reduce<ReduceOperator>(temp, identity);
    if (threadId == 0)
      val = temp;
  }

  // Only thread 0 of the last block returns true (and holds the result in 'val').
  return lastBlock && threadId == 0;
}
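/*
 * Usage note for grid_reduce(): this is a "last block" reduction. Every block
 * writes its partial result into 'device_mem' (which must provide at least
 * gridDim.x * gridDim.y * gridDim.z elements) and atomically increments
 * '*device_count' (which must start at zero). The block that observes the
 * final count re-reduces all partial results; only thread 0 of that block
 * returns true, with the final value stored back into 'val'. A call-site
 * sketch (hypothetical kernel, namespace qualifications omitted):
 *
 *   __global__ void sumAll(const double* input, SmallSpan<double> scratch,
 *                          unsigned int* counter, double* result)
 *   {
 *     using Op = SimpleReduceOperator<double, eAtomicOperation::Add>;
 *     double v = input[blockIdx.x * blockDim.x + threadIdx.x];
 *     if (grid_reduce<Op>(v, 0.0, scratch, counter))
 *       *result = v;   // reached by exactly one thread in the whole grid
 *   }
 */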
template <typename DataType, typename ReduceOperator, typename AtomicReduceOperator>
ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void _applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  DataType identity = dev_info.m_identity;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* ptr = dev_info.m_device_final_ptr;
  DataType v = dev_info.m_current_value;
  bool do_grid_reduce = dev_info.m_use_grid_reduce;

  if (do_grid_reduce) {
    // Grid-wide reduction: only the last thread writes the final value.
    bool is_done = grid_reduce<ReduceOperator>(v, identity, grid_buffer, device_count);
    if (is_done)
      *ptr = v;
  }
  else {
    // Block reduction followed by one atomic update per block.
    DataType rv = impl::block_reduce<ReduceOperator>(v, identity);
    if (impl::getThreadId() == 0) {
      AtomicReduceOperator::apply(ptr, rv);
    }
  }
}
template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorSum<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}
template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMax<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}
template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMin<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}

} // namespace Arcane::Accelerator::impl

#endif