Arcane  4.1.12.0
User documentation
Loading...
Searching...
No Matches
CommonCudaHipReduceImpl.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CommonCudaHipReduceImpl.h (C) 2000-2026 */
9/* */
10/* CUDA and HIP implementation of reductions. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
13#define ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17// This file must only be included by 'arcane/accelerator/Reduce.h'
18// and is only valid when compiled by the CUDA and HIP compilers
19
20#include "arccore/accelerator/AcceleratorGlobal.h"
21
22/*---------------------------------------------------------------------------*/
23/*---------------------------------------------------------------------------*/
24
25// Warning: with ROCm and a GPU on a PCI express bus, most
26// atomic methods do not work if the pointer is allocated
27// in unified memory. The problem seems to occur with atomicMin, atomicMax,
28// atomicInc. However, atomicAdd seems to work.
29
30namespace Arcane::Accelerator::Impl
31{
32
33__device__ __forceinline__ unsigned int getThreadId()
34{
35 int threadId = threadIdx.x;
36 return threadId;
37}
38
39__device__ __forceinline__ unsigned int getBlockId()
40{
41 int blockId = blockIdx.x;
42 return blockId;
43}
44
45constexpr const Int32 MAX_BLOCK_SIZE = 1024;
46
47#if defined(__CUDACC__)
48ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
49{
50 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
51}
52
53ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
54{
55 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
56}
57
58ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
59{
60 return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
61}
62
63ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
64{
65 return ::__shfl_sync(0xffffffffu, var, laneMask);
66}
67
68ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
69{
70 return ::__shfl_sync(0xffffffffu, var, laneMask);
71}
72
73ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
74{
75 return ::__shfl_sync(0xffffffffu, var, laneMask);
76}
77#endif
78#if defined(__HIP__)
79ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
80{
81 return ::__shfl_xor(var, laneMask);
82}
83
84ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
85{
86 return ::__shfl_xor(var, laneMask);
87}
88
89ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
90{
91 return ::__shfl_xor(var, laneMask);
92}
93
94ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
95{
96 return ::__shfl(var, laneMask);
97}
98
99ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
100{
101 return ::__shfl(var, laneMask);
102}
103
104ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
105{
106 return ::__shfl(var, laneMask);
107}
108#endif
109
110/*---------------------------------------------------------------------------*/
111/*---------------------------------------------------------------------------*/
112
113// This implementation is that of RAJA
114//! reduce values in block into thread 0
115template <typename ReduceOperator, Int32 WarpSize, typename T>
116ARCCORE_DEVICE inline T block_reduce(T val)
117{
118 constexpr Int32 WARP_SIZE = WarpSize;
119 constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
120 int numThreads = blockDim.x;
121
122 int threadId = getThreadId();
123
124 int warpId = threadId % WARP_SIZE;
125 int warpNum = threadId / WARP_SIZE;
126
127 T temp = val;
128
129 if (numThreads % WARP_SIZE == 0) {
130
131 // reduce each warp
132 for (int i = 1; i < WARP_SIZE; i *= 2) {
133 T rhs = Impl::shfl_xor_sync(temp, i);
134 ReduceOperator::combine(temp, rhs);
135 }
136 }
137 else {
138
139 // reduce each warp
140 for (int i = 1; i < WARP_SIZE; i *= 2) {
141 int srcLane = threadId ^ i;
142 T rhs = Impl::shfl_sync(temp, srcLane);
143 // only add from threads that exist (don't double count own value)
144 if (srcLane < numThreads) {
145 ReduceOperator::combine(temp, rhs);
146 }
147 }
148 }
149
150 //printf("CONTINUE tid=%d wid=%d wnum=%d\n",threadId,warpId,warpNum);
151
152 // reduce per warp values
153 if (numThreads > WARP_SIZE) {
154
155 __shared__ T sd[MAX_WARPS];
156
157 // write per warp values to shared memory
158 if (warpId == 0) {
159 sd[warpNum] = temp;
160 }
161
162 __syncthreads();
163
164 if (warpNum == 0) {
165
166 // read per warp values
167 if (warpId * WARP_SIZE < numThreads) {
168 temp = sd[warpId];
169 }
170 else {
171 temp = ReduceOperator::identity();
172 }
173 for (int i = 1; i < WARP_SIZE; i *= 2) {
174 T rhs = Impl::shfl_xor_sync(temp, i);
175 ReduceOperator::combine(temp, rhs);
176 }
177 }
178
179 __syncthreads();
180 }
181 return temp;
182}
183
184/*---------------------------------------------------------------------------*/
185/*---------------------------------------------------------------------------*/
186//! reduce values in grid into thread 0 of last running block
187// returns true if put reduced value in val
188template <typename ReduceOperator, Int32 WarpSize, typename T>
189ARCCORE_DEVICE inline bool
190grid_reduce(T& val, SmallSpan<T> device_mem, unsigned int* device_count)
191{
192 int numBlocks = gridDim.x;
193 int numThreads = blockDim.x;
194 int wrap_around = numBlocks - 1;
195 int blockId = blockIdx.x;
196 int threadId = threadIdx.x;
197
198 T temp = block_reduce<ReduceOperator, WarpSize, T>(val);
199
200 // one thread per block writes to device_mem
201 bool lastBlock = false;
202 if (threadId == 0) {
203 device_mem[blockId] = temp;
204 // ensure write visible to all threadblocks
205 __threadfence();
206 // increment counter, (wraps back to zero if old count == wrap_around)
207 // Warning: with ROCm and a GPU on a PCI express bus, if 'device_count'
208 // is allocated in unified memory, atomicAdd does not work.
209 // In this case, we can replace it with:
210 // unsigned int old_count = ::atomicAdd(device_count, 1) % (wrap_around+1);
211 unsigned int old_count = ::atomicInc(device_count, wrap_around);
212 lastBlock = ((int)old_count == wrap_around);
213 }
214
215 // returns non-zero value if any thread passes in a non-zero value
216 lastBlock = __syncthreads_or(lastBlock);
217
218 // last block accumulates values from device_mem
219 if (lastBlock) {
220 temp = ReduceOperator::identity();
221
222 for (int i = threadId; i < numBlocks; i += numThreads) {
223 ReduceOperator::combine(temp, device_mem[i]);
224 }
225
226 temp = block_reduce<ReduceOperator, WarpSize, T>(temp);
227
228 // one thread returns value
229 if (threadId == 0) {
230 val = temp;
231 }
232 }
233
234 return lastBlock && threadId == 0;
235}
236
237/*---------------------------------------------------------------------------*/
238/*---------------------------------------------------------------------------*/
239
240template <typename ReduceOperator>
241ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void
242_applyDeviceGeneric(const ReduceDeviceInfo<typename ReduceOperator::DataType>& dev_info)
243{
244 using DataType = typename ReduceOperator::DataType;
245 SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
246 unsigned int* device_count = dev_info.m_device_count;
247 DataType* host_pinned_ptr = dev_info.m_host_pinned_final_ptr;
248 DataType v = dev_info.m_current_value;
249 // With CUDA, the size of a warp is always 32.
250 // With HIP, the size of a warp is 64 for GFX9 class GPUs
251 // (MI50, MI100, ... , MI300) and 32 for GFX10 and subsequent architectures
252#if defined(__GFX9__)
253 constexpr const Int32 WARP_SIZE = 64;
254#else
255 constexpr const Int32 WARP_SIZE = 32;
256#endif
257
258 //if (impl::getThreadId()==0){
259 // printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n",
260 // getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr,
261 // (void*)device_count,(do_grid_reduce)?1:0);
262 //}
263 bool is_done = grid_reduce<ReduceOperator, WARP_SIZE, DataType>(v, grid_buffer, device_count);
264 if (is_done) {
265 *host_pinned_ptr = v;
266 // It is important to reset this variable to zero for the next use of a Reducer.
267 (*device_count) = 0;
268 }
269}
270
271/*---------------------------------------------------------------------------*/
272/*---------------------------------------------------------------------------*/
273
274} // End namespace Arcane::Accelerator::Impl
275
276/*---------------------------------------------------------------------------*/
277/*---------------------------------------------------------------------------*/
278
279#endif
std::int64_t Int64
Signed integer type of 64 bits.
std::int32_t Int32
Signed integer type of 32 bits.