Arcane v3.16.9.0
Developer documentation
CommonCudaHipReduceImpl.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CommonCudaHipReduceImpl.h                                   (C) 2000-2025 */
/*                                                                           */
/* CUDA and HIP implementation of reductions.                                */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_COMMONCUDAHIPREDUCEIMPL_H
#define ARCANE_ACCELERATOR_COMMONCUDAHIPREDUCEIMPL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This file must only be included by 'arcane/accelerator/Reduce.h'
// and is only valid when compiled with the CUDA or HIP compiler.

#include "arcane/accelerator/CommonCudaHipAtomicImpl.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Warning: with ROCm and a GPU on a PCI Express bus, most of the atomic
// operations do not work when the pointer is allocated in unified memory.
// The problem seems to affect atomicMin, atomicMax and atomicInc, whereas
// atomicAdd appears to work.
namespace Arcane::Accelerator::impl
{

__device__ __forceinline__ unsigned int getThreadId()
{
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;
  return threadId;
}

__device__ __forceinline__ unsigned int getBlockId()
{
  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  return blockId;
}
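
// getThreadId()/getBlockId() linearize the 3D thread index within a block
// and the 2D block index within the grid. MAX_BLOCK_SIZE below corresponds
// to the usual upper bound of 1024 threads per block on CUDA and HIP
// devices; block_reduce() uses it to size its per-warp shared-memory buffer.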

constexpr const Int32 MAX_BLOCK_SIZE = 1024;

template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = val + v;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v > val ? v : val;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v < val ? v : val;
  }
};
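
// Usage sketch: with
//   double acc = 0.0;
//   SimpleReduceOperator<double, eAtomicOperation::Add>::apply(acc, x);
// 'apply' behaves like 'acc += x'; the Max and Min specializations keep the
// larger or smaller of the two operands.
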
#if defined(__CUDACC__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}
#endif
#if defined(__HIP__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}
#endif
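
// These overloads provide a single shfl_xor_sync()/shfl_sync() interface over
// CUDA's __shfl_xor_sync()/__shfl_sync() (using a full-warp mask) and HIP's
// __shfl_xor()/__shfl(), so that block_reduce() below can be written once for
// both backends.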

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
// This implementation is the one from RAJA
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline T block_reduce(T val, T identity)
{
  constexpr Int32 WARP_SIZE = WarpSize;
  constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;

  int threadId = getThreadId();

  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }
  }
  else {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      // only add from threads that exist (don't double count own value)
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  //printf("CONTINUE tid=%d wid=%d wnum=%d\n",threadId,warpId,warpNum);

  // reduce per warp values
  if (numThreads > WARP_SIZE) {

    __shared__ T sd[MAX_WARPS];

    // write per warp values to shared memory
    if (warpId == 0) {
      sd[warpNum] = temp;
    }

    __syncthreads();

    if (warpNum == 0) {

      // read per warp values
      if (warpId * WARP_SIZE < numThreads) {
        temp = sd[warpId];
      }
      else {
        temp = identity;
      }
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }

    __syncthreads();
  }
  return temp;
}
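
// Worked example of the butterfly loop above, for WARP_SIZE = 32: the loop
// runs with i = 1, 2, 4, 8, 16, and after the step with a given i each lane
// holds the reduction of the 2*i lanes of its group, so after the last step
// every lane holds the full warp-wide value. Note that 'warpId' is the lane
// index inside the warp (threadId % WARP_SIZE) and 'warpNum' is the warp
// index inside the block: lane 0 of each warp writes its partial value to
// shared memory and warp 0 then reduces the per-warp partials the same way.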

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
// Returns true if the reduced value has been written into 'val'
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, T identity, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;
  int wrap_around = numBlocks - 1;

  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                (gridDim.x * gridDim.y) * blockIdx.z;

  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;

  T temp = block_reduce<ReduceOperator, WarpSize>(val, identity);

  // one thread per block writes to device_mem
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // ensure write visible to all threadblocks
    __threadfence();
    // increment counter, (wraps back to zero if old count == wrap_around)
    // Warning: with ROCm and a GPU on a PCI Express bus, if 'device_count'
    // is allocated in unified memory, the atomicInc below does not work.
    // In that case it can be replaced by:
    // unsigned int old_count = ::atomicAdd(device_count, 1) % (wrap_around+1);
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // returns non-zero value if any thread passes in a non-zero value
  lastBlock = __syncthreads_or(lastBlock);

  // last block accumulates values from device_mem
  if (lastBlock) {
    temp = identity;

    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }

    temp = block_reduce<ReduceOperator, WarpSize>(temp, identity);

    // one thread returns value
    if (threadId == 0) {
      val = temp;
    }
  }

  return lastBlock && threadId == 0;
}
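
// How the grid-level step works: every block first reduces its own values
// with block_reduce(), thread 0 of each block stores the per-block partial
// in device_mem[blockId], and atomicInc(device_count, numBlocks - 1) counts
// the blocks that have finished (wrapping back to zero). The block that
// observes old_count == numBlocks - 1 is the last one; it reduces all the
// per-block partials and only its thread 0 returns true, so exactly one
// thread in the whole grid reports the final value.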

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType, typename ReduceOperator>
ARCANE_INLINE_REDUCE ARCCORE_DEVICE void _applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  DataType identity = dev_info.m_identity;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* ptr = dev_info.m_device_final_ptr;
  DataType v = dev_info.m_current_value;
#if HIP_VERSION_MAJOR >= 7
  // Starting with ROCm 7, the warp size is not known at compile time: it is
  // either 32 or 64. To work around this, two instantiations of the reduction
  // are compiled and the right one is chosen dynamically. This is probably
  // slightly less efficient than the previous mechanism. Another solution
  // would be to choose the warp size at compile time. That is possible on
  // HPC architectures such as the MI300, where this value is fixed, but on
  // RDNA architectures both values are possible.
  // TODO: on AMD architectures with a fixed warp size, use that warp size
  // as a 'constexpr' to avoid the 'if'.
  const Int32 warp_size = dev_info.m_warp_size;
#else
#if defined(__HIP__)
  constexpr const Int32 WARP_SIZE = warpSize;
#else
  constexpr const Int32 WARP_SIZE = 32;
#endif
#endif

  //if (impl::getThreadId()==0){
  //  printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n",
  //         getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr,
  //         (void*)device_count,(do_grid_reduce)?1:0);
  //}
#if HIP_VERSION_MAJOR >= 7
  bool is_done = false;
  if (warp_size == 64)
    is_done = grid_reduce<ReduceOperator, 64>(v, identity, grid_buffer, device_count);
  else if (warp_size == 32)
    is_done = grid_reduce<ReduceOperator, 32>(v, identity, grid_buffer, device_count);
  else
    assert(false && "Bad warp size (should be 32 or 64)");
#else
  bool is_done = grid_reduce<ReduceOperator, WARP_SIZE>(v, identity, grid_buffer, device_count);
#endif
  if (is_done) {
    *ptr = v;
    // It is important to reset this variable to zero for the next use of a Reducer.
    (*device_count) = 0;
  }
}
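
// Since grid_reduce() returns true for exactly one thread of the whole grid
// (thread 0 of the last block), the write to *ptr and the reset of
// *device_count above are performed by a single thread and do not race.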

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorSum<DataType>::
applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMax<DataType>::
applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMin<DataType>::
applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif