Arcane v4.1.2.0
CommonCudaHipReduceImpl.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CommonCudaHipReduceImpl.h                                   (C) 2000-2025 */
/*                                                                           */
/* CUDA and HIP implementation of reductions.                                */
/*---------------------------------------------------------------------------*/
#ifndef ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
#define ARCCORE_ACCELERATOR_COMMONCUDHIPAREDUCEIMPL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This file must only be included by 'arcane/accelerator/Reduce.h'
// and is only valid when compiled with the CUDA or HIP compiler.

#include "arccore/accelerator/CommonCudaHipAtomicImpl.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Warning: with ROCm and a GPU on a PCI Express bus, most atomic
// operations do not work when the pointer is allocated in unified memory.
// The problem seems to affect atomicMin, atomicMax and atomicInc.
// atomicAdd, however, appears to work.

namespace Arcane::Accelerator::impl
{

__device__ __forceinline__ unsigned int getThreadId()
{
  int threadId = threadIdx.x;
  return threadId;
}

__device__ __forceinline__ unsigned int getBlockId()
{
  int blockId = blockIdx.x;
  return blockId;
}

constexpr const Int32 MAX_BLOCK_SIZE = 1024;

template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = val + v;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v > val ? v : val;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v < val ? v : val;
  }
};

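// Usage note (illustrative sketch): the specializations above are meant to be
// used through 'ReduceOperator::apply(acc, x)', for example to accumulate a
// maximum into a local variable:
//
//   double acc = -1.0e300; // starting value playing the role of the identity
//   SimpleReduceOperator<double, eAtomicOperation::Max>::apply(acc, x);
//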
#if defined(__CUDACC__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}
#endif
#if defined(__HIP__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}
#endif
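
// Note (illustrative): shfl_xor_sync() performs a butterfly exchange inside a
// warp. For a sum reduction over 4 lanes holding {1, 2, 3, 4}:
//   step i=1: each lane adds the value of lane^1 -> {3, 3, 7, 7}
//   step i=2: each lane adds the value of lane^2 -> {10, 10, 10, 10}
// After log2(WARP_SIZE) steps every lane of the warp holds the reduced value.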

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
// This implementation is the one from RAJA
//! reduce values in block into thread 0
template <typename ReduceOperator, Int32 WarpSize, typename T, T identity>
ARCCORE_DEVICE inline T block_reduce(T val)
{
  constexpr Int32 WARP_SIZE = WarpSize;
  constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
  int numThreads = blockDim.x;

  int threadId = getThreadId();

  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }
  }
  else {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      // only add from threads that exist (don't double count own value)
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  //printf("CONTINUE tid=%d wid=%d wnum=%d\n",threadId,warpId,warpNum);

  // reduce per warp values
  if (numThreads > WARP_SIZE) {

    __shared__ T sd[MAX_WARPS];

    // write per warp values to shared memory
    if (warpId == 0) {
      sd[warpNum] = temp;
    }

    __syncthreads();

    if (warpNum == 0) {

      // read per warp values
      if (warpId * WARP_SIZE < numThreads) {
        temp = sd[warpId];
      }
      else {
        temp = identity;
      }
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }

    __syncthreads();
  }
  return temp;
}
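
// Usage sketch (hypothetical kernel, assuming a warp size of 32): summing one
// value per thread with block_reduce(); thread 0 of each block is guaranteed
// to hold its block's total afterwards.
//
//   using SumOp = SimpleReduceOperator<double, eAtomicOperation::Add>;
//   __global__ void block_sum(const double* in, double* block_sums)
//   {
//     double v = in[blockIdx.x * blockDim.x + threadIdx.x];
//     double s = block_reduce<SumOp, 32, double, 0.0>(v);
//     if (threadIdx.x == 0)
//       block_sums[blockIdx.x] = s;
//   }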

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
//! reduce values in grid into thread 0 of last running block
// returns true if put reduced value in val
template <typename ReduceOperator, Int32 WarpSize, typename T, T identity>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x;
  int numThreads = blockDim.x;
  int wrap_around = numBlocks - 1;
  int blockId = blockIdx.x;
  int threadId = threadIdx.x;

  T temp = block_reduce<ReduceOperator, WarpSize, T, identity>(val);

  // one thread per block writes to device_mem
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // ensure write visible to all threadblocks
    __threadfence();
    // increment counter, (wraps back to zero if old count == wrap_around)
    // Warning: with ROCm and a GPU on a PCI Express bus, if 'device_count'
    // is allocated in unified memory, atomicInc does not work.
    // In that case it can be replaced with:
    // unsigned int old_count = ::atomicAdd(device_count, 1) % (wrap_around+1);
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // returns non-zero value if any thread passes in a non-zero value
  lastBlock = __syncthreads_or(lastBlock);

  // last block accumulates values from device_mem
  if (lastBlock) {
    temp = identity;

    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }

    temp = block_reduce<ReduceOperator, WarpSize, T, identity>(temp);

    // one thread returns value
    if (threadId == 0) {
      val = temp;
    }
  }

  return lastBlock && threadId == 0;
}
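
// Note (illustrative): with gridDim.x == 4, wrap_around == 3 and the counter
// initially 0, the four blocks that reach atomicInc() read old_count == 0, 1,
// 2 and 3. Only the last block to arrive sees old_count == wrap_around; it
// reduces the partial results stored in device_mem and returns true on its
// thread 0, and atomicInc() has wrapped the counter back to 0.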

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType, typename ReduceOperator, DataType Identity>
ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void
_applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* host_pinned_ptr = dev_info.m_host_pinned_final_ptr;
  DataType v = dev_info.m_current_value;
#if HIP_VERSION_MAJOR >= 7
  // Starting with ROCm 7, the warp size can no longer be known at compile
  // time: it is either 32 or 64. To work around this, we instantiate the
  // reduction for both sizes and choose between them dynamically. This is
  // probably slightly less efficient than the previous mechanism. Another
  // solution would be to pick the warp size at compile time. That is possible
  // on HPC architectures such as the MI300, where the value is fixed, but on
  // RDNA architectures both values can occur.
  // TODO: On AMD architectures with a fixed warp size, use that warp size
  // as a 'constexpr' to avoid the 'if'.
  const Int32 warp_size = dev_info.m_warp_size;
#else
#if defined(__HIP__)
  constexpr const Int32 WARP_SIZE = warpSize;
#else
  constexpr const Int32 WARP_SIZE = 32;
#endif
#endif

  //if (impl::getThreadId()==0){
  //  printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n",
  //         getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr,
  //         (void*)device_count,(do_grid_reduce)?1:0);
  //}
#if HIP_VERSION_MAJOR >= 7
  bool is_done = false;
  if (warp_size == 64)
    is_done = grid_reduce<ReduceOperator, 64, DataType, Identity>(v, grid_buffer, device_count);
  else if (warp_size == 32)
    is_done = grid_reduce<ReduceOperator, 32, DataType, Identity>(v, grid_buffer, device_count);
  else
    assert(false && "Bad warp size (should be 32 or 64)");
#else
  bool is_done = grid_reduce<ReduceOperator, WARP_SIZE, DataType, Identity>(v, grid_buffer, device_count);
#endif
  if (is_done) {
    *host_pinned_ptr = v;
    // It is important to reset this variable to zero for the next use of a Reducer.
    (*device_count) = 0;
  }
}
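
// Informal summary of the contract assumed above: 'm_grid_buffer' needs at
// least gridDim.x elements (one partial value per block), the counter pointed
// to by 'm_device_count' must be zero before the kernel starts, and the final
// reduced value is written by the last finishing block to the pinned host
// memory pointed to by 'm_host_pinned_final_ptr'.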

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorSum<DataType>::
applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator, identity()>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMax<DataType>::
applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator, identity()>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void ReduceFunctorMin<DataType>::
applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator, identity()>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif