Arcane v4.1.2.0
User Documentation
CommonCudaHipReduceImpl.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2025 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CommonCudaHipReduceImpl.h                                   (C) 2000-2025 */
/*                                                                           */
/* CUDA and HIP implementation of reductions.                                */
/*---------------------------------------------------------------------------*/
#ifndef ARCCORE_ACCELERATOR_COMMONCUDAHIPREDUCEIMPL_H
#define ARCCORE_ACCELERATOR_COMMONCUDAHIPREDUCEIMPL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This file must only be included by 'arcane/accelerator/Reduce.h'
// and is only valid when compiled by the CUDA or HIP compiler.

#include "arccore/accelerator/AcceleratorGlobal.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Warning: with ROCm and a GPU on a PCI Express bus, most atomic
// operations do not work when the pointer is allocated in unified memory.
// The problem seems to affect atomicMin, atomicMax and atomicInc.
// atomicAdd, however, appears to work.
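//
// As an illustration, the substitution suggested in a comment inside
// grid_reduce() below replaces the atomicInc() used there,
//
//   unsigned int old_count = ::atomicInc(device_count, wrap_around);
//
// with an atomicAdd() followed by a modulo, which relies only on the
// operation that still behaves correctly in that configuration:
//
//   unsigned int old_count = ::atomicAdd(device_count, 1) % (wrap_around+1);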

namespace Arcane::Accelerator::Impl
{

__device__ __forceinline__ unsigned int getThreadId()
{
  int threadId = threadIdx.x;
  return threadId;
}

__device__ __forceinline__ unsigned int getBlockId()
{
  int blockId = blockIdx.x;
  return blockId;
}

constexpr const Int32 MAX_BLOCK_SIZE = 1024;

#if defined(__CUDACC__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}
#endif
#if defined(__HIP__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}
#endif
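
// The overloads above give block_reduce() below a single
// shfl_xor_sync()/shfl_sync() interface for both back-ends: on CUDA they
// forward to __shfl_xor_sync()/__shfl_sync() with a full-warp mask
// (0xffffffffu), on HIP to the mask-less __shfl_xor()/__shfl() intrinsics.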

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
// This implementation comes from RAJA
//! reduce values in block into thread 0
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline T block_reduce(T val)
{
  constexpr Int32 WARP_SIZE = WarpSize;
  constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;
  int numThreads = blockDim.x;

  int threadId = getThreadId();

  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = Impl::shfl_xor_sync(temp, i);
      ReduceOperator::combine(temp, rhs);
    }
  }
  else {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = Impl::shfl_sync(temp, srcLane);
      // only add from threads that exist (don't double count own value)
      if (srcLane < numThreads) {
        ReduceOperator::combine(temp, rhs);
      }
    }
  }

  //printf("CONTINUE tid=%d wid=%d wnum=%d\n",threadId,warpId,warpNum);

  // reduce per warp values
  if (numThreads > WARP_SIZE) {

    __shared__ T sd[MAX_WARPS];

    // write per warp values to shared memory
    if (warpId == 0) {
      sd[warpNum] = temp;
    }

    __syncthreads();

    if (warpNum == 0) {

      // read per warp values
      if (warpId * WARP_SIZE < numThreads) {
        temp = sd[warpId];
      }
      else {
        temp = ReduceOperator::identity();
      }
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = Impl::shfl_xor_sync(temp, i);
        ReduceOperator::combine(temp, rhs);
      }
    }

    __syncthreads();
  }
  return temp;
}
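
// Worked example (illustrative only, assuming a Sum-like ReduceOperator and
// a warp of 4 lanes holding the values {1, 2, 3, 4}): the XOR butterfly above
// first combines lanes 0<->1 and 2<->3, giving {3, 3, 7, 7}, then lanes
// 0<->2 and 1<->3, giving {10, 10, 10, 10}. After log2(WARP_SIZE) steps every
// lane holds the full warp total, so lane 0 of each warp can publish it to
// shared memory in the cross-warp step.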

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
//! reduce values in grid into thread 0 of last running block
// returns true if the reduced value was put in val
template <typename ReduceOperator, Int32 WarpSize, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x;
  int numThreads = blockDim.x;
  int wrap_around = numBlocks - 1;
  int blockId = blockIdx.x;
  int threadId = threadIdx.x;

  T temp = block_reduce<ReduceOperator, WarpSize, T>(val);

  // one thread per block writes to device_mem
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // ensure write visible to all threadblocks
    __threadfence();
    // increment counter (wraps back to zero if old count == wrap_around)
    // Warning: with ROCm and a GPU on a PCI Express bus, the atomicInc below
    // does not work if 'device_count' is allocated in unified memory.
    // In that case it can be replaced by:
    // unsigned int old_count = ::atomicAdd(device_count, 1) % (wrap_around+1);
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // returns non-zero value if any thread passes in a non-zero value
  lastBlock = __syncthreads_or(lastBlock);

  // last block accumulates values from device_mem
  if (lastBlock) {
    temp = ReduceOperator::identity();

    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::combine(temp, device_mem[i]);
    }

    temp = block_reduce<ReduceOperator, WarpSize, T>(temp);

    // one thread returns value
    if (threadId == 0) {
      val = temp;
    }
  }

  return lastBlock && threadId == 0;
}
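
// How the "last block" is detected (illustrative walk-through): with
// gridDim.x == 4, wrap_around == 3 and the counter starting at zero (it is
// reset after each use, see _applyDeviceGeneric() below), the four blocks
// observe old_count values 0, 1, 2 and 3 in whatever order they finish.
// Only the block that reads 3, i.e. the last one to pass the atomicInc,
// gets lastBlock == true, combines the four per-block partial results from
// device_mem, and returns true in its thread 0.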

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename ReduceOperator>
ARCCORE_INLINE_REDUCE ARCCORE_DEVICE void
_applyDeviceGeneric(const ReduceDeviceInfo<typename ReduceOperator::DataType>& dev_info)
{
  using DataType = typename ReduceOperator::DataType;
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* host_pinned_ptr = dev_info.m_host_pinned_final_ptr;
  DataType v = dev_info.m_current_value;
#if HIP_VERSION_MAJOR >= 7
  // Starting with ROCm 7, the warp size is no longer known at compile time:
  // it is either 32 or 64. To work around this, two instantiations of the
  // reduction are compiled and the right one is chosen dynamically. This is
  // probably slightly slower than the previous mechanism. Another solution
  // would be to pick the warp size at compile time. That is possible on HPC
  // architectures such as the MI300, where the value is fixed, but on RDNA
  // architectures both values can occur.
  // TODO: On AMD architectures with a fixed warp size, use that warp size
  // as a 'constexpr' to avoid the 'if'.
  const Int32 warp_size = dev_info.m_warp_size;
#else
#if defined(__HIP__)
  constexpr const Int32 WARP_SIZE = warpSize;
#else
  constexpr const Int32 WARP_SIZE = 32;
#endif
#endif

  //if (impl::getThreadId()==0){
  //  printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n",
  //         getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr,
  //         (void*)device_count,(do_grid_reduce)?1:0);
  //}
#if HIP_VERSION_MAJOR >= 7
  bool is_done = false;
  if (warp_size == 64)
    is_done = grid_reduce<ReduceOperator, 64, DataType>(v, grid_buffer, device_count);
  else if (warp_size == 32)
    is_done = grid_reduce<ReduceOperator, 32, DataType>(v, grid_buffer, device_count);
  else
    assert(false && "Bad warp size (should be 32 or 64)");
#else
  bool is_done = grid_reduce<ReduceOperator, WARP_SIZE, DataType>(v, grid_buffer, device_count);
#endif
  if (is_done) {
    *host_pinned_ptr = v;
    // It is important to reset this variable to zero for the next use of a Reducer.
    (*device_count) = 0;
  }
}
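
// Summary of the data flow above: every thread contributes its
// m_current_value, block_reduce() collapses it inside each block,
// grid_reduce() stores one partial result per block in m_grid_buffer and
// lets the last finishing block combine them. Thread 0 of that block then
// writes the final value to the host-pinned slot m_host_pinned_final_ptr
// and resets the counter behind m_device_count so the same buffers can be
// reused by the next Reducer.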

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::Impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif