Arcane v3.14.10.0
User documentation
CommonCudaHipReduceImpl.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CommonCudaHipReduceImpl.h                                   (C) 2000-2024 */
/*                                                                           */
/* CUDA and HIP implementation of reductions.                                */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_COMMONCUDAHIPREDUCEIMPL_H
#define ARCANE_ACCELERATOR_COMMONCUDAHIPREDUCEIMPL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This file must only be included from 'arcane/accelerator/Reduce.h'
// and is only valid when compiled by a CUDA or HIP compiler

#include "arcane/accelerator/CommonCudaHipAtomicImpl.h"

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Warning: with ROCm and a GPU on a PCI Express bus, most atomic operations
// do not work when the pointer is allocated in unified memory. The problem
// apparently affects atomicMin, atomicMax and atomicInc, whereas atomicAdd
// seems to work.

namespace Arcane::Accelerator::impl
{

__device__ __forceinline__ unsigned int getThreadId()
{
  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;
  return threadId;
}

__device__ __forceinline__ unsigned int getBlockId()
{
  int blockId = blockIdx.x + blockIdx.y * gridDim.x;
  return blockId;
}

#if defined(__HIP__)
constexpr const Int32 WARP_SIZE = warpSize;
#else
constexpr const Int32 WARP_SIZE = 32;
#endif
constexpr const Int32 MAX_BLOCK_SIZE = 1024;
constexpr const Int32 MAX_WARPS = MAX_BLOCK_SIZE / WARP_SIZE;

template <typename T, enum eAtomicOperation>
class SimpleReduceOperator;

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Add>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = val + v;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Max>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v > val ? v : val;
  }
};

template <typename T>
class SimpleReduceOperator<T, eAtomicOperation::Min>
{
 public:

  static ARCCORE_DEVICE inline void apply(T& val, const T v)
  {
    val = v < val ? v : val;
  }
};
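// Editor's note (illustration, not in the original source): each
// specialization combines two values in place in device code, e.g.
//   int acc = 3;
//   SimpleReduceOperator<int, eAtomicOperation::Max>::apply(acc, 7); // acc == 7
//   SimpleReduceOperator<int, eAtomicOperation::Add>::apply(acc, 5); // acc == 12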

#if defined(__CUDACC__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl_sync(0xffffffffu, var, laneMask);
}
#endif
#if defined(__HIP__)
ARCCORE_DEVICE inline double shfl_xor_sync(double var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_xor_sync(int var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_xor_sync(Int64 var, int laneMask)
{
  return ::__shfl_xor(var, laneMask);
}

ARCCORE_DEVICE inline double shfl_sync(double var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline int shfl_sync(int var, int laneMask)
{
  return ::__shfl(var, laneMask);
}

ARCCORE_DEVICE inline Int64 shfl_sync(Int64 var, int laneMask)
{
  return ::__shfl(var, laneMask);
}
#endif
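// Editor's note: these overloads give CUDA and HIP warp shuffles a common
// name. The CUDA '*_sync' intrinsics require an explicit participation mask
// (the full warp, 0xffffffffu, here); the HIP '__shfl'/'__shfl_xor'
// intrinsics take none. Despite its parameter name, shfl_sync() forwards its
// second argument as a source lane index, while shfl_xor_sync() takes a mask
// that is XOR-ed with the calling lane's index.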

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
// This implementation comes from RAJA
//! reduce values in block into thread 0
template <typename ReduceOperator, typename T>
ARCCORE_DEVICE inline T block_reduce(T val, T identity)
{
  int numThreads = blockDim.x * blockDim.y * blockDim.z;

  int threadId = getThreadId();

  int warpId = threadId % WARP_SIZE;
  int warpNum = threadId / WARP_SIZE;

  T temp = val;

  if (numThreads % WARP_SIZE == 0) {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      T rhs = impl::shfl_xor_sync(temp, i);
      ReduceOperator::apply(temp, rhs);
    }

  } else {

    // reduce each warp
    for (int i = 1; i < WARP_SIZE; i *= 2) {
      int srcLane = threadId ^ i;
      T rhs = impl::shfl_sync(temp, srcLane);
      // only add from threads that exist (don't double count own value)
      if (srcLane < numThreads) {
        ReduceOperator::apply(temp, rhs);
      }
    }
  }

  //printf("CONTINUE tid=%d wid=%d wnum=%d\n",threadId,warpId,warpNum);

  // reduce per warp values
  if (numThreads > WARP_SIZE) {

    __shared__ T sd[MAX_WARPS];

    // write per warp values to shared memory
    if (warpId == 0) {
      sd[warpNum] = temp;
    }

    __syncthreads();

    if (warpNum == 0) {

      // read per warp values
      if (warpId * WARP_SIZE < numThreads) {
        temp = sd[warpId];
      } else {
        temp = identity;
      }
      for (int i = 1; i < WARP_SIZE; i *= 2) {
        T rhs = impl::shfl_xor_sync(temp, i);
        ReduceOperator::apply(temp, rhs);
      }
    }

    __syncthreads();
  }
  return temp;
}
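// Editor's note (illustration): for a hypothetical 4-lane warp summing the
// values [a, b, c, d] held by lanes 0..3, the XOR butterfly above proceeds as
//   i = 1: each lane exchanges with lane^1 -> [a+b, a+b, c+d, c+d]
//   i = 2: each lane exchanges with lane^2 -> a+b+c+d in every lane
// so after log2(WARP_SIZE) steps every lane, in particular lane 0, holds the
// warp-wide result.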

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/
//! reduce values in grid into thread 0 of last running block
// returns true if the reduced value was written to 'val'
template <typename ReduceOperator, typename T>
ARCCORE_DEVICE inline bool
grid_reduce(T& val, T identity, SmallSpan<T> device_mem, unsigned int* device_count)
{
  int numBlocks = gridDim.x * gridDim.y * gridDim.z;
  int numThreads = blockDim.x * blockDim.y * blockDim.z;
  int wrap_around = numBlocks - 1;

  int blockId = blockIdx.x + gridDim.x * blockIdx.y +
                (gridDim.x * gridDim.y) * blockIdx.z;

  int threadId = threadIdx.x + blockDim.x * threadIdx.y +
                 (blockDim.x * blockDim.y) * threadIdx.z;

  T temp = block_reduce<ReduceOperator>(val, identity);

  // one thread per block writes to device_mem
  bool lastBlock = false;
  if (threadId == 0) {
    device_mem[blockId] = temp;
    // ensure write visible to all threadblocks
    __threadfence();
    // increment counter, (wraps back to zero if old count == wrap_around)
    // Warning: with ROCm and a GPU on a PCI Express bus, atomicInc() does not
    // work if 'device_count' is allocated in unified memory. In that case it
    // can be replaced with:
    //   unsigned int old_count = ::atomicAdd(device_count, 1) % (wrap_around + 1);
    unsigned int old_count = ::atomicInc(device_count, wrap_around);
    lastBlock = ((int)old_count == wrap_around);
  }

  // returns non-zero value if any thread passes in a non-zero value
  lastBlock = __syncthreads_or(lastBlock);

  // last block accumulates values from device_mem
  if (lastBlock) {
    temp = identity;

    for (int i = threadId; i < numBlocks; i += numThreads) {
      ReduceOperator::apply(temp, device_mem[i]);
    }

    temp = block_reduce<ReduceOperator>(temp, identity);

    // one thread returns value
    if (threadId == 0) {
      val = temp;
    }
  }

  return lastBlock && threadId == 0;
}
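// Editor's sketch (hypothetical, not part of the Arcane API): how a kernel
// could drive grid_reduce() directly. 'my_sum_kernel', 'partials', 'count' and
// 'result' are made-up names; 'partials' needs at least one slot per block and
// 'count' must be zero-initialized before the launch.
//
//   __global__ void my_sum_kernel(const double* x, int n,
//                                 SmallSpan<double> partials,
//                                 unsigned int* count, double* result)
//   {
//     double v = 0.0;
//     for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n;
//          i += blockDim.x * gridDim.x)
//       v += x[i];
//     using Op = SimpleReduceOperator<double, eAtomicOperation::Add>;
//     if (grid_reduce<Op>(v, 0.0, partials, count))
//       *result = v; // only thread 0 of the last finishing block gets here
//   }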

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

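// Editor's note: _applyDeviceGeneric() selects between the two device-side
// strategies. With the grid-wide reduction enabled, the last finishing block
// writes the final value to 'm_device_final_ptr' and resets the block counter
// for the next Reducer; otherwise each block reduces locally and its thread 0
// merges the partial result into the final value with an atomic operation.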
template <typename DataType, typename ReduceOperator, typename AtomicReduceOperator>
ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void _applyDeviceGeneric(const ReduceDeviceInfo<DataType>& dev_info)
{
  SmallSpan<DataType> grid_buffer = dev_info.m_grid_buffer;
  DataType identity = dev_info.m_identity;
  unsigned int* device_count = dev_info.m_device_count;
  DataType* ptr = dev_info.m_device_final_ptr;
  DataType v = dev_info.m_current_value;
  bool do_grid_reduce = dev_info.m_use_grid_reduce;

  //if (impl::getThreadId()==0){
  //  printf("BLOCK ID=%d %p s=%d ptr=%p %p use_grid_reduce=%d\n",
  //         getBlockId(),grid_buffer.data(),grid_buffer.size(),ptr,
  //         (void*)device_count,(do_grid_reduce)?1:0);
  //}
  if (do_grid_reduce) {
    bool is_done = grid_reduce<ReduceOperator>(v, identity, grid_buffer, device_count);
    if (is_done) {
      *ptr = v;
      // It is important to reset this value to zero for the next use of a Reducer.
      (*device_count) = 0;
    }
  }
  else {
    DataType rv = impl::block_reduce<ReduceOperator>(v, identity);
    if (impl::getThreadId() == 0) {
      AtomicReduceOperator::apply(ptr, rv);
    }
  }
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorSum<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Add>;
  using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Add>;
  _applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMax<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Max>;
  using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Max>;
  _applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType> ARCANE_INLINE_REDUCE ARCCORE_DEVICE
void ReduceFunctorMin<DataType>::
_applyDevice(const ReduceDeviceInfo<DataType>& dev_info)
{
  using ReduceOperator = impl::SimpleReduceOperator<DataType, eAtomicOperation::Min>;
  using AtomicReduceOperator = impl::CommonCudaHipAtomic<DataType, eAtomicOperation::Min>;
  _applyDeviceGeneric<DataType, ReduceOperator, AtomicReduceOperator>(dev_info);
}

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // End namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif