CommonCudaHipAtomicImpl.h
// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
//-----------------------------------------------------------------------------
// Copyright 2000-2023 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
// See the top-level COPYRIGHT file for details.
// SPDX-License-Identifier: Apache-2.0
//-----------------------------------------------------------------------------
/*---------------------------------------------------------------------------*/
/* CommonCudaHipAtomicImpl.h                                   (C) 2000-2023 */
/*                                                                           */
/* CUDA and HIP implementation of atomic operations.                         */
/*---------------------------------------------------------------------------*/
#ifndef ARCANE_ACCELERATOR_COMMONCUDAHIPATOMICIMPL_H
#define ARCANE_ACCELERATOR_COMMONCUDAHIPATOMICIMPL_H
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// This file must only be included by 'arcane/accelerator/Reduce.h'
// and is valid only when compiled with the CUDA or HIP compiler.

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

// Warning: with ROCm and a GPU on a PCI Express bus, most of the atomic
// operations do not work if the pointer was allocated in unified memory.
// The problem apparently affects atomicMin, atomicMax and atomicInc.
// atomicAdd, however, seems to work as long as concurrent accesses are
// not too numerous.

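// The sketch below is purely illustrative (it is not part of the original
// header and is never compiled): it shows the allocation pattern that triggers
// the caveat above. The function names are hypothetical; only the HIP runtime
// calls (hipMalloc, hipMallocManaged, hipMemset, hipDeviceSynchronize, hipFree)
// and ::atomicMax() come from the HIP API.
#if 0
#include <hip/hip_runtime.h>

__global__ void illustrativeAtomicMax(int* target, int value)
{
  // May silently misbehave on ROCm + PCIe if 'target' lives in unified memory.
  ::atomicMax(target, value);
}

void illustrativeHostSide()
{
  int* target = nullptr;
  // Problematic configuration described above:
  //   hipMallocManaged((void**)&target, sizeof(int));
  // Plain device memory avoids the issue:
  hipMalloc((void**)&target, sizeof(int));
  hipMemset(target, 0, sizeof(int));
  illustrativeAtomicMax<<<64, 64>>>(target, 42);
  hipDeviceSynchronize();
  hipFree(target);
}
#endif
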
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

namespace Arcane::Accelerator::impl
{

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

template <typename DataType, enum eAtomicOperation>
class CommonCudaHipAtomic;

template <typename DataType>
class CommonCudaHipAtomic<DataType, eAtomicOperation::Add>;
template <typename DataType>
class CommonCudaHipAtomic<DataType, eAtomicOperation::Max>;
template <typename DataType>
class CommonCudaHipAtomic<DataType, eAtomicOperation::Min>;

template <>
class CommonCudaHipAtomic<int, eAtomicOperation::Add>
{
 public:

  static ARCCORE_DEVICE void apply(int* ptr, int v)
  {
    ::atomicAdd(ptr, v);
  }
};

template <>
class CommonCudaHipAtomic<int, eAtomicOperation::Max>
{
 public:

  static ARCCORE_DEVICE void apply(int* ptr, int v)
  {
    ::atomicMax(ptr, v);
  }
};

template <>
class CommonCudaHipAtomic<int, eAtomicOperation::Min>
{
 public:

  static ARCCORE_DEVICE void apply(int* ptr, int v)
  {
    ::atomicMin(ptr, v);
  }
};

template <>
class CommonCudaHipAtomic<Int64, eAtomicOperation::Add>
{
 public:

  static ARCCORE_DEVICE void apply(Int64* ptr, Int64 v)
  {
    static_assert(sizeof(Int64) == sizeof(long long int), "Bad pointer size");
    ::atomicAdd((unsigned long long int*)ptr, v);
  }
};

template <>
class CommonCudaHipAtomic<Int64, eAtomicOperation::Max>
{
 public:

#if defined(__HIP__)
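  // On the HIP path the maximum is emulated with an atomicCAS loop (the code
  // below assumes no native signed 64-bit atomicMax is used here): each
  // iteration retries until no other thread has modified *ptr between the
  // read and the compare-and-swap.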
  static ARCCORE_DEVICE void apply(Int64* ptr, Int64 v)
  {
    unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(ptr);
    unsigned long long int old = *address_as_ull, assumed;

    do {
      assumed = old;
      Int64 assumed_as_int64 = static_cast<Int64>(assumed);
      old = atomicCAS(address_as_ull, assumed,
                      static_cast<unsigned long long int>(v > assumed_as_int64 ? v : assumed_as_int64));
    } while (assumed != old);
  }
#else
  static ARCCORE_DEVICE void apply(Int64* ptr, Int64 v)
  {
    ::atomicMax((long long int*)ptr, v);
  }
#endif
};

template <>
class CommonCudaHipAtomic<Int64, eAtomicOperation::Min>
{
 public:

#if defined(__HIP__)
  static ARCCORE_DEVICE void apply(Int64* ptr, Int64 v)
  {
    unsigned long long int* address_as_ull = reinterpret_cast<unsigned long long int*>(ptr);
    unsigned long long int old = *address_as_ull, assumed;

    do {
      assumed = old;
      Int64 assumed_as_int64 = static_cast<Int64>(assumed);
      old = atomicCAS(address_as_ull, assumed,
                      static_cast<unsigned long long int>(v < assumed_as_int64 ? v : assumed_as_int64));
    } while (assumed != old);
  }
#else
  static ARCCORE_DEVICE void apply(Int64* ptr, Int64 v)
  {
    ::atomicMin((long long int*)ptr, v);
  }
#endif
};

// Devices with a compute architecture lower than 6.0 do not support
// atomicAdd() on 'double' values.
// This code comes from the NVIDIA documentation (programming guide).
__device__ inline double
preArch60atomicAdd(double* address, double val)
{
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val + __longlong_as_double(assumed)));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

  return __longlong_as_double(old);
}
__device__ inline double
atomicMaxDouble(double* address, double val)
{
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    double assumed_as_double = __longlong_as_double(assumed);
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val > assumed_as_double ? val : assumed_as_double));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

  return __longlong_as_double(old);
}

__device__ inline double
atomicMinDouble(double* address, double val)
{
  unsigned long long int* address_as_ull = (unsigned long long int*)address;
  unsigned long long int old = *address_as_ull, assumed;

  do {
    assumed = old;
    double assumed_as_double = __longlong_as_double(assumed);
    old = atomicCAS(address_as_ull, assumed,
                    __double_as_longlong(val < assumed_as_double ? val : assumed_as_double));
    // Note: uses integer comparison to avoid hang in case of NaN (since NaN != NaN)
  } while (assumed != old);

  return __longlong_as_double(old);
}

template <>
class CommonCudaHipAtomic<double, eAtomicOperation::Add>
{
 public:

  static ARCCORE_DEVICE void apply(double* ptr, double v)
  {
#if __CUDA_ARCH__ >= 600
    ::atomicAdd(ptr, v);
#else
    preArch60atomicAdd(ptr, v);
#endif
  }
};

template <>
class CommonCudaHipAtomic<double, eAtomicOperation::Max>
{
 public:

  static ARCCORE_DEVICE void apply(double* ptr, double v)
  {
    atomicMaxDouble(ptr, v);
  }
};

template <>
class CommonCudaHipAtomic<double, eAtomicOperation::Min>
{
 public:

  static ARCCORE_DEVICE void apply(double* ptr, double v)
  {
    atomicMinDouble(ptr, v);
  }
};

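// Illustrative sketch (not part of the original file, never compiled): how code
// such as 'arcane/accelerator/Reduce.h' can pick an atomic implementation at
// compile time through this class template. The kernel below is hypothetical;
// only CommonCudaHipAtomic<DataType, Operation>::apply() comes from this header.
#if 0
template <typename DataType, eAtomicOperation Operation>
__global__ void hypotheticalAtomicReduceKernel(DataType* result, const DataType* values, int n)
{
  int i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n)
    CommonCudaHipAtomic<DataType, Operation>::apply(result, values[i]);
}
// Typical instantiations: <int, eAtomicOperation::Add>, <Int64, eAtomicOperation::Max>,
// <double, eAtomicOperation::Min>.
#endif
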
/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

} // namespace Arcane::Accelerator::impl

/*---------------------------------------------------------------------------*/
/*---------------------------------------------------------------------------*/

#endif