Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
SimdAVX.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* SimdAVX.h (C) 2000-2016 */
9/* */
10/* Vectorization for AVX and AVX2. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCANE_UTILS_SIMDAVX_H
13#define ARCANE_UTILS_SIMDAVX_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17/*
18 * This file should not be included directly.
19 * Use 'Simd.h' instead.
20 */
21
22/*---------------------------------------------------------------------------*/
23/*---------------------------------------------------------------------------*/
24
26// #define ARCANE_USE_AVX2_GATHER
27
28// Gather is only available with AVX2
29#ifndef __AVX2__
30#undef ARCANE_USE_AVX2_GATHER
31#endif
32
33/*---------------------------------------------------------------------------*/
34/*---------------------------------------------------------------------------*/
35
36namespace Arcane
37{
38
39/*---------------------------------------------------------------------------*/
40/*---------------------------------------------------------------------------*/
41
46class ARCANE_ALIGNAS_PACKED(32) AVXSimdX8Int32
47{
48 public:
49
50 static const int BLOCK_SIZE = 8;
51 enum
52 {
53 Length = 8,
54 Alignment = 32
55 };
56
57 public:
58
59 __m256i v0;
60 AVXSimdX8Int32() {}
61 AVXSimdX8Int32(__m256i _v0)
62 : v0(_v0)
63 {}
64 explicit AVXSimdX8Int32(Int32 a)
65 : v0(_mm256_set1_epi32(a))
66 {}
67
68 private:
69
70 AVXSimdX8Int32(Int32 a7, Int32 a6, Int32 a5, Int32 a4, Int32 a3, Int32 a2, Int32 a1, Int32 a0)
71 : v0(_mm256_set_epi32(a7, a6, a5, a4, a3, a2, a1, a0))
72 {}
73
74 public:
75
76 AVXSimdX8Int32(const Int32* base, const Int32* idx)
77 : v0(_mm256_set_epi32(base[idx[7]], base[idx[6]], base[idx[5]], base[idx[4]],
78 base[idx[3]], base[idx[2]], base[idx[1]], base[idx[0]]))
79 {}
80 explicit AVXSimdX8Int32(const Int32* base)
81 : v0(_mm256_load_si256((const __m256i*)base))
82 {}
83
84 Int32 operator[](Integer i) const { return ((const Int32*)&v0)[i]; }
85 Int32& operator[](Integer i) { return ((Int32*)&v0)[i]; }
86
87 void set(ARCANE_RESTRICT Int32* base, const ARCANE_RESTRICT Int32* idx) const
88 {
89 const Int32* x = (const Int32*)(this);
90 base[idx[0]] = x[0];
91 base[idx[1]] = x[1];
92 base[idx[2]] = x[2];
93 base[idx[3]] = x[3];
94 base[idx[4]] = x[4];
95 base[idx[5]] = x[5];
96 base[idx[6]] = x[6];
97 base[idx[7]] = x[7];
98 }
99
100 void set(ARCANE_RESTRICT Int32* base) const
101 {
102 _mm256_store_si256((__m256i*)base, v0);
103 }
104
105 void load(const AVXSimdX8Int32* base)
106 {
107 v0 = _mm256_load_si256((const __m256i*)base);
108 }
109
110 static AVXSimdX8Int32 fromScalar(Int32 a0, Int32 a1, Int32 a2, Int32 a3,
111 Int32 a4, Int32 a5, Int32 a6, Int32 a7)
112 {
113 return AVXSimdX8Int32(a7, a6, a5, a4, a3, a2, a1, a0);
114 }
115
116 private:
117
118 void operator=(Int32 _v);
119};
120
121/*---------------------------------------------------------------------------*/
122/*---------------------------------------------------------------------------*/
123
129class ARCANE_ALIGNAS_PACKED(32) AVXSimdX4Real
130{
131 public:
132
133 static const int BLOCK_SIZE = 4;
134 enum
135 {
136 Length = 4
137 };
138 typedef SSESimdX4Int32 Int32IndexType;
139
140 public:
141
142 __m256d v0;
143 AVXSimdX4Real() {}
144 AVXSimdX4Real(__m256d _v0)
145 : v0(_v0)
146 {}
147 explicit AVXSimdX4Real(Real r)
148 : v0(_mm256_set1_pd(r))
149 {}
150
151 private:
152
153 AVXSimdX4Real(Real a3, Real a2, Real a1, Real a0)
154 : v0(_mm256_set_pd(a3, a2, a1, a0))
155 {}
156
157 public:
158
159 AVXSimdX4Real(const Real* base, const Int32* idx)
160 : v0(_mm256_set_pd(base[idx[3]], base[idx[2]], base[idx[1]], base[idx[0]]))
161 {}
162
163 AVXSimdX4Real(const Real* base, const Int32IndexType& simd_idx)
164#ifdef ARCANE_USE_AVX2_GATHER
165 : v0(_mm256_i32gather_pd(base, simd_idx.v0, 8)){}
166#else
167 : AVXSimdX4Real(base, (const Int32*)&simd_idx)
168 {
169 }
170#endif
171
172 AVXSimdX4Real(const Real* base, const Int32IndexType* simd_idx)
173#ifdef ARCANE_USE_AVX2_GATHER
174 : v0(_mm256_i32gather_pd((Real*)base, simd_idx->v0, 8)){}
175#else
176 : AVXSimdX4Real(base, (const Int32*)simd_idx)
177 {
178 }
179#endif
180
182 explicit AVXSimdX4Real(const Real* base)
183 : v0(_mm256_load_pd(base))
184 {}
185
186 Real operator[](Integer i) const { return ((const Real*)&v0)[i]; }
187 Real& operator[](Integer i) { return ((Real*)&v0)[i]; }
188
189 void set(ARCANE_RESTRICT Real* base, const Int32* idx) const
190 {
191#if 1
192 const Real* x = (const Real*)(this);
193 base[idx[0]] = x[0];
194 base[idx[1]] = x[1];
195 base[idx[2]] = x[2];
196 base[idx[3]] = x[3];
197#else
198 // These scatter methods are only available
199 // for AVX512VL
200 __m128i idx0 = _mm_load_si128((__m128i*)idx);
201 _mm256_i32scatter_pd(base, idx0, v0, 8);
202#endif
203 }
204
205 void set(ARCANE_RESTRICT Real* base, const Int32IndexType& simd_idx) const
206 {
207 this->set(base, &simd_idx);
208 }
209
210 void set(ARCANE_RESTRICT Real* base, const Int32IndexType* simd_idx) const
211 {
212 this->set(base, (const Int32*)simd_idx);
213 }
214
216 void set(ARCANE_RESTRICT Real* base) const
217 {
218 _mm256_store_pd(base, v0);
219 }
220
221 static AVXSimdX4Real fromScalar(Real a0, Real a1, Real a2, Real a3)
222 {
223 return AVXSimdX4Real(a3, a2, a1, a0);
224 }
225
226 // Unary operation operator-
227 inline AVXSimdX4Real operator-() const
228 {
229 return AVXSimdX4Real(_mm256_sub_pd(_mm256_setzero_pd(), v0));
230 }
231
232 private:
233
234 void operator=(Real _v);
235};
236
237/*---------------------------------------------------------------------------*/
238/*---------------------------------------------------------------------------*/
239
245class ARCANE_ALIGNAS_PACKED(64) AVXSimdX8Real
246{
247 public:
248
249 static const int BLOCK_SIZE = 8;
250 enum
251 {
252 Length = 8
253 };
254
255 public:
256
257 __m256d v0;
258 __m256d v1;
259 AVXSimdX8Real() {}
260 AVXSimdX8Real(__m256d _v0, __m256d _v1)
261 : v0(_v0)
262 , v1(_v1)
263 {}
264 explicit AVXSimdX8Real(Real r)
265 {
266 v0 = _mm256_set1_pd(r);
267 v1 = _mm256_set1_pd(r);
268 }
269
270 private:
271
272 AVXSimdX8Real(Real a7, Real a6, Real a5, Real a4, Real a3, Real a2, Real a1, Real a0)
273 {
274 v0 = _mm256_set_pd(a3, a2, a1, a0);
275 v1 = _mm256_set_pd(a7, a6, a5, a4);
276 }
277
278 public:
279
280 AVXSimdX8Real(const Real* base, const Int32* idx)
281 {
282 //TODO With AVX2, use vgather but for now we don't detect it
283 // and tests show that it is not always the most
284 // performant (maybe with aligned indices).
285#if 1
286 v0 = _mm256_set_pd(base[idx[3]], base[idx[2]], base[idx[1]], base[idx[0]]);
287 v1 = _mm256_set_pd(base[idx[7]], base[idx[6]], base[idx[5]], base[idx[4]]);
288#else
289 __m128i idx0 = _mm_loadu_si128((__m128i*)idx);
290 __m128i idx1 = _mm_loadu_si128((__m128i*)(idx + 4));
291 v0 = _mm256_i32gather_pd((Real*)base, idx0, 8);
292 v1 = _mm256_i32gather_pd((Real*)base, idx1, 8);
293#endif
294 }
295
297 explicit AVXSimdX8Real(const Real* base)
298 {
299 v0 = _mm256_load_pd(base);
300 v1 = _mm256_load_pd(base + 4);
301 }
302
303 Real operator[](Integer i) const { return ((const Real*)&v0)[i]; }
304 Real& operator[](Integer i) { return ((Real*)&v0)[i]; }
305
306 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32* idx) const
307 {
308#if 1
309 const Real* x = (const Real*)(this);
310 base[idx[0]] = x[0];
311 base[idx[1]] = x[1];
312 base[idx[2]] = x[2];
313 base[idx[3]] = x[3];
314 base[idx[4]] = x[4];
315 base[idx[5]] = x[5];
316 base[idx[6]] = x[6];
317 base[idx[7]] = x[7];
318#else
319 // These scatter methods are only available
320 // for AVX512VL
321 __m128i idx0 = _mm_loadu_si128((__m128i*)idx);
322 __m128i idx1 = _mm_loadu_si128((__m128i*)(idx + 4));
323 _mm256_i32scatter_pd(base, idx0, v0, 8);
324 _mm256_i32scatter_pd(base, idx1, v1, 8);
325#endif
326 }
327
329 void set(ARCANE_RESTRICT Real* base) const
330 {
331 _mm256_store_pd(base, v0);
332 _mm256_store_pd(base + 4, v1);
333 }
334
335 static AVXSimdX8Real fromScalar(Real a0, Real a1, Real a2, Real a3,
336 Real a4, Real a5, Real a6, Real a7)
337 {
338 return AVXSimdX8Real(a7, a6, a5, a4, a3, a2, a1, a0);
339 }
340
341 // Unary operation operator-
342 inline AVXSimdX8Real operator-() const
343 {
344 return AVXSimdX8Real(_mm256_sub_pd(_mm256_setzero_pd(), v0),
345 _mm256_sub_pd(_mm256_setzero_pd(), v1));
346 }
347
348 private:
349
350 void operator=(Real _v);
351};
352
353/*---------------------------------------------------------------------------*/
354/*---------------------------------------------------------------------------*/
355
366
367/*---------------------------------------------------------------------------*/
368/*---------------------------------------------------------------------------*/
369
370/*---------------------------------------------------------------------------*/
371/*---------------------------------------------------------------------------*/
372
374{
375 public:
376
377 static const char* name() { return "AVX"; }
378 enum
379 {
380 Int32IndexSize = AVXSimdReal::Length
381 };
382 typedef AVXSimdReal SimdReal;
383 typedef AVXSimdReal::Int32IndexType SimdInt32IndexType;
384};
385
386/*---------------------------------------------------------------------------*/
387/*---------------------------------------------------------------------------*/
388
389ARCANE_UTILS_EXPORT std::ostream& operator<<(std::ostream& o, const AVXSimdReal& s);
390
391/*---------------------------------------------------------------------------*/
392/*---------------------------------------------------------------------------*/
393
394} // namespace Arcane
395
396/*---------------------------------------------------------------------------*/
397/*---------------------------------------------------------------------------*/
398
399#endif
#define ARCANE_ALIGNAS_PACKED(value)
Macro to guarantee the packing and alignment of a class to value bytes.
Vectorization of real numbers using AVX.
Definition SimdAVX.h:130
AVXSimdX4Real(const Real *base)
Loads contiguous values located at address base, which must be aligned.
Definition SimdAVX.h:182
void set(ARCANE_RESTRICT Real *base) const
Stores the instance values at address base, which must be aligned.
Definition SimdAVX.h:216
Vectorization of real numbers using AVX with blocks of 8 reals.
Definition SimdAVX.h:246
void set(ARCANE_RESTRICT Real *base) const
Stores the instance values at address base, which must be aligned.
Definition SimdAVX.h:329
AVXSimdX8Real(const Real *base)
Loads contiguous values located at address base, which must be aligned.
Definition SimdAVX.h:297
Vectorization of integers using SSE.
Definition SimdSSE.h:39
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
AVXSimdX4Real AVXSimdReal
Vector of 'double' in AVX implementation.
Definition SimdAVX.h:365
Int32 Integer
Type representing an integer.
double Real
Type representing a real number.
std::int32_t Int32
Signed integer type of 32 bits.