Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
SimdSSE.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* SimdSSE.h (C) 2000-2016 */
9/* */
10/* Vectorization for SSE. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCANE_UTILS_SIMDSSE_H
13#define ARCANE_UTILS_SIMDSSE_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17/*
18 * This file should not be included directly.
19 * Use 'Simd.h' instead.
20 */
21
22/*---------------------------------------------------------------------------*/
23/*---------------------------------------------------------------------------*/
24
25namespace Arcane
26{
27
28/*---------------------------------------------------------------------------*/
29/*---------------------------------------------------------------------------*/
30
38class ARCANE_ALIGNAS_PACKED(16) SSESimdX4Int32
39{
40 public:
41
42 static const int BLOCK_SIZE = 4;
43 enum
44 {
45 Length = 4,
46 Alignment = 16
47 };
48
49 public:
50
51 __m128i v0;
52 SSESimdX4Int32() {}
53 SSESimdX4Int32(__m128i _v0)
54 : v0(_v0)
55 {}
56 explicit SSESimdX4Int32(Int32 a)
57 : v0(_mm_set1_epi32(a))
58 {}
59
60 private:
61
62 SSESimdX4Int32(Int32 a3, Int32 a2, Int32 a1, Int32 a0)
63 : v0(_mm_set_epi32(a3, a2, a1, a0))
64 {}
65
66 public:
67
68 SSESimdX4Int32(const Int32* base, const Int32* idx)
69 : v0(_mm_set_epi32(base[idx[3]], base[idx[2]], base[idx[1]], base[idx[0]]))
70 {}
71 // TODO: implement the unaligned version
72 explicit SSESimdX4Int32(const Int32* base)
73 : v0(_mm_load_si128((const __m128i*)base))
74 {}
75
76 Int32 operator[](Integer i) const { return ((const Int32*)&v0)[i]; }
77 Int32& operator[](Integer i) { return ((Int32*)&v0)[i]; }
78
79 void set(ARCANE_RESTRICT Int32* base, const ARCANE_RESTRICT Int32* idx) const
80 {
81 const Int32* x = (const Int32*)(this);
82 base[idx[0]] = x[0];
83 base[idx[1]] = x[1];
84 base[idx[2]] = x[2];
85 base[idx[3]] = x[3];
86 }
87
88 void set(ARCANE_RESTRICT Int32* base) const
89 {
90 // TODO: implement the unaligned version
91 _mm_store_si128((__m128i*)base, v0);
92 }
93
94 static SSESimdX4Int32 fromScalar(Int32 a0, Int32 a1, Int32 a2, Int32 a3)
95 {
96 return SSESimdX4Int32(a3, a2, a1, a0);
97 }
98
99 private:
100
101 void operator=(Int32 _v);
102};
103
104/*---------------------------------------------------------------------------*/
105/*---------------------------------------------------------------------------*/
106
114class ARCANE_ALIGNAS_PACKED(16) SSESimdX2Real
115{
116 public:
117
118 static const int BLOCK_SIZE = 2;
119 enum
120 {
121 Length = 2
122 };
123 typedef EMULSimdX2Int32 Int32IndexType;
124
125 public:
126
127 __m128d v0;
128 SSESimdX2Real() {}
129 SSESimdX2Real(__m128d _v0)
130 : v0(_v0)
131 {}
132 explicit SSESimdX2Real(Real r)
133 : v0(_mm_set1_pd(r))
134 {}
135
136 private:
137
138 SSESimdX2Real(Real a1, Real a0)
139 : v0(_mm_set_pd(a1, a0))
140 {}
141
142 public:
143
144 SSESimdX2Real(const Real* base, const Int32* idx)
145 : v0(_mm_set_pd(base[idx[1]], base[idx[0]]))
146 {}
147 SSESimdX2Real(const Real* base, const Int32IndexType* simd_idx)
148 {
149 const Int32* idx = (const Int32*)simd_idx;
150 v0 = _mm_set_pd(base[idx[1]], base[idx[0]]);
151 }
152 SSESimdX2Real(const Real* base, const Int32IndexType& simd_idx)
153 {
154 const Int32* idx = (const Int32*)&simd_idx;
155 v0 = _mm_set_pd(base[idx[1]], base[idx[0]]);
156 }
157 SSESimdX2Real(const Real* base)
158 {
159 v0 = _mm_load_pd(base);
160 }
161
162 Real operator[](Integer i) const { return ((const Real*)&v0)[i]; }
163 Real& operator[](Integer i) { return ((Real*)&v0)[i]; }
164
165 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32* idx) const
166 {
167 const Real* x = (const Real*)(this);
168 base[idx[0]] = x[0];
169 base[idx[1]] = x[1];
170 }
171
172 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32IndexType& simd_idx) const
173 {
174 this->set(base, &simd_idx);
175 }
176
177 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32IndexType* simd_idx) const
178 {
179 const Int32* idx = (const ARCANE_RESTRICT Int32*)simd_idx;
180 const Real* x = (const Real*)(this);
181 base[idx[0]] = x[0];
182 base[idx[1]] = x[1];
183 }
184
185 void set(ARCANE_RESTRICT Real* base) const
186 {
187 _mm_store_pd(base, v0);
188 }
189
190 static SSESimdX2Real fromScalar(Real a0, Real a1)
191 {
192 return SSESimdX2Real(a1, a0);
193 }
194
195 // Unary operation operator-
196 inline SSESimdX2Real operator-() const
197 {
198 return SSESimdX2Real(_mm_sub_pd(_mm_setzero_pd(), v0));
199 }
200
201 private:
202
203 void operator=(Real _v);
204};
205
206/*---------------------------------------------------------------------------*/
207/*---------------------------------------------------------------------------*/
208
216class ARCANE_ALIGNAS_PACKED(16) SSESimdX4Real
217{
218 public:
219
220 static const int BLOCK_SIZE = 4;
221 enum
222 {
223 Length = 4
224 };
225 // NOTE: using EMULSimd instead of SSE is much more performant
226 // with gcc 4.9 and gcc 6.1. With Intel 16, it is the opposite but the
227 // difference is not huge.
228 // typedef SSESimdX4Int32 Int32IndexType;
229 typedef EMULSimdX4Int32 Int32IndexType;
230
231 public:
232
233 __m128d v0;
234 __m128d v1;
235 SSESimdX4Real() {}
236 SSESimdX4Real(__m128d _v0, __m128d _v1)
237 : v0(_v0)
238 , v1(_v1)
239 {}
240 explicit SSESimdX4Real(Real r)
241 : v0(_mm_set1_pd(r))
242 , v1(_mm_set1_pd(r))
243 {}
244
245 private:
246
247 SSESimdX4Real(Real a3, Real a2, Real a1, Real a0)
248 : v0(_mm_set_pd(a1, a0))
249 , v1(_mm_set_pd(a3, a2))
250 {}
251
252 public:
253
254 SSESimdX4Real(const Real* base, const Int32* idx)
255 : v0(_mm_set_pd(base[idx[1]], base[idx[0]]))
256 , v1(_mm_set_pd(base[idx[3]], base[idx[2]]))
257 {}
258
259 SSESimdX4Real(const Real* base, const Int32IndexType* simd_idx)
260 : SSESimdX4Real(base, (const Int32*)simd_idx)
261 {}
262
263 SSESimdX4Real(const Real* base, const Int32IndexType& simd_idx)
264 : SSESimdX4Real(base, (const Int32*)&simd_idx)
265 {}
266
267 SSESimdX4Real(const Real* base)
268 : v0(_mm_load_pd(base))
269 , v1(_mm_load_pd(base + 2))
270 {}
271
272 Real operator[](Integer i) const { return ((const Real*)&v0)[i]; }
273 Real& operator[](Integer i) { return ((Real*)&v0)[i]; }
274
275 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32* idx) const
276 {
277 const Real* x = (const Real*)(this);
278 base[idx[0]] = x[0];
279 base[idx[1]] = x[1];
280 base[idx[2]] = x[2];
281 base[idx[3]] = x[3];
282 }
283
284 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32IndexType& simd_idx) const
285 {
286 this->set(base, (const Int32*)&simd_idx);
287 }
288
289 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32IndexType* simd_idx) const
290 {
291 this->set(base, (const Int32*)simd_idx);
292 }
293
294 void set(ARCANE_RESTRICT Real* base) const
295 {
296 _mm_store_pd(base, v0);
297 _mm_store_pd(base + 2, v1);
298 }
299
300 static SSESimdX4Real fromScalar(Real a0, Real a1, Real a2, Real a3)
301 {
302 return SSESimdX4Real(a3, a2, a1, a0);
303 }
304
305 // Unary operation operator-
306 inline SSESimdX4Real operator-() const
307 {
308 return SSESimdX4Real(_mm_sub_pd(_mm_setzero_pd(), v0),
309 _mm_sub_pd(_mm_setzero_pd(), v1));
310 }
311
312 private:
313
314 void operator=(Real _v);
315};
316
317/*---------------------------------------------------------------------------*/
318/*---------------------------------------------------------------------------*/
319
324class ARCANE_ALIGNAS_PACKED(64) SSESimdX8Real
325{
326 public:
327
328 static const int BLOCK_SIZE = 8;
329 enum
330 {
331 Length = 8
332 };
333
334 public:
335
336 __m128d v0;
337 __m128d v1;
338 __m128d v2;
339 __m128d v3;
340 SSESimdX8Real() {}
341 SSESimdX8Real(__m128d _v0, __m128d _v1, __m128d _v2, __m128d _v3)
342 : v0(_v0)
343 , v1(_v1)
344 , v2(_v2)
345 , v3(_v3)
346 {}
347 explicit SSESimdX8Real(Real r)
348 : v0(_mm_set1_pd(r))
349 , v1(_mm_set1_pd(r))
350 , v2(_mm_set1_pd(r))
351 , v3(_mm_set1_pd(r))
352 {}
353
354 private:
355
356 SSESimdX8Real(Real a7, Real a6, Real a5, Real a4, Real a3, Real a2, Real a1, Real a0)
357 : v0(_mm_set_pd(a1, a0))
358 , v1(_mm_set_pd(a3, a2))
359 , v2(_mm_set_pd(a5, a4))
360 , v3(_mm_set_pd(a7, a6))
361 {}
362
363 public:
364
365 SSESimdX8Real(const Real* base, const Int32* idx)
366 {
367 v0 = _mm_set_pd(base[idx[1]], base[idx[0]]);
368 v1 = _mm_set_pd(base[idx[3]], base[idx[2]]);
369 v2 = _mm_set_pd(base[idx[5]], base[idx[4]]);
370 v3 = _mm_set_pd(base[idx[7]], base[idx[6]]);
371 }
372
373 Real operator[](Integer i) const { return ((const Real*)&v0)[i]; }
374 Real& operator[](Integer i) { return ((Real*)&v0)[i]; }
375
376 void set(ARCANE_RESTRICT Real* base, const ARCANE_RESTRICT Int32* idx) const
377 {
378 const Real* x = (const Real*)(this);
379 base[idx[0]] = x[0];
380 base[idx[1]] = x[1];
381 base[idx[2]] = x[2];
382 base[idx[3]] = x[3];
383 base[idx[4]] = x[4];
384 base[idx[5]] = x[5];
385 base[idx[6]] = x[6];
386 base[idx[7]] = x[7];
387 }
388
389 static SSESimdX8Real fromScalar(Real a0, Real a1, Real a2, Real a3, Real a4, Real a5, Real a6, Real a7)
390 {
391 return SSESimdX8Real(a7, a6, a5, a4, a3, a2, a1, a0);
392 }
393
394 // Unary operation operator-
395 inline SSESimdX8Real operator-() const
396 {
397 return SSESimdX8Real(_mm_sub_pd(_mm_setzero_pd(), v0),
398 _mm_sub_pd(_mm_setzero_pd(), v1),
399 _mm_sub_pd(_mm_setzero_pd(), v2),
400 _mm_sub_pd(_mm_setzero_pd(), v3));
401 }
402
403 private:
404
405 void operator=(Real _v);
406};
407
408/*---------------------------------------------------------------------------*/
409/*---------------------------------------------------------------------------*/
410
421
422/*---------------------------------------------------------------------------*/
423/*---------------------------------------------------------------------------*/
424
426{
427 public:
428
429 static const char* name() { return "SSE"; }
430 enum
431 {
432 Int32IndexSize = SSESimdReal::Length
433 };
434 typedef SSESimdReal SimdReal;
435 typedef SSESimdReal::Int32IndexType SimdInt32IndexType;
436};
437
438/*---------------------------------------------------------------------------*/
439/*---------------------------------------------------------------------------*/
440
441ARCANE_UTILS_EXPORT std::ostream&
442operator<<(std::ostream& o, const SSESimdReal& s);
443
444/*---------------------------------------------------------------------------*/
445/*---------------------------------------------------------------------------*/
446
447} // namespace Arcane
448
449/*---------------------------------------------------------------------------*/
450/*---------------------------------------------------------------------------*/
451
452#endif
#define ARCANE_ALIGNAS_PACKED(value)
Macro to guarantee the packing and alignment of a class to value bytes.
Integer vectorization using emulation.
Definition SimdEMUL.h:36
Integer vectorization using emulation.
Definition SimdEMUL.h:107
Vectorization of reals using SSE.
Definition SimdSSE.h:217
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
Int32 Integer
Type representing an integer.
double Real
Type representing a real number.
SSESimdX4Real SSESimdReal
Vector of 'double' in SSE implementation.
Definition SimdSSE.h:420
std::int32_t Int32
Signed integer type of 32 bits.