Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
Test.sycl.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Test.sycl.cc (C) 2000-2026 */
9/* */
10/* File containing tests for the SYCL implementation. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/accelerator_native/SyclAccelerator.h"
15
16#include "arcane/accelerator/core/Runner.h"
17#include "arcane/accelerator/core/RunQueue.h"
18#include "arcane/accelerator/RunCommandLoop.h"
19#include "arcane/accelerator/Reduce.h"
20#include "arcane/accelerator/Scan.h"
21
22#include "arcane/utils/NumArray.h"
23
24using namespace Arccore;
25using namespace Arcane;
26using namespace Arcane::Accelerator;
27
28/*---------------------------------------------------------------------------*/
29/*---------------------------------------------------------------------------*/
30
31// Test Pure SYCL Call
32extern "C" int arcaneTestSycl1()
33{
34 const int N = 8;
35 std::cout << "TEST1\n";
36
37 sycl::queue q;
38 std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
39
40 int* data = sycl::malloc_shared<int>(N, q);
41
42 for (int i = 0; i < N; i++)
43 data[i] = i;
44
45 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
46 data[i] *= 2;
47 })
48 .wait();
49
50 for (int i = 0; i < N; i++)
51 std::cout << data[i] << std::endl;
52 sycl::free(data, q);
53
54 return 0;
55}
56
57/*---------------------------------------------------------------------------*/
58/*---------------------------------------------------------------------------*/
59
60// Same as Test1 with NumArray
61extern "C" int arcaneTestSycl2()
62{
63 const int N = 8;
64 std::cout << "TEST 2\n";
65
66 sycl::queue q;
67
69
70 for (int i = 0; i < N; i++)
71 data[i] = i;
72
73 Span<Int32> inout_data(data.to1DSpan());
74 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
75 inout_data[i] *= 3;
76 })
77 .wait();
78
79 for (int i = 0; i < N; i++)
80 std::cout << data[i] << std::endl;
81
82 return 0;
83}
84
85/*---------------------------------------------------------------------------*/
86/*---------------------------------------------------------------------------*/
87
88// Same as Test1 with NumArray
89extern "C" int arcaneTestSycl3()
90{
91 const int N = 12;
92 std::cout << "TEST 3\n";
93
94 Runner runner_sycl(eExecutionPolicy::SYCL);
95 RunQueue queue{ makeQueue(runner_sycl) };
96 sycl::queue q;
97
99
100 for (int i = 0; i < N; i++)
101 data[i] = i;
102
103 {
104 auto command = makeCommand(queue);
105 Span<Int32> inout_data(data.to1DSpan());
106 command << RUNCOMMAND_LOOP1(iter, N)
107 {
108 auto [i] = iter();
109 inout_data[i] *= 4;
110 };
111 }
112
113 for (int i = 0; i < N; i++)
114 std::cout << data[i] << std::endl;
115
116 return 0;
117}
118
119/*---------------------------------------------------------------------------*/
120/*---------------------------------------------------------------------------*/
121
122extern "C" int arcaneTestSycl4()
123{
124 // device.get_info<cl::sycl::info::device::max_work_group_size>();
125 //constexpr Int32 WARP_SIZE = 32;
126 constexpr Int32 BLOCK_SIZE = 128;
127
128 //const int nb_block = 152 * 15 * 12;
129 const int NB_BLOCK = 152;
130 const int N = BLOCK_SIZE * NB_BLOCK;
131 std::cout << "TEST 4\n";
132
133 sycl::device device{ sycl::gpu_selector_v };
134 Int64 mcu = device.get_info<sycl::info::device::max_compute_units>();
135 Int64 mwg = device.get_info<sycl::info::device::max_work_group_size>();
136 std::cout << "DEVICE mcu=" << mcu << " mwg=" << mwg << "\n";
137 sycl::queue q{ device };
138
139 NumArray<Int32, MDDim2> data1(N, 6);
140 NumArray<Int32, MDDim1> data_to_reduce(N);
141 NumArray<Int64, MDDim1> data_partial_reduce(NB_BLOCK);
143 atomic_counter.resize(1);
144
145 Int64 ref_total_reduce = 0;
146 for (int i = 0; i < N; i++) {
147 data_to_reduce[i] = i;
148 ref_total_reduce += data_to_reduce[i];
149 }
150
151 Span<Int32> out_atomic_counter(atomic_counter.to1DSpan());
152 {
153 q.single_task([=]() {
154 out_atomic_counter[0] = 0;
155 });
156 }
157
158 const int nb_iter = 1;
159 for (Int32 iter = 0; iter < nb_iter; ++iter) {
160 MDSpan<Int32, MDDim2> inout_data1(data1.mdspan());
161 Span<Int32> in_data_to_reduce(data_to_reduce.to1DSpan());
162 Span<Int64> inout_data_partial_reduce(data_partial_reduce.to1DSpan());
163 Int32* atomic_counter_ptr = out_atomic_counter.data();
164 q.parallel_for(sycl::nd_range<1>(N, BLOCK_SIZE), [=](sycl::nd_item<1> id) {
165 Int32 i = static_cast<Int32>(id.get_global_id());
166 const Int32 global_id = static_cast<Int32>(id.get_global_id(0));
167 const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
168 const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
169 const Int32 sub_group_id = static_cast<Int32>(id.get_sub_group().get_local_id());
170 Int32 nb_block = static_cast<Int32>(id.get_group_range(0));
171 //Int32 nb_thread = static_cast<Int32>(id.get_local_range(0));
172 inout_data1(i, 0) = global_id;
173 inout_data1(i, 1) = local_id;
174 inout_data1(i, 2) = group_id;
175 inout_data1(i, 3) = sub_group_id;
176 inout_data1(i, 5) = 0;
177 Int32 v = in_data_to_reduce[i];
178 Int32 local_sum = 0;
179 bool is_last = false;
180 id.barrier(sycl::access::fence_space::local_space);
181 //Int32 v2_bis = id.get_sub_group().shuffle_down(v,1);
182 Int32 vx = sycl::reduce_over_group(id.get_group(), v, sycl::plus<Int32>{});
183 inout_data1(i, 0) = vx;
184 if (local_id == 0) {
185 //Int32 base = global_id;
186 //for (Int32 x = 0; x < nb_thread; ++x)
187 //local_sum += in_data_to_reduce[x + base];
188 local_sum = vx;
189 inout_data1(i, 4) = local_sum;
190 inout_data_partial_reduce[group_id] = local_sum;
191 sycl::atomic_ref<Int32, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
192 Int32 cx = a.fetch_add(1);
193 inout_data1(i, 5) = cx;
194 if (cx == (nb_block - 1))
195 is_last = true;
196 }
197 id.barrier(sycl::access::fence_space::local_space);
198 // I am the last one to perform the reduction.
199 // Calculate the final reduction
200 if (is_last) {
201 Int64 my_total = 0;
202 for (int x = 0; x < nb_block; ++x)
203 my_total += inout_data_partial_reduce[x];
204 // Put the final result in the first element of the array.
205 inout_data_partial_reduce[0] = my_total;
206 *atomic_counter_ptr = 0;
207 }
208 })
209 .wait();
210 }
211 Int64 kernel_total = data_partial_reduce[0];
212 std::cout << "N=" << N << " REF_TOTAL=" << ref_total_reduce << " computed=" << kernel_total << "\n";
213 bool do_verbose = true;
214 if (do_verbose) {
215 for (int i = 0; i < N; i++) {
216 Int32 imod = i % 32;
217 if (imod < 2)
218 std::cout << "I=" << i << " global_id=" << data1(i, 0)
219 << " local_id=" << data1(i, 1)
220 << " group_id=" << data1(i, 2)
221 << " sub_group_local_id=" << data1(i, 3)
222 << " v=" << data1(i, 4)
223 << std::endl;
224 }
225 }
226 std::cout << "FINAL_N=" << N << " REF_TOTAL=" << ref_total_reduce << " computed=" << kernel_total << "\n";
227 return 0;
228}
229
230/*---------------------------------------------------------------------------*/
231/*---------------------------------------------------------------------------*/
232
233extern "C" void arcaneTestSycl5()
234{
236 RunQueue queue{ makeQueue(runner) };
237 constexpr int N = 25;
238
240
241 for (int i = 0; i < N; i++)
242 data[i] = i;
243
244 {
245 auto command = makeCommand(queue);
246 Span<Int32> inout_data(data.to1DSpan());
247 ReducerSum<Int64> reducer1(command);
248 command << RUNCOMMAND_LOOP1(iter, N)
249 {
250 auto [i] = iter();
251 reducer1.add(inout_data[i]);
252 inout_data[i] *= 4;
253 };
254 }
255}
256
257/*---------------------------------------------------------------------------*/
258/*---------------------------------------------------------------------------*/
259
260extern "C" void arcaneTestSycl6()
261{
263 RunQueue queue{ makeQueue(runner) };
264 //constexpr int N = 63;
265 //constexpr int N = 139;
266 //constexpr int N = 256;
267 constexpr int N = 4789;
268 // A TESTER
269 //constexpr int N = 16900;
270 //constexpr int N = 1000000;
272 NumArray<Int64, MDDim1> out_data(N);
273 NumArray<Int64, MDDim1> expected_inclusive_data(N);
274 NumArray<Int64, MDDim1> expected_exclusive_data(N);
275
276 Int64 total = 0;
277 Int64 total_exclusive = 7;
278 for (int i = 0; i < N; i++) {
279 expected_exclusive_data[i] = total_exclusive;
280 data[i] = (i + 2);
281 total += data[i];
282 total_exclusive += data[i];
283 expected_inclusive_data[i] = total;
284 }
285
286 Arcane::Accelerator::Impl::SyclScanner<false, Int64, ScannerSumOperator<Int64>> scanner;
287 scanner.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
288
289 const bool do_verbose = (N < 256);
290 for (int i = 0; i < N; i++) {
291 bool is_bad = out_data[i] != expected_inclusive_data[i];
292 if (do_verbose || is_bad)
293 std::cout << "OUT_INCL=" << i << " v=" << out_data[i] << " expected=" << expected_inclusive_data[i] << "\n";
294 if (is_bad)
295 ARCANE_FATAL("Bad value");
296 }
297 std::cout << "FINAL OUT_INCL=" << (N - 1) << " v=" << out_data[N - 1] << " expected=" << expected_inclusive_data[N - 1] << "\n";
298
299 Arcane::Accelerator::Impl::SyclScanner<true, Int64, ScannerSumOperator<Int64>> scanner2;
300 scanner2.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
301
302 for (int i = 0; i < N; i++) {
303 bool is_bad = out_data[i] != expected_exclusive_data[i];
304 if (do_verbose || is_bad)
305 std::cout << "OUT_EXCL=" << i << " v=" << out_data[i] << " expected=" << expected_exclusive_data[i] << "\n";
306 if (is_bad)
307 ARCANE_FATAL("Bad value");
308 }
309 std::cout << "FINAL OUT_EXCL=" << (N - 1) << " v=" << out_data[N - 1] << " expected=" << expected_exclusive_data[N - 1] << "\n";
310}
311
312extern "C" void arcaneTestSycl7()
313{
315 RunQueue queue{ makeQueue(runner) };
316 constexpr int N = 63;
317 //constexpr int N = 139;
318 //constexpr int N = 256;
319 //constexpr int N = 4789;
320 // A TESTER
321 //constexpr int N = 16900;
322 //constexpr int N = 1000000;
324 NumArray<Int64, MDDim1> out_data(N);
325 NumArray<Int64, MDDim1> expected_inclusive_data(N);
326 NumArray<Int64, MDDim1> expected_exclusive_data(N);
327
328 Int64 total = 0;
329 Int64 total_exclusive = 7;
330 for (int i = 0; i < N; i++) {
331 expected_exclusive_data[i] = total_exclusive;
332 data[i] = (i + 2);
333 total += data[i];
334 total_exclusive += data[i];
335 expected_inclusive_data[i] = total;
336 }
337
338 Arcane::Accelerator::Impl::SyclScanner<false, Int64, ScannerSumOperator<Int64>> scanner;
339 scanner.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
340
341 const bool do_verbose = (N < 256);
342 for (int i = 0; i < N; i++) {
343 bool is_bad = out_data[i] != expected_inclusive_data[i];
344 if (do_verbose || is_bad)
345 std::cout << "OUT_INCL=" << i << " v=" << out_data[i] << " expected=" << expected_inclusive_data[i] << "\n";
346 if (is_bad)
347 ARCANE_FATAL("Bad value");
348 }
349 std::cout << "FINAL OUT_INCL=" << (N - 1) << " v=" << out_data[N - 1] << " expected=" << expected_inclusive_data[N - 1] << "\n";
350
351 Arcane::Accelerator::Impl::SyclScanner<true, Int64, ScannerSumOperator<Int64>> scanner2;
352 scanner2.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
353
354 for (int i = 0; i < N; i++) {
355 bool is_bad = out_data[i] != expected_exclusive_data[i];
356 if (do_verbose || is_bad)
357 std::cout << "OUT_EXCL=" << i << " v=" << out_data[i] << " expected=" << expected_exclusive_data[i] << "\n";
358 if (is_bad)
359 ARCANE_FATAL("Bad value");
360 }
361 std::cout << "FINAL OUT_EXCL=" << (N - 1) << " v=" << out_data[N - 1] << " expected=" << expected_exclusive_data[N - 1] << "\n";
362}
#define ARCANE_FATAL(...)
Macro throwing a FatalErrorException.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
1D loop on accelerator with additional arguments.
Base class for multi-dimensional views.
Multi-dimensional arrays for numerical types accessible on accelerators.
View of an array of elements of type T.
Definition Span.h:635
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
RunQueue makeQueue(const Runner &runner)
Creates a queue associated with runner.
@ SYCL
Execution policy using the SYCL environment.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
@ Device
Allocates on the device.
Namespace of Arccore.