14#include "arccore/accelerator_native/SyclAccelerator.h"
16#include "arcane/accelerator/core/Runner.h"
17#include "arcane/accelerator/core/RunQueue.h"
18#include "arcane/accelerator/RunCommandLoop.h"
19#include "arcane/accelerator/Reduce.h"
20#include "arcane/accelerator/Scan.h"
22#include "arcane/utils/NumArray.h"
32extern "C" int arcaneTestSycl1()
35 std::cout <<
"TEST1\n";
38 std::cout <<
"Device: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
40 int* data = sycl::malloc_shared<int>(N, q);
42 for (
int i = 0; i < N; i++)
45 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
50 for (
int i = 0; i < N; i++)
51 std::cout << data[i] << std::endl;
61extern "C" int arcaneTestSycl2()
64 std::cout <<
"TEST 2\n";
70 for (
int i = 0; i < N; i++)
74 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
79 for (
int i = 0; i < N; i++)
80 std::cout << data[i] << std::endl;
89extern "C" int arcaneTestSycl3()
92 std::cout <<
"TEST 3\n";
100 for (
int i = 0; i < N; i++)
113 for (
int i = 0; i < N; i++)
114 std::cout << data[i] << std::endl;
122extern "C" int arcaneTestSycl4()
126 constexpr Int32 BLOCK_SIZE = 128;
129 const int NB_BLOCK = 152;
130 const int N = BLOCK_SIZE * NB_BLOCK;
131 std::cout <<
"TEST 4\n";
133 sycl::device device{ sycl::gpu_selector_v };
134 Int64 mcu = device.get_info<sycl::info::device::max_compute_units>();
135 Int64 mwg = device.get_info<sycl::info::device::max_work_group_size>();
136 std::cout <<
"DEVICE mcu=" << mcu <<
" mwg=" << mwg <<
"\n";
137 sycl::queue q{ device };
143 atomic_counter.resize(1);
145 Int64 ref_total_reduce = 0;
146 for (
int i = 0; i < N; i++) {
147 data_to_reduce[i] = i;
148 ref_total_reduce += data_to_reduce[i];
151 Span<Int32> out_atomic_counter(atomic_counter.to1DSpan());
153 q.single_task([=]() {
154 out_atomic_counter[0] = 0;
158 const int nb_iter = 1;
159 for (Int32 iter = 0; iter < nb_iter; ++iter) {
161 Span<Int32> in_data_to_reduce(data_to_reduce.to1DSpan());
162 Span<Int64> inout_data_partial_reduce(data_partial_reduce.to1DSpan());
163 Int32* atomic_counter_ptr = out_atomic_counter.data();
164 q.parallel_for(sycl::nd_range<1>(N, BLOCK_SIZE), [=](sycl::nd_item<1>
id) {
165 Int32 i =
static_cast<Int32
>(
id.get_global_id());
166 const Int32 global_id =
static_cast<Int32
>(
id.get_global_id(0));
167 const Int32 local_id =
static_cast<Int32
>(
id.get_local_id(0));
168 const Int32 group_id =
static_cast<Int32
>(
id.get_group_linear_id());
169 const Int32 sub_group_id =
static_cast<Int32
>(
id.get_sub_group().get_local_id());
170 Int32 nb_block =
static_cast<Int32
>(
id.get_group_range(0));
172 inout_data1(i, 0) = global_id;
173 inout_data1(i, 1) = local_id;
174 inout_data1(i, 2) = group_id;
175 inout_data1(i, 3) = sub_group_id;
176 inout_data1(i, 5) = 0;
177 Int32 v = in_data_to_reduce[i];
179 bool is_last =
false;
180 id.barrier(sycl::access::fence_space::local_space);
182 Int32 vx = sycl::reduce_over_group(
id.get_group(), v, sycl::plus<Int32>{});
183 inout_data1(i, 0) = vx;
189 inout_data1(i, 4) = local_sum;
190 inout_data_partial_reduce[group_id] = local_sum;
191 sycl::atomic_ref<Int32, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
192 Int32 cx = a.fetch_add(1);
193 inout_data1(i, 5) = cx;
194 if (cx == (nb_block - 1))
197 id.barrier(sycl::access::fence_space::local_space);
202 for (
int x = 0; x < nb_block; ++x)
203 my_total += inout_data_partial_reduce[x];
205 inout_data_partial_reduce[0] = my_total;
206 *atomic_counter_ptr = 0;
211 Int64 kernel_total = data_partial_reduce[0];
212 std::cout <<
"N=" << N <<
" REF_TOTAL=" << ref_total_reduce <<
" computed=" << kernel_total <<
"\n";
213 bool do_verbose =
true;
215 for (
int i = 0; i < N; i++) {
218 std::cout <<
"I=" << i <<
" global_id=" << data1(i, 0)
219 <<
" local_id=" << data1(i, 1)
220 <<
" group_id=" << data1(i, 2)
221 <<
" sub_group_local_id=" << data1(i, 3)
222 <<
" v=" << data1(i, 4)
226 std::cout <<
"FINAL_N=" << N <<
" REF_TOTAL=" << ref_total_reduce <<
" computed=" << kernel_total <<
"\n";
233extern "C" void arcaneTestSycl5()
237 constexpr int N = 25;
241 for (
int i = 0; i < N; i++)
251 reducer1.add(inout_data[i]);
260extern "C" void arcaneTestSycl6()
267 constexpr int N = 4789;
277 Int64 total_exclusive = 7;
278 for (
int i = 0; i < N; i++) {
279 expected_exclusive_data[i] = total_exclusive;
282 total_exclusive += data[i];
283 expected_inclusive_data[i] = total;
286 Arcane::Accelerator::Impl::SyclScanner<false, Int64, ScannerSumOperator<Int64>> scanner;
287 scanner.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
289 const bool do_verbose = (N < 256);
290 for (
int i = 0; i < N; i++) {
291 bool is_bad = out_data[i] != expected_inclusive_data[i];
292 if (do_verbose || is_bad)
293 std::cout <<
"OUT_INCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_inclusive_data[i] <<
"\n";
297 std::cout <<
"FINAL OUT_INCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_inclusive_data[N - 1] <<
"\n";
299 Arcane::Accelerator::Impl::SyclScanner<true, Int64, ScannerSumOperator<Int64>> scanner2;
300 scanner2.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
302 for (
int i = 0; i < N; i++) {
303 bool is_bad = out_data[i] != expected_exclusive_data[i];
304 if (do_verbose || is_bad)
305 std::cout <<
"OUT_EXCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_exclusive_data[i] <<
"\n";
309 std::cout <<
"FINAL OUT_EXCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_exclusive_data[N - 1] <<
"\n";
312extern "C" void arcaneTestSycl7()
316 constexpr int N = 63;
329 Int64 total_exclusive = 7;
330 for (
int i = 0; i < N; i++) {
331 expected_exclusive_data[i] = total_exclusive;
334 total_exclusive += data[i];
335 expected_inclusive_data[i] = total;
338 Arcane::Accelerator::Impl::SyclScanner<false, Int64, ScannerSumOperator<Int64>> scanner;
339 scanner.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
341 const bool do_verbose = (N < 256);
342 for (
int i = 0; i < N; i++) {
343 bool is_bad = out_data[i] != expected_inclusive_data[i];
344 if (do_verbose || is_bad)
345 std::cout <<
"OUT_INCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_inclusive_data[i] <<
"\n";
349 std::cout <<
"FINAL OUT_INCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_inclusive_data[N - 1] <<
"\n";
351 Arcane::Accelerator::Impl::SyclScanner<true, Int64, ScannerSumOperator<Int64>> scanner2;
352 scanner2.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
354 for (
int i = 0; i < N; i++) {
355 bool is_bad = out_data[i] != expected_exclusive_data[i];
356 if (do_verbose || is_bad)
357 std::cout <<
"OUT_EXCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_exclusive_data[i] <<
"\n";
361 std::cout <<
"FINAL OUT_EXCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_exclusive_data[N - 1] <<
"\n";
#define ARCANE_FATAL(...)
Macro throwing a FatalErrorException.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
1D loop on accelerator with additional arguments.
Class to perform a 'sum' reduction.
Execution queue for an accelerator.
Execution manager for accelerator.
Base class for multi-dimensional views.
Multi-dimensional arrays for numerical types accessible on accelerators.
View of an array of elements of type T.
Namespace for accelerator usage.
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
RunQueue makeQueue(const Runner &runner)
Creates a queue associated with runner.
@ SYCL
Execution policy using the SYCL environment.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
@ Device
Allocates on the device.