14#include "arccore/accelerator_native/SyclAccelerator.h"
16#include "arcane/accelerator/core/Runner.h"
17#include "arcane/accelerator/core/RunQueue.h"
18#include "arcane/accelerator/RunCommandLoop.h"
19#include "arcane/accelerator/Reduce.h"
20#include "arcane/accelerator/Scan.h"
22#include "arcane/utils/NumArray.h"
31extern "C" int arcaneTestSycl1()
34 std::cout <<
"TEST1\n";
37 std::cout <<
"Device: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
39 int* data = sycl::malloc_shared<int>(N, q);
41 for (
int i = 0; i < N; i++)
44 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
49 for (
int i = 0; i < N; i++)
50 std::cout << data[i] << std::endl;
60extern "C" int arcaneTestSycl2()
63 std::cout <<
"TEST 2\n";
69 for (
int i = 0; i < N; i++)
73 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
78 for (
int i = 0; i < N; i++)
79 std::cout << data[i] << std::endl;
88extern "C" int arcaneTestSycl3()
91 std::cout <<
"TEST 3\n";
99 for (
int i = 0; i < N; i++)
112 for (
int i = 0; i < N; i++)
113 std::cout << data[i] << std::endl;
121extern "C" int arcaneTestSycl4()
125 constexpr Int32 BLOCK_SIZE = 128;
128 const int NB_BLOCK = 152;
129 const int N = BLOCK_SIZE * NB_BLOCK;
130 std::cout <<
"TEST 4\n";
132 sycl::device device{ sycl::gpu_selector_v };
133 Int64 mcu = device.get_info<sycl::info::device::max_compute_units>();
134 Int64 mwg = device.get_info<sycl::info::device::max_work_group_size>();
135 std::cout <<
"DEVICE mcu=" << mcu <<
" mwg=" << mwg <<
"\n";
136 sycl::queue q{ device };
142 atomic_counter.resize(1);
144 Int64 ref_total_reduce = 0;
145 for (
int i = 0; i < N; i++) {
146 data_to_reduce[i] = i;
147 ref_total_reduce += data_to_reduce[i];
150 Span<Int32> out_atomic_counter(atomic_counter.to1DSpan());
152 q.single_task([=]() {
153 out_atomic_counter[0] = 0;
157 const int nb_iter = 1;
158 for (Int32 iter = 0; iter < nb_iter; ++iter) {
160 Span<Int32> in_data_to_reduce(data_to_reduce.to1DSpan());
161 Span<Int64> inout_data_partial_reduce(data_partial_reduce.to1DSpan());
162 Int32* atomic_counter_ptr = out_atomic_counter.data();
163 q.parallel_for(sycl::nd_range<1>(N, BLOCK_SIZE), [=](sycl::nd_item<1>
id) {
164 Int32 i =
static_cast<Int32
>(
id.get_global_id());
165 const Int32 global_id =
static_cast<Int32
>(
id.get_global_id(0));
166 const Int32 local_id =
static_cast<Int32
>(
id.get_local_id(0));
167 const Int32 group_id =
static_cast<Int32
>(
id.get_group_linear_id());
168 const Int32 sub_group_id =
static_cast<Int32
>(
id.get_sub_group().get_local_id());
169 Int32 nb_block =
static_cast<Int32
>(
id.get_group_range(0));
171 inout_data1(i, 0) = global_id;
172 inout_data1(i, 1) = local_id;
173 inout_data1(i, 2) = group_id;
174 inout_data1(i, 3) = sub_group_id;
175 inout_data1(i, 5) = 0;
176 Int32 v = in_data_to_reduce[i];
178 bool is_last =
false;
179 id.barrier(sycl::access::fence_space::local_space);
181 Int32 vx = sycl::reduce_over_group(
id.get_group(),v,sycl::plus<Int32>{});
182 inout_data1(i, 0) = vx;
188 inout_data1(i, 4) = local_sum;
189 inout_data_partial_reduce[group_id] = local_sum;
190 sycl::atomic_ref<Int32, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
191 Int32 cx = a.fetch_add(1);
192 inout_data1(i, 5) = cx;
193 if (cx == (nb_block - 1))
196 id.barrier(sycl::access::fence_space::local_space);
201 for (
int x = 0; x < nb_block; ++x)
202 my_total += inout_data_partial_reduce[x];
204 inout_data_partial_reduce[0] = my_total;
205 *atomic_counter_ptr = 0;
210 Int64 kernel_total = data_partial_reduce[0];
211 std::cout <<
"N=" << N <<
" REF_TOTAL=" << ref_total_reduce <<
" computed=" << kernel_total <<
"\n";
212 bool do_verbose =
true;
214 for (
int i = 0; i < N; i++) {
217 std::cout <<
"I=" << i <<
" global_id=" << data1(i, 0)
218 <<
" local_id=" << data1(i, 1)
219 <<
" group_id=" << data1(i, 2)
220 <<
" sub_group_local_id=" << data1(i, 3)
221 <<
" v=" << data1(i, 4)
225 std::cout <<
"FINAL_N=" << N <<
" REF_TOTAL=" << ref_total_reduce <<
" computed=" << kernel_total <<
"\n";
232extern "C" void arcaneTestSycl5()
236 constexpr int N = 25;
240 for (
int i = 0; i < N; i++)
250 reducer1.add(inout_data[i]);
259extern "C" void arcaneTestSycl6()
266 constexpr int N = 4789;
276 Int64 total_exclusive = 7;
277 for (
int i = 0; i < N; i++) {
278 expected_exclusive_data[i] = total_exclusive;
281 total_exclusive += data[i];
282 expected_inclusive_data[i] = total;
285 Arcane::Accelerator::impl::SyclScanner<false, Int64, ScannerSumOperator<Int64>> scanner;
286 scanner.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
288 const bool do_verbose = (N < 256);
289 for (
int i = 0; i < N; i++) {
290 bool is_bad = out_data[i] != expected_inclusive_data[i];
291 if (do_verbose || is_bad)
292 std::cout <<
"OUT_INCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_inclusive_data[i] <<
"\n";
296 std::cout <<
"FINAL OUT_INCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_inclusive_data[N - 1] <<
"\n";
298 Arcane::Accelerator::impl::SyclScanner<true, Int64, ScannerSumOperator<Int64>> scanner2;
299 scanner2.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
301 for (
int i = 0; i < N; i++) {
302 bool is_bad = out_data[i] != expected_exclusive_data[i];
303 if (do_verbose || is_bad)
304 std::cout <<
"OUT_EXCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_exclusive_data[i] <<
"\n";
308 std::cout <<
"FINAL OUT_EXCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_exclusive_data[N - 1] <<
"\n";
311extern "C" void arcaneTestSycl7()
315 constexpr int N = 63;
328 Int64 total_exclusive = 7;
329 for (
int i = 0; i < N; i++) {
330 expected_exclusive_data[i] = total_exclusive;
333 total_exclusive += data[i];
334 expected_inclusive_data[i] = total;
337 Arcane::Accelerator::impl::SyclScanner<false, Int64, ScannerSumOperator<Int64>> scanner;
338 scanner.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
340 const bool do_verbose = (N < 256);
341 for (
int i = 0; i < N; i++) {
342 bool is_bad = out_data[i] != expected_inclusive_data[i];
343 if (do_verbose || is_bad)
344 std::cout <<
"OUT_INCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_inclusive_data[i] <<
"\n";
348 std::cout <<
"FINAL OUT_INCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_inclusive_data[N - 1] <<
"\n";
350 Arcane::Accelerator::impl::SyclScanner<true, Int64, ScannerSumOperator<Int64>> scanner2;
351 scanner2.doScan(queue, data.to1DSmallSpan(), out_data.to1DSmallSpan(), 7);
353 for (
int i = 0; i < N; i++) {
354 bool is_bad = out_data[i] != expected_exclusive_data[i];
355 if (do_verbose || is_bad)
356 std::cout <<
"OUT_EXCL=" << i <<
" v=" << out_data[i] <<
" expected=" << expected_exclusive_data[i] <<
"\n";
360 std::cout <<
"FINAL OUT_EXCL=" << (N - 1) <<
" v=" << out_data[N - 1] <<
" expected=" << expected_exclusive_data[N - 1] <<
"\n";
#define ARCANE_FATAL(...)
Macro envoyant une exception FatalErrorException.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle 1D sur accélérateur avec arguments supplémentaires.
Classe pour effectuer une réduction 'somme'.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Classe de base des vues multi-dimensionnelles.
Tableaux multi-dimensionnels pour les types numériques accessibles sur accélérateurs.
Vue d'un tableau d'éléments de type T.
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Créé une commande associée à la file run_queue.
RunQueue makeQueue(const Runner &runner)
Créé une file associée à runner.
@ SYCL
Politique d'exécution utilisant l'environnement SYCL.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
@ Device
Alloue sur le device.
Espace de nom de Arccore.