#include "arcane/accelerator/sycl/SyclAccelerator.h"
#include "arcane/accelerator/core/Runner.h"
#include "arcane/accelerator/core/RunQueue.h"
#include "arcane/utils/NumArray.h"
extern "C" int arcaneTestSycl1()
{
  std::cout << "TEST1\n";
  // (the declarations of the queue 'q' and of the size 'N' are elided in this listing)
  std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
  // Unified shared memory: the allocation is accessible from both host and device.
  int* data = sycl::malloc_shared<int>(N, q);
  for (int i = 0; i < N; i++)
    // ... (host-side initialization elided)
  // One work-item per element.
  q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
    // ... (kernel body elided)
  });
  for (int i = 0; i < N; i++)
    std::cout << data[i] << std::endl;
  // ... (cleanup and return elided)
}
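// Illustrative sketch (not part of the original file): the complete unified-shared-memory
// pattern that arcaneTestSycl1 follows. The size N and the doubling kernel are assumptions
// standing in for the elided lines; <sycl/sycl.hpp> and <iostream> are assumed to be
// available through the headers above.
int arcaneTestSycl1Sketch()
{
  constexpr int N = 16;
  sycl::queue q; // default device selection
  // malloc_shared memory can be read and written from both host and device.
  int* data = sycl::malloc_shared<int>(N, q);
  for (int i = 0; i < N; i++)
    data[i] = i; // host-side initialization
  // One work-item per element; wait() makes the host reads below safe.
  q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
    data[i] = data[i] * 2;
  }).wait();
  for (int i = 0; i < N; i++)
    std::cout << data[i] << std::endl;
  sycl::free(data, q);
  return 0;
}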
extern "C" int arcaneTestSycl2()
{
  std::cout << "TEST 2\n";
  // ... (queue and data setup elided in this listing)
  for (int i = 0; i < N; i++)
    // ... (host-side initialization elided)
  q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
    // ... (kernel body elided)
  });
  for (int i = 0; i < N; i++)
    std::cout << data[i] << std::endl;
  // ... (cleanup and return elided)
}
extern "C" int arcaneTestSycl3()
{
  std::cout << "TEST 3\n";
  // ... (queue and data setup elided in this listing)
  for (int i = 0; i < N; i++)
    // ... (host-side initialization elided)
  // ... (kernel launch elided)
  for (int i = 0; i < N; i++)
    std::cout << data[i] << std::endl;
  // ... (cleanup and return elided)
}
extern "C" int arcaneTestSycl4()
{
  constexpr Int32 BLOCK_SIZE = 128;
  const int NB_BLOCK = 152;
  const int N = BLOCK_SIZE * NB_BLOCK;
  std::cout << "TEST 4\n";
  sycl::device device{ sycl::gpu_selector_v };
  Int64 mcu = device.get_info<sycl::info::device::max_compute_units>();
  Int64 mwg = device.get_info<sycl::info::device::max_work_group_size>();
  std::cout << "DEVICE mcu=" << mcu << " mwg=" << mwg << "\n";
  sycl::queue q{ device };
  // ... (declarations of data1, data_to_reduce, data_partial_reduce and atomic_counter
  //      are elided in this listing)
  atomic_counter.resize(1);
  // Reference sum computed on the host, used to check the kernel result.
  Int64 ref_total_reduce = 0;
  for (int i = 0; i < N; i++) {
    data_to_reduce[i] = i;
    ref_total_reduce += data_to_reduce[i];
  }
  Span<Int32> out_atomic_counter(atomic_counter.to1DSpan());
  // Reset the device-side counter used to detect the last work-group.
  q.single_task([=]() {
    out_atomic_counter[0] = 0;
  });

  const int nb_iter = 1;
  for (Int32 iter = 0; iter < nb_iter; ++iter) {
    Span<Int32> in_data_to_reduce(data_to_reduce.to1DSpan());
    Span<Int64> inout_data_partial_reduce(data_partial_reduce.to1DSpan());
    Int32* atomic_counter_ptr = out_atomic_counter.data();
    q.parallel_for(sycl::nd_range<1>(N, BLOCK_SIZE), [=](sycl::nd_item<1> id) {
      Int32 i = static_cast<Int32>(id.get_global_id());
      const Int32 global_id = static_cast<Int32>(id.get_global_id(0));
      const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
      const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
      const Int32 sub_group_id = static_cast<Int32>(id.get_sub_group().get_local_id());
      Int32 nb_block = static_cast<Int32>(id.get_group_range(0));
      inout_data1(i, 0) = global_id;
      inout_data1(i, 1) = local_id;
      inout_data1(i, 2) = group_id;
      inout_data1(i, 3) = sub_group_id;
      inout_data1(i, 5) = 0;
      Int32 v = in_data_to_reduce[i];
      bool is_last = false;
      id.barrier(sycl::access::fence_space::local_space);
      // Work-group reduction: every work-item of the group gets the group-wide sum.
      Int32 vx = sycl::reduce_over_group(id.get_group(), v, sycl::plus<Int32>{});
      inout_data1(i, 0) = vx;
      // ... (computation of 'local_sum' elided in this listing)
      inout_data1(i, 4) = local_sum;
      inout_data_partial_reduce[group_id] = local_sum;
      // Count the work-groups that have published their partial sum.
      sycl::atomic_ref<Int32, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
      Int32 cx = a.fetch_add(1);
      inout_data1(i, 5) = cx;
      if (cx == (nb_block - 1))
        is_last = true; // (assumed: the body of this 'if' is elided in the listing)
      id.barrier(sycl::access::fence_space::local_space);
      // ... (elided: guarded by 'is_last', the last group declares 'my_total' and
      //      performs the final accumulation of the partial sums below)
      for (int x = 0; x < nb_block; ++x)
        my_total += inout_data_partial_reduce[x];
      inout_data_partial_reduce[0] = my_total;
      *atomic_counter_ptr = 0;
      // ... (end of the kernel and queue synchronization elided in this listing)
    });
  }
  Int64 kernel_total = data_partial_reduce[0];
  std::cout << "N=" << N << " REF_TOTAL=" << ref_total_reduce << " computed=" << kernel_total << "\n";
  bool do_verbose = true; // (used in elided lines)
  for (int i = 0; i < N; i++) {
    // ... (verbosity test elided)
    std::cout << "I=" << i << " global_id=" << data1(i, 0)
              << " local_id=" << data1(i, 1)
              << " group_id=" << data1(i, 2)
              << " sub_group_local_id=" << data1(i, 3)
              << " v=" << data1(i, 4)
              << "\n"; // (assumed terminator; remaining output elided in the listing)
  }
  std::cout << "FINAL_N=" << N << " REF_TOTAL=" << ref_total_reduce << " computed=" << kernel_total << "\n";
  // ... (return elided)
}
extern "C" void arcaneTestSycl5()
{
  constexpr int N = 25;
  // ... (Runner, RunQueue, RunCommand and NumArray setup elided in this listing)
  for (int i = 0; i < N; i++)
    // ... (host-side initialization elided)
  // ... (accelerator loop elided; inside it, each value is accumulated into a 'sum' reducer)
  reducer1.add(inout_data[i]);
  // ... (end of the loop and check of the reduced value elided)
}
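// Illustrative sketch (not part of the original file) of the Arcane accelerator API that
// arcaneTestSycl5 appears to exercise: makeQueue()/makeCommand() (declared in the headers
// above), a RUNCOMMAND_LOOP1 accelerator loop and a 'sum' reducer. The extra includes, the
// ReducerSum/viewIn usage and the Runner parameter are assumptions; check the Arcane
// documentation of the version in use for the exact API.
//
// Assumed additional includes:
//   "arcane/accelerator/RunCommandLoop.h", "arcane/accelerator/Reduce.h",
//   "arcane/accelerator/NumArrayViews.h"
using namespace Arcane;
using namespace Arcane::Accelerator;

Int64 sumOnAccelerator(Runner& runner)
{
  constexpr int N = 25;
  NumArray<Int64, MDDim1> data(N);
  for (int i = 0; i < N; i++)
    data[i] = i; // host-side initialization
  RunQueue queue = makeQueue(runner);
  RunCommand command = makeCommand(queue);
  ReducerSum<Int64> reducer1(command); // 'sum' reducer attached to the command
  auto in_data = viewIn(command, data); // read-only view usable inside the loop
  command << RUNCOMMAND_LOOP1(iter, N)
  {
    auto [i] = iter();
    reducer1.add(in_data(i)); // accumulate one value per iteration
  };
  return reducer1.reduce(); // final reduced value on the host
}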