Arcane  v3.15.0.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
Test.sycl.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2024 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Test.sycl.cc (C) 2000-2024 */
9/* */
10/* Fichier contenant les tests pour l'implémentation SYCL. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arcane/accelerator/sycl/SyclAccelerator.h"
15
16#include "arcane/accelerator/core/Runner.h"
17#include "arcane/accelerator/core/RunQueue.h"
20
21#include "arcane/utils/NumArray.h"
22
23using namespace Arccore;
24using namespace Arcane;
25using namespace Arcane::Accelerator;
26
27/*---------------------------------------------------------------------------*/
28/*---------------------------------------------------------------------------*/
29
30// Test Appel pure SYCL
31extern "C" int arcaneTestSycl1()
32{
33 const int N = 8;
34 std::cout << "TEST1\n";
35
36 sycl::queue q;
37 std::cout << "Device: " << q.get_device().get_info<sycl::info::device::name>() << std::endl;
38
39 int* data = sycl::malloc_shared<int>(N, q);
40
41 for (int i = 0; i < N; i++)
42 data[i] = i;
43
44 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
45 data[i] *= 2;
46 })
47 .wait();
48
49 for (int i = 0; i < N; i++)
50 std::cout << data[i] << std::endl;
51 sycl::free(data, q);
52
53 return 0;
54}
55
56/*---------------------------------------------------------------------------*/
57/*---------------------------------------------------------------------------*/
58
59// Idem Test1 avec des NumArray
60extern "C" int arcaneTestSycl2()
61{
62 const int N = 8;
63 std::cout << "TEST 2\n";
64
65 sycl::queue q;
66
68
69 for (int i = 0; i < N; i++)
70 data[i] = i;
71
72 Span<Int32> inout_data(data.to1DSpan());
73 q.parallel_for(sycl::range<1>(N), [=](sycl::id<1> i) {
74 inout_data[i] *= 3;
75 })
76 .wait();
77
78 for (int i = 0; i < N; i++)
79 std::cout << data[i] << std::endl;
80
81 return 0;
82}
83
84/*---------------------------------------------------------------------------*/
85/*---------------------------------------------------------------------------*/
86
87// Idem Test1 avec des NumArray
88extern "C" int arcaneTestSycl3()
89{
90 const int N = 12;
91 std::cout << "TEST 3\n";
92
93 Runner runner_sycl(eExecutionPolicy::SYCL);
95 sycl::queue q;
96
98
99 for (int i = 0; i < N; i++)
100 data[i] = i;
101
102 {
103 auto command = makeCommand(queue);
104 Span<Int32> inout_data(data.to1DSpan());
105 command << RUNCOMMAND_LOOP1(iter, N)
106 {
107 auto [i] = iter();
108 inout_data[i] *= 4;
109 };
110 }
111
112 for (int i = 0; i < N; i++)
113 std::cout << data[i] << std::endl;
114
115 return 0;
116}
117
118/*---------------------------------------------------------------------------*/
119/*---------------------------------------------------------------------------*/
120
121extern "C" int arcaneTestSycl4()
122{
123 // device.get_info<cl::sycl::info::device::max_work_group_size>();
124 //constexpr Int32 WARP_SIZE = 32;
125 constexpr Int32 BLOCK_SIZE = 128;
126
127 //const int nb_block = 152 * 15 * 12;
128 const int NB_BLOCK = 152;
129 const int N = BLOCK_SIZE * NB_BLOCK;
130 std::cout << "TEST 4\n";
131
132 sycl::device device{ sycl::gpu_selector_v };
133 Int64 mcu = device.get_info<sycl::info::device::max_compute_units>();
134 Int64 mwg = device.get_info<sycl::info::device::max_work_group_size>();
135 std::cout << "DEVICE mcu=" << mcu << " mwg=" << mwg << "\n";
136 sycl::queue q{ device };
137
141 NumArray<Int32, MDDim1> atomic_counter(eMemoryRessource::Device);
142 atomic_counter.resize(1);
143
145 for (int i = 0; i < N; i++) {
146 data_to_reduce[i] = i;
148 }
149
151 {
152 q.single_task([=]() {
153 out_atomic_counter[0] = 0;
154 });
155 }
156
157 const int nb_iter = 1;
158 for (Int32 iter = 0; iter < nb_iter; ++iter) {
163 q.parallel_for(sycl::nd_range<1>(N, BLOCK_SIZE), [=](sycl::nd_item<1> id) {
164 Int32 i = static_cast<Int32>(id.get_global_id());
165 const Int32 global_id = static_cast<Int32>(id.get_global_id(0));
166 const Int32 local_id = static_cast<Int32>(id.get_local_id(0));
167 const Int32 group_id = static_cast<Int32>(id.get_group_linear_id());
168 const Int32 sub_group_id = static_cast<Int32>(id.get_sub_group().get_local_id());
169 Int32 nb_block = static_cast<Int32>(id.get_group_range(0));
170 //Int32 nb_thread = static_cast<Int32>(id.get_local_range(0));
171 inout_data1(i, 0) = global_id;
172 inout_data1(i, 1) = local_id;
173 inout_data1(i, 2) = group_id;
175 inout_data1(i, 5) = 0;
177 Int32 local_sum = 0;
178 bool is_last = false;
179 id.barrier(sycl::access::fence_space::local_space);
180 //Int32 v2_bis = id.get_sub_group().shuffle_down(v,1);
181 Int32 vx = sycl::reduce_over_group(id.get_group(),v,sycl::plus<Int32>{});
182 inout_data1(i, 0) = vx;
183 if (local_id == 0) {
184 //Int32 base = global_id;
185 //for (Int32 x = 0; x < nb_thread; ++x)
186 //local_sum += in_data_to_reduce[x + base];
187 local_sum = vx;
188 inout_data1(i, 4) = local_sum;
190 sycl::atomic_ref<Int32, sycl::memory_order::relaxed, sycl::memory_scope::device> a(*atomic_counter_ptr);
191 Int32 cx = a.fetch_add(1);
192 inout_data1(i, 5) = cx;
193 if (cx == (nb_block - 1))
194 is_last = true;
195 }
196 id.barrier(sycl::access::fence_space::local_space);
197 // Je suis le dernier à faire la réduction.
198 // Calcule la réduction finale
199 if (is_last) {
200 Int64 my_total = 0;
201 for (int x = 0; x < nb_block; ++x)
203 // Met le résultat final dans le premier élément du tableau.
206 }
207 })
208 .wait();
209 }
211 std::cout << "N=" << N << " REF_TOTAL=" << ref_total_reduce << " computed=" << kernel_total << "\n";
212 bool do_verbose = true;
213 if (do_verbose) {
214 for (int i = 0; i < N; i++) {
215 Int32 imod = i % 32;
216 if (imod < 2)
217 std::cout << "I=" << i << " global_id=" << data1(i, 0)
218 << " local_id=" << data1(i, 1)
219 << " group_id=" << data1(i, 2)
220 << " sub_group_local_id=" << data1(i, 3)
221 << " v=" << data1(i, 4)
222 << std::endl;
223 }
224 }
225 std::cout << "FINAL_N=" << N << " REF_TOTAL=" << ref_total_reduce << " computed=" << kernel_total << "\n";
226 return 0;
227}
228
229extern "C" void arcaneTestSycl5()
230{
231 Runner runner(eExecutionPolicy::SYCL);
232 RunQueue queue{ makeQueue(runner) };
233 constexpr int N = 25;
234
236
237 for (int i = 0; i < N; i++)
238 data[i] = i;
239
240 {
241 auto command = makeCommand(queue);
242 Span<Int32> inout_data(data.to1DSpan());
244 command << RUNCOMMAND_LOOP1(iter, N)
245 {
246 auto [i] = iter();
247 reducer1.add(inout_data[i]);
248 inout_data[i] *= 4;
249 };
250 }
251}
Types et fonctions pour gérer les synchronisations sur les accélérateurs.
Types et macros pour gérer les boucles sur les accélérateurs.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
Boucle sur accélérateur avec arguments supplémentaires pour les réductions.
File d'exécution pour un accélérateur.
Gestionnaire d'exécution pour accélérateur.
Definition core/Runner.h:68
Lecteur des fichiers de maillage via la bibliothèque LIMA.
Definition Lima.cc:149
Espace de nom pour l'utilisation des accélérateurs.
RunCommand makeCommand(const RunQueue &run_queue)
Créé une commande associée à la file run_queue.
RunQueue makeQueue(const Runner &runner)
Créé une file associée à runner.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
Espace de nom de Arccore.
Definition ArcaneTypes.h:24