Alien  1.3.0
Developer documentation
Loading...
Searching...
No Matches
SYCLKernelInternal.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7
8#pragma once
9
10#include <alien/kernels/sycl/SYCLPrecomp.h>
11
12#ifdef USE_SYCL2020
13#include <sycl/sycl.hpp>
14#else
15#include <CL/sycl.hpp>
16#endif
17/*---------------------------------------------------------------------------*/
18
19namespace Alien::SYCLInternal
20{
21
22#ifndef USE_SYCL2020
23 using namespace cl ;
24#endif
25/*---------------------------------------------------------------------------*/
26namespace mi300 {
27
28static constexpr int WAVEFRONT = 64; // WF natif gfx942
29static constexpr int N_WARPS_BLOC = WG_SIZE / WAVEFRONT; // 4
30
31 template<typename ValueT, typename ReadAccessorT, typename ReadWriteAccessorT>
32 void dot_kernel_mi300(
33 sycl::nd_item<1> item,
34 ReadAccessorT x,
35 ReadAccessorT y,
36 ReadWriteAccessorT partials,
37 size_t N,
38 sycl::local_accessor<ValueT,1> lds)
39 {
40 const size_t gid = item.get_global_id(0);
41 const size_t stride = item.get_global_range(0);
42 const size_t lid = item.get_local_id(0);
43
44 // ── Phase 1a : accumulation avec unroll ───────────────────────────────
45 // Sur MI300, le backend HIP émet des GLOBAL_LOAD_DWORDX2 (64-bit loads).
46 // ITEMS_PER_WI=8 → 8 loads double indépendants = ILP pour masquer ~400 cy HBM3.
47 // La boucle strided garantit la coalescence : WI consécutifs (gid, gid+1, ...)
48 // accèdent à des adresses consécutives → 1 transaction mémoire pour 64 WI.
49 ValueT acc = 0;
50
51 const size_t stride_wi = stride * ITEMS_PER_WI;
52 size_t i = gid * ITEMS_PER_WI;
53
54 for (; i + ITEMS_PER_WI * stride <= N; i += stride_wi) {
55 // Unroll 8× — le compilateur AMDGCNx émet des v_fma_f64 indépendants
56 // → jusqu'à 2 FMA doubles/cycle sur les SIMD gfx942
57 #pragma unroll
58 for (size_t k = 0; k < ITEMS_PER_WI; ++k)
59 acc += x[i + k * stride] * y[i + k * stride];
60 }
61 // Epilogue
62 for (size_t j = gid; j < N; j += stride)
63 acc += x[j] * y[j];
64
65 // ── Phase 1b : réduction wavefront (sub_group) ────────────────────────
66 // Sur gfx942, reduce_over_group<plus<double>> se compile en :
67 // ds_swizzle_b32 / v_readlane_b32 / v_writelane_b32
68 // soit 6 niveaux de shuffle pour 64 threads — zéro accès LDS ici.
69 // IMPORTANT : sub_group_size(64) est annoté sur le kernel pour forcer
70 // l'émission de la séquence WF=64 (vs WF=32 qui nécessiterait 5 niveaux
71 // mais casserait la coalescence des loads précédents).
72 auto sg = item.get_sub_group();
73 acc = reduce_over_group(sg, acc, sycl::plus<ValueT>{});
74
75 // ── Phase 1c : réduction inter-wavefront via LDS ──────────────────────
76 // 4 wavefronts → 4 valeurs LDS = 32 bytes (tient dans 1 cache line LDS)
77 const size_t wf_id = lid / WAVEFRONT;
78 const size_t lane_id = lid % WAVEFRONT;
79
80 if (lane_id == 0)
81 lds[wf_id] = acc;
82
83 item.barrier(sycl::access::fence_space::local_space);
84
85 // Le premier wavefront charge les 4 partiels LDS et les réduit
86 if (wf_id == 0) {
87 // Lane 0..3 chargent les 4 partiels, lanes 4..63 chargent 0
88 ValueT val = (lid < N_WARPS_BLOC) ? lds[lid] : 0.0;
89 // Réduction sur le sous-groupe — sur les 64 lanes du WF0
90 // seuls les 4 premiers portent des données, le reste est neutre (0.0)
91 val = sycl::reduce_over_group(sg, val, sycl::plus<ValueT>{});
92 if (lane_id == 0)
93 partials[item.get_group(0)] = val;
94 }
95 }
96
97 // ---------------------------------------------------------------------------
98 // Variante avec double buffering explicite — masque davantage la latence HBM3
99 // Recommandée pour très grands N (> 500 M doubles) où la latence domine.
100 // ---------------------------------------------------------------------------
101 template<typename ValueT, typename ReadAccessorT, typename ReadWriteAccessorT>
102 void dot_kernel_mi300_doublebuf(
103 sycl::nd_item<1> item,
104 ReadAccessorT x,
105 ReadAccessorT y,
106 ReadWriteAccessorT partials,
107 size_t N,
108 sycl::local_accessor<ValueT,1> lds)
109 {
110 const size_t gid = item.get_global_id(0);
111 const size_t stride = item.get_global_range(0);
112 const size_t lid = item.get_local_id(0);
113
114 ValueT acc0 = 0, acc1 = 0; // Deux accumulateurs → 2 chaînes FMA indép.
115
116 // Double-stride : deux éléments par itération sur des adresses distantes
117 // → deux séquences de loads indépendantes dans le scoreboard
118 for (size_t i = gid; i + stride < N; i += 2 * stride) {
119 ValueT x0 = x[i], y0 = y[i];
120 ValueT x1 = x[i + stride], y1 = y[i + stride];
121 acc0 += x0 * y0;
122 acc1 += x1 * y1;
123 }
124 // Fusionner + epilogue
125 ValueT acc = acc0 + acc1;
126 for (size_t i = gid + ((N / (2*stride)) * 2 * stride); i < N; i += stride)
127 acc += x[i] * y[i];
128
129 auto sg = item.get_sub_group();
130 acc = sycl::reduce_over_group(sg, acc, sycl::plus<ValueT>{});
131
132 const size_t wf_id = lid / WAVEFRONT;
133 const size_t lane_id = lid % WAVEFRONT;
134 if (lane_id == 0) lds[wf_id] = acc;
135 item.barrier(sycl::access::fence_space::local_space);
136 if (wf_id == 0) {
137 ValueT val = (lid < N_WARPS_BLOC) ? lds[lid] : 0.0;
138 val = reduce_over_group(sg, val, sycl::plus<double>{});
139 if (lane_id == 0)
140 partials[item.get_group(0)] = val;
141 }
142 }
143}
144
145
146namespace h100 {
147
148template<typename ValueT, typename ReadAccessorT, typename ReadWriteAccessorT>
149void dot_kernel_h100(
150 sycl::nd_item<1> item,
151 ReadAccessorT x,
152 ReadAccessorT y,
153 ReadWriteAccessorT partials, // USM device — 1 double par bloc
154 std::size_t N,
155 sycl::local_accessor<ValueT,1> lds) // LDS : WG_SIZE/WARP_SIZE doubles
156{
157 using namespace std;
158 using namespace sycl;
159 const size_t gid = item.get_global_id(0);
160 const size_t stride = item.get_global_range(0);
161 const size_t lid = item.get_local_id(0);
162
163 // ── Phase 1a : accumulation locale avec unroll ─────────────────────────
164 // Chaque WI traite ITEMS_PER_WI éléments strided pour coalescence.
165 // Le compilateur NVPTX génère des LDG.128 (128-bit loads) si aligné.
166 double acc = 0.0;
167
168 // Boucle principale — stride de global_range pour couvrir tout N
169 size_t i = gid * ITEMS_PER_WI;
170 const size_t stride_wi = stride * ITEMS_PER_WI;
171
172 for (; i + ITEMS_PER_WI * stride <= N; i += stride_wi) {
173 // Unroll explicite — 8 FMA indépendants → ILP maximal
174 // Le compilateur H100 peut émettre 4 FMA doubles/cycle/SM
175 #pragma unroll
176 for (size_t k = 0; k < ITEMS_PER_WI; ++k)
177 acc += x[i + k * stride] * y[i + k * stride];
178 }
179 // Epilogue — éléments restants
180 for (size_t j = gid; j < N; j += stride)
181 acc += x[j] * y[j];
182
183 // ── Phase 1b : réduction warp-shuffle (sub_group) ─────────────────────
184 // sycl::reduce_over_group utilise les shuffles natifs PTX (__shfl_down)
185 // sur H100 — 5 niveaux pour 32 threads, zéro accès mémoire.
186 auto sg = item.get_sub_group();
187 acc = sycl::reduce_over_group(sg, acc, sycl::plus<ValueT>{});
188
189 // ── Phase 1c : réduction inter-warp via LDS ───────────────────────────
190 // Un seul thread leader par warp écrit en LDS.
191 // LDS = WG_SIZE/WARP_SIZE = 16 doubles = 128 bytes — tient dans 1 cache line
192 const size_t warp_id = lid / WARP_SIZE;
193 const size_t lane_id = lid % WARP_SIZE;
194 const size_t n_warps = WG_SIZE / WARP_SIZE; // 16
195
196 if (lane_id == 0)
197 lds[warp_id] = acc;
198
199 item.barrier(access::fence_space::local_space);
200
201 // Réduction finale des 16 valeurs LDS par le premier warp
202 if (warp_id == 0) {
203 ValueT val = (lid < n_warps) ? lds[lid] : 0.0;
204 val = reduce_over_group(sg, val, sycl::plus<ValueT>{});
205 if (lane_id == 0)
206 partials[item.get_group(0)] = val;
207 }
208}
209}
210
211template <typename T>
212class Future
213{
214 public:
215 using FunctionType = std::function<T(sycl::event&,sycl::buffer<T>&,std::size_t)>;
216 Future(T& value)
217 : m_value(value)
218 //, m_d_value{1}
219 , m_d_value{ SYCLEnv::instance()->maxNumGroups() * TARGET_WAVES }
220 {
221 }
222
223 T& operator()()
224 {
225 return m_value;
226 }
227
228 T operator()() const
229 {
230 return m_value;
231 }
232
233 T get()
234 {
235 if (m_parallel_mng)
236 {
237 Arccore::MessagePassing::mpWait(m_parallel_mng, m_request);
238 m_parallel_mng = nullptr;
239 }
240 else
241 {
242 if(m_wait_function)
243 {
244 m_value = m_wait_function(m_event,m_d_value,m_num_blocks) ;
245 m_wait_function = nullptr ;
246 }
247 }
248 return m_value;
249 }
250
251 sycl::buffer<T, 1>& deviceValue()
252 {
253 return m_d_value;
254 }
255
256 sycl::event& event() {
257 return m_event;
258 }
259
260
261 void addRequest(Arccore::MessagePassing::IMessagePassingMng* parallel_mng,
262 Arccore::MessagePassing::Request request)
263 {
264 m_parallel_mng = parallel_mng;
265 m_request = request;
266 }
267
268 void setWaitFunction(FunctionType wait_function)
269 {
270 m_wait_function = wait_function;
271 }
272
273 void setNumBlocks(std::size_t num_blocks)
274 {
275 m_num_blocks = num_blocks;
276 }
277
278 private:
279 T& m_value;
280 sycl::buffer<T, 1> m_d_value;
281 sycl::event m_event;
282 FunctionType m_wait_function = nullptr;
283 std::size_t m_num_blocks = SYCLEnv::instance()->maxNumGroups() * TARGET_WAVES ;
284
285 Arccore::MessagePassing::IMessagePassingMng* m_parallel_mng = nullptr;
286 Arccore::MessagePassing::Request m_request;
287};
288
289class KernelInternal
290{
291 private:
292 int m_dot_algo = 3;
293
294 public:
295 KernelInternal()
296 {
297 m_env = SYCLEnv::instance();
298
299 // clang-format off
300 m_max_num_groups = m_env->maxNumGroups() ;
301 m_max_work_group_size = m_env->maxWorkGroupSize() ;
302 m_total_threads = m_env->maxNumThreads() ;
303 // clang-format on
304 }
305
306 virtual ~KernelInternal() {}
307
308 void setDotAlgo(int dot_algo)
309 {
310 m_dot_algo = dot_algo;
311 }
312
313 template <typename T>
314 void assign(T const a,
315 sycl::buffer<T>& y)
316 {
317 m_env->internal()->queue().submit(
318 [&](sycl::handler& cgh)
319 {
320 auto acc = y.template get_access<sycl::access::mode::discard_write>(cgh);
321 cgh.fill(acc, a);
322 });
323 /*
324 sycl::range<1> work_items{ m_total_threads };
325 {
326 // clang-format off
327 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
328 {
329 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
330 auto y_length = y.size() ;
331 cgh.parallel_for<class vector_assign>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
332 {
333 auto id = itemId.get_id(0);
334 for (auto i = id; i < y_length; i += itemId.get_range()[0])
335 access_y[i] = a;
336 });
337 });
338 // clang-format on
339 }*/
340 }
341
342 template <typename T, typename Lambda>
343 void apply(Lambda const& lambda,
344 sycl::buffer<T>& y)
345 {
346 sycl::range<1> work_items{ m_total_threads };
347 {
348 // clang-format off
349 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
350 {
351 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
352 auto y_length = y.size() ;
353 cgh.parallel_for<class vector_apply>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
354 {
355 auto id = itemId.get_id(0);
356 for (auto i = id; i < y_length; i += itemId.get_range()[0])
357 access_y[i] = lambda(i);
358 });
359 });
360 // clang-format on
361 }
362 }
363
364#ifdef NAIVE
365 template <typename T>
366 void scal(T a,
367 sycl::buffer<T>& y)
368 {
369 sycl::range<1> work_items{ m_total_threads };
370 {
371 // clang-format off
372 m_env->internal()->queue().submit([&](sycl::handler& cgh)
373 {
374 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
375 auto y_length = y.size() ;
376 cgh.parallel_for<class vector_scal>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
377 {
378 auto id = itemId.get_id(0);
379 for (auto i = id; i < y_length; i += itemId.get_range()[0])
380 access_y[i] = a*access_y[i];
381 });
382 });
383 // clang-format on
384 }
385 }
386
387 template <typename T>
388 void axpy(T const a,
389 sycl::buffer<T>& x,
390 sycl::buffer<T>& y)
391 {
392 sycl::range<1> work_items{ m_total_threads };
393 {
394 // clang-format off
395 m_env->internal()->queue().submit([&](sycl::handler& cgh)
396 {
397 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
398 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
399 auto y_length = y.size() ;
400 cgh.parallel_for<class vector_axpy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
401 {
402 auto id = itemId.get_id(0);
403 for (auto i = id; i < y_length; i += itemId.get_range()[0])
404 access_y[i] += a * access_x[i];
405 });
406 });
407 // clang-format on
408 }
409 }
410#endif
411 template <typename T>
412 void scal(T a,
413 sycl::buffer<T>& y)
414 {
415 using VecT = sycl::vec<T, 2>;
416
417 auto& queue = m_env->internal()->queue();
418 const size_t n = y.size();
419 const size_t n2 = n / 2; // éléments traités vectoriellement
420 const size_t tail = n % 2;
421
422 static constexpr size_t WG_SIZE = 256;
423 const size_t blocks = std::max((n2 + WG_SIZE - 1) / WG_SIZE,
424 m_max_num_groups * 4UL);
425 const size_t total = blocks * WG_SIZE;
426
427 // ── Reinterprétation des buffers en vec<T,2> ──────────────────────────
428 // Requiert que les buffers soient alignés 16 bytes (garantit par
429 // sycl::buffer qui aligne sur max_required_work_group_size).
430 sycl::buffer<VecT> y2{ y.template reinterpret<VecT>(sycl::range<1>{n2}) };
431
432 queue.submit([&](sycl::handler& cgh) {
433 auto ay = y2.template get_access<sycl::access::mode::read_write>(cgh);
434
435 cgh.parallel_for<class vector_xcal_vec>(
436 sycl::nd_range<1>{ {total}, {WG_SIZE} },
437 [=](sycl::nd_item<1> item) {
438 const size_t stride = item.get_global_range()[0];
439 for (size_t i = item.get_global_id(0); i < n2; i += stride) {
440 // Un seul load 128-bit pour x, un pour y
441 ay[i] = VecT{a, a} * ay[i];
442 }
443 });
444 });
445
446 // Epilogue scalaire si n est impair (rare en pratique)
447 if (tail) {
448 sycl::buffer<T> yt{ y.template reinterpret<T>(sycl::range<1>{n}) };
449 queue.submit([&](sycl::handler& cgh) {
450 auto ay = yt.template get_access<sycl::access::mode::read_write>(cgh);
451 cgh.single_task([=]{ ay[n-1] = a * ay[n-1]; });
452 });
453 }
454
455 }
456
457 template <typename T>
458 void axpy(T const a,
459 sycl::buffer<T>& x,
460 sycl::buffer<T>& y)
461 {
462 using VecT = sycl::vec<T, 2>;
463
464 auto& queue = m_env->internal()->queue();
465 const size_t n = y.size();
466 const size_t n2 = n / 2; // éléments traités vectoriellement
467 const size_t tail = n % 2;
468
469 static constexpr size_t WG_SIZE = 256;
470 const size_t blocks = std::max((n2 + WG_SIZE - 1) / WG_SIZE,
471 m_max_num_groups * 4UL);
472 const size_t total = blocks * WG_SIZE;
473
474 // ── Reinterprétation des buffers en vec<T,2> ──────────────────────────
475 // Requiert que les buffers soient alignés 16 bytes (garantit par
476 // sycl::buffer qui aligne sur max_required_work_group_size).
477 sycl::buffer<VecT> x2{ x.template reinterpret<VecT>(sycl::range<1>{n2}) };
478 sycl::buffer<VecT> y2{ y.template reinterpret<VecT>(sycl::range<1>{n2}) };
479
480 queue.submit([&](sycl::handler& cgh) {
481 auto ax = x2.template get_access<sycl::access::mode::read>(cgh);
482 auto ay = y2.template get_access<sycl::access::mode::read_write>(cgh);
483
484 cgh.parallel_for<class vector_axpy_vec>(
485 sycl::nd_range<1>{ {total}, {WG_SIZE} },
486 [=](sycl::nd_item<1> item) {
487 const size_t stride = item.get_global_range()[0];
488 for (size_t i = item.get_global_id(0); i < n2; i += stride) {
489 // Un seul load 128-bit pour x, un pour y
490 ay[i] = ay[i] + VecT{a, a} * ax[i]; // fused multiply-add
491 }
492 });
493 });
494
495 // Epilogue scalaire si n est impair (rare en pratique)
496 if (tail) {
497 sycl::buffer<T> xt{ x.template reinterpret<T>(sycl::range<1>{n}) };
498 sycl::buffer<T> yt{ y.template reinterpret<T>(sycl::range<1>{n}) };
499 queue.submit([&](sycl::handler& cgh) {
500 auto ax = xt.template get_access<sycl::access::mode::read>(cgh);
501 auto ay = yt.template get_access<sycl::access::mode::read_write>(cgh);
502 cgh.single_task([=]{ ay[n-1] += a * ax[n-1]; });
503 });
504 }
505 }
506
507 template <typename T>
508 void axpy(T const a,
509 sycl::buffer<T>& x,
510 Integer stride_x,
511 sycl::buffer<T>& y,
512 Integer stride_y)
513 {
514 sycl::range<1> work_items{ m_total_threads };
515 {
516 // clang-format off
517 m_env->internal()->queue().submit([&](sycl::handler& cgh)
518 {
519 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
520 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
521 auto x_length = x.size()/stride_x ;
522 cgh.parallel_for<class vector_axpy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
523 {
524 auto id = itemId.get_id(0);
525 for (auto i = id; i < x_length; i += itemId.get_range()[0])
526 access_y[i*stride_y] += a * access_x[i*stride_x];
527 });
528 });
529 // clang-format on
530 }
531 }
532 template <typename T>
533 void pointwiseMult(sycl::buffer<T>& x,
534 sycl::buffer<T>& y,
535 sycl::buffer<T>& z)
536 {
537#ifdef NAIVE
538 sycl::range<1> work_items{ m_total_threads };
539 {
540 // clang-format off
541 m_env->internal()->queue().submit([&](sycl::handler& cgh)
542 {
543 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
544 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
545 auto access_z = z.template get_access<sycl::access::mode::read_write>(cgh);
546 auto y_length = y.size() ;
547 cgh.parallel_for<class vector_pointwizemult>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
548 {
549 auto id = itemId.get_id(0);
550 for (auto i = id; i < y_length; i += itemId.get_range()[0])
551 access_z[i] = access_x[i] * access_y[i];
552 });
553 });
554 // clang-format on
555 }
556#endif
557 using VecT = sycl::vec<T, 2>;
558
559 auto& queue = m_env->internal()->queue();
560 const size_t n = y.size();
561 const size_t n2 = n / 2; // éléments traités vectoriellement
562 const size_t tail = n % 2;
563
564 static constexpr size_t WG_SIZE = 256;
565 const size_t blocks = std::max((n2 + WG_SIZE - 1) / WG_SIZE,
566 m_max_num_groups * 4UL);
567 const size_t total = blocks * WG_SIZE;
568
569 // ── Reinterprétation des buffers en vec<T,2> ──────────────────────────
570 // Requiert que les buffers soient alignés 16 bytes (garantit par
571 // sycl::buffer qui aligne sur max_required_work_group_size).
572 sycl::buffer<VecT> x2{ x.template reinterpret<VecT>(sycl::range<1>{n2}) };
573 sycl::buffer<VecT> y2{ y.template reinterpret<VecT>(sycl::range<1>{n2}) };
574 sycl::buffer<VecT> z2{ z.template reinterpret<VecT>(sycl::range<1>{n2}) };
575
576 queue.submit([&](sycl::handler& cgh) {
577 auto ax = x2.template get_access<sycl::access::mode::read>(cgh);
578 auto ay = y2.template get_access<sycl::access::mode::read>(cgh);
579 auto az = z2.template get_access<sycl::access::mode::read_write>(cgh);
580
581 cgh.parallel_for<class vector_pointwizemult>(
582 sycl::nd_range<1>{ {total}, {WG_SIZE} },
583 [=](sycl::nd_item<1> item) {
584 const size_t stride = item.get_global_range()[0];
585 for (size_t i = item.get_global_id(0); i < n2; i += stride) {
586 // Un seul load 128-bit pour x, un pour y
587 az[i] = ax[i] * ay[i]; // fused multiply-add
588 }
589 });
590 });
591
592 // Epilogue scalaire si n est impair (rare en pratique)
593 if (tail) {
594 sycl::buffer<T> xt{ x.template reinterpret<T>(sycl::range<1>{n}) };
595 sycl::buffer<T> yt{ y.template reinterpret<T>(sycl::range<1>{n}) };
596 sycl::buffer<T> zt{ z.template reinterpret<T>(sycl::range<1>{n}) };
597 queue.submit([&](sycl::handler& cgh) {
598 auto ax = xt.template get_access<sycl::access::mode::read>(cgh);
599 auto ay = yt.template get_access<sycl::access::mode::read>(cgh);
600 auto az = zt.template get_access<sycl::access::mode::read_write>(cgh);
601 cgh.single_task([=]{ az[n-1] = ax[n-1] * ay[n-1]; });
602 });
603 }
604
605#ifdef PRINT_DEBUG_INFO
606 {
607 sycl::host_accessor<T, 1, sycl::access::mode::read> x_acc(x);
608 sycl::host_accessor<T, 1, sycl::access::mode::read> y_acc(y);
609 sycl::host_accessor<T, 1, sycl::access::mode::read> z_acc(z);
610 for(int il=0;il<x.size();++il)
611 {
612 std::cout<<"X Y Z ["<<il<<"] : "<<x_acc[il]<<"*"<<y_acc[il]<<"="<<z_acc[il]<<std::endl ;
613 }
614 }
615#endif
616 }
617
618 template <typename T>
619 void copy(sycl::buffer<T>& x,
620 sycl::buffer<T>& y)
621 {
622 //sycl::range<1> work_items{ m_total_threads };
623 {
624 // clang-format off
625 m_env->internal()->queue().submit(
626 [&](sycl::handler& cgh)
627 {
628 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
629 auto access_y = y.template get_access<sycl::access::mode::discard_write>(cgh);
630 cgh.copy(access_x,access_y) ;
631 /*
632 auto y_length = y.size() ;
633 cgh.parallel_for<class vector_copy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
634 {
635 auto id = itemId.get_id(0);
636 for (auto i = id; i < y_length; i += itemId.get_range()[0])
637 access_y[i] = access_x[i];
638 });
639 */
640 });
641 // clang-format on
642 }
643 }
644
645 template <typename T>
646 void copy(sycl::buffer<T>& x,
647 Integer stride_x,
648 sycl::buffer<T>& y,
649 Integer stride_y)
650 {
651 sycl::range<1> work_items{ m_total_threads };
652 {
653 // clang-format off
654 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
655 {
656 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
657 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
658 auto x_length = x.size()/stride_x ;
659 cgh.parallel_for<class vector_copy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
660 {
661 auto id = itemId.get_id(0);
662 for (auto i = id; i < x_length; i += itemId.get_range()[0])
663 access_y[i*stride_y] = access_x[i*stride_x];
664 });
665 });
666 // clang-format on
667 }
668 }
669
670 template <typename T>
672 {
673 };
674
675 // to make global size multiple of local size
676 template <typename index_t>
677 inline index_t round_up(const index_t x, const index_t y)
678 {
679 return ((x + y - 1) / y) * y;
680 }
681
682 template <typename T>
684
685 template <typename T>
686 T reduce_sum(sycl::buffer<T>& x,
687 sycl::buffer<T>& y)
688 {
689
690 auto& w = getWorkBuffer<T>(x.size());
691
692 // clang-format off
693 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
694 {
695 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
696 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
697 //auto access_w = w.template get_access<sycl::access::mode::write>(cgh);
698 auto access_w = sycl::accessor { w, cgh, sycl::write_only, sycl::property::no_init{}};
699 //auto access_w = w.get_access(cgh,sycl::write_only, sycl::property::no_init{});
700 auto y_length = y.size() ;
701 cgh.parallel_for<class vector_dot>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
702 {
703 auto id = itemId.get_id(0);
704 for (auto i = id; i < y_length; i += itemId.get_range()[0])
705 access_w[i] = access_x[i]*access_y[i];
706 });
707 });
708 // clang-format on
709
710 std::size_t local = m_max_work_group_size;
711 std::size_t length = x.size();
712
713 int level = 0;
714 {
715 /* Each iteration of the do loop applies one level of reduction until
716 * the input is of length 1 (i.e. the reduction is complete). */
717 do {
718 auto round_length = round_up(length, local);
719 // clang-format off
720 auto f = [length, round_length, local, &w](sycl::handler& h) mutable
721 {
722 sycl::nd_range<1> range{sycl::range<1>{round_length},
723 sycl::range<1>{local}};
724 auto access_w = w.template get_access<sycl::access::mode::read_write>(h);
725
726 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
727 // scratch(sycl::range<1>(local), h);
728 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
729
730 /* The parallel_for invocation chosen is the variant with an nd_item
731 * parameter, since the code requires barriers for correctness. */
732 h.parallel_for<class sycl_reduction_sum_T>(range,
733 [access_w, scratch, local, length](sycl::nd_item<1> id)
734 {
735 std::size_t globalid = id.get_global_id(0);
736 std::size_t localid = id.get_local_id(0);
737
738 /* All threads collectively read from global memory into local.
739 * The barrier ensures all threads' IO is resolved before
740 * execution continues (strictly speaking, all threads within
741 * a single work-group - there is no co-ordination between
742 * work-groups, only work-items). */
743 scratch[localid] = (globalid < length)? access_w[globalid] : 0. ;
744 id.barrier(sycl::access::fence_space::local_space);
745
746 /* Apply the reduction operation between the current local
747 * id and the one on the other half of the vector. */
748 if (globalid < length)
749 {
750 //int min = (length < local) ? length : local;
751 std::size_t min = local ;
752 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
753 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
754 {
755 if (localid < offset)
756 {
757 scratch[localid] += scratch[localid + offset];
758 }
759 id.barrier(sycl::access::fence_space::local_space);
760 }
761 /* The final result will be stored in local id 0. */
762 if (localid == 0)
763 {
764 access_w[id.get_group(0)] = scratch[localid];
765 }
766 }
767 });
768 };
769 // clang-format on
770 m_env->internal()->queue().submit(f);
771 /* At this point, you could queue::wait_and_throw() to ensure that
772 * errors are caught quickly. However, this would likely impact
773 * performance negatively. */
774 length = (length + local - 1) / local;
775 ++level;
776 } while (length > 1);
777 }
778 //auto h_w = w.template get_access<sycl::access::mode::read>();
779 auto h_w = w.get_host_access();
780 T sum = h_w[0];
781 return sum;
782 }
783
784 template <typename T>
786
787 template <typename T>
789
790 template <typename T>
791 T map_reduce_sum(sycl::buffer<T>& x,
792 sycl::buffer<T>& y)
793 {
794 auto& w = getWorkBuffer<T>(x.size());
795
796 std::size_t local = m_max_work_group_size;
797 std::size_t length = x.size();
798
799 int level = 0;
800 {
801 /* Each iteration of the do loop applies one level of reduction until
802 * the input is of length 1 (i.e. the reduction is complete). */
803 //do
804 {
805 auto round_length = round_up(length, local);
806 // clang-format off
807 auto f0 = [length, round_length, local, &x,&y, &w](sycl::handler& h) mutable
808 {
809 sycl::nd_range<1> range{sycl::range<1>{round_length},
810 sycl::range<1>{local}};
811 auto access_x = x.template get_access<sycl::access::mode::read>(h);
812 auto access_y = y.template get_access<sycl::access::mode::read>(h);
813 //auto access_w = w.template get_access<sycl::access::mode::read_write>(h);
814 auto access_w = sycl::accessor { w, h, sycl::read_write, sycl::property::no_init{}};
815
816 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
817 // scratch(sycl::range<1>(local), h);
818 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
819
820 /* The parallel_for invocation chosen is the variant with an nd_item
821 * parameter, since the code requires barriers for correctness. */
822 h.parallel_for<class sycl_map_reduction_sum0_T>(range,
823 [access_x,access_y,access_w, scratch, local, length](sycl::nd_item<1> id)
824 {
825 std::size_t globalid = id.get_global_id(0);
826 std::size_t localid = id.get_local_id(0);
827
828 /* All threads collectively read from global memory into local.
829 * The barrier ensures all threads' IO is resolved before
830 * execution continues (strictly speaking, all threads within
831 * a single work-group - there is no co-ordination between
832 * work-groups, only work-items). */
833 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
834
835 id.barrier(sycl::access::fence_space::local_space);
836
837 /* Apply the reduction operation between the current local
838 * id and the one on the other half of the vector. */
839 if (globalid < length)
840 {
841 //int min = (length < local) ? length : local;
842 std::size_t min = local ;
843 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
844 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
845 {
846 if (localid < offset)
847 {
848 scratch[localid] += scratch[localid + offset];
849 }
850 id.barrier(sycl::access::fence_space::local_space);
851 }
852 /* The final result will be stored in local id 0. */
853 if (localid == 0)
854 {
855 access_w[id.get_group(0)] = scratch[localid];
856 }
857 }
858 });
859 };
860 // clang-format on
861
862 // clang-format off
863 auto f1 = [length, round_length, local, &w](sycl::handler& h) mutable
864 {
865 sycl::nd_range<1> range{sycl::range<1>{round_length},
866 sycl::range<1>{local}};
867 auto access_w = w.template get_access<sycl::access::mode::read_write>(h);
868
869 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
870 // scratch(sycl::range<1>(local), h);
871 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
872
873 /* The parallel_for invocation chosen is the variant with an nd_item
874 * parameter, since the code requires barriers for correctness. */
875 h.parallel_for<class sycl_map_reduction_sum_T>(range,
876 [access_w, scratch, local, length](sycl::nd_item<1> id)
877 {
878 std::size_t globalid = id.get_global_id(0);
879 std::size_t localid = id.get_local_id(0);
880
881 /* All threads collectively read from global memory into local.
882 * The barrier ensures all threads' IO is resolved before
883 * execution continues (strictly speaking, all threads within
884 * a single work-group - there is no co-ordination between
885 * work-groups, only work-items). */
886 scratch[localid] = (globalid < length)? access_w[globalid] : 0. ;
887 id.barrier(sycl::access::fence_space::local_space);
888
889 /* Apply the reduction operation between the current local
890 * id and the one on the other half of the vector. */
891 if (globalid < length)
892 {
893 //int min = (length < local) ? length : local;
894 std::size_t min = local ;
895 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
896 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
897 {
898 if (localid < offset)
899 {
900 scratch[localid] += scratch[localid + offset];
901 }
902 id.barrier(sycl::access::fence_space::local_space);
903 }
904 /* The final result will be stored in local id 0. */
905 if (localid == 0)
906 {
907 access_w[id.get_group(0)] = scratch[localid];
908 }
909 }
910 });
911 };
912 // clang-format on
913 if (level == 0)
914 m_env->internal()->queue().submit(f0);
915 else
916 m_env->internal()->queue().submit(f1);
917 /* At this point, you could queue::wait_and_throw() to ensure that
918 * errors are caught quickly. However, this would likely impact
919 * performance negatively. */
920 length = (length + local - 1) / local;
921 ++level;
922 } //while (length > 1);
923 }
924 auto h_x = w.get_host_access();
925 //T sum = h_x[0] ;
926 T sum = 0;
927 for (std::size_t i = 0; i < length; ++i)
928 sum += h_x[i];
929 return sum;
930 }
931
932 template <typename T>
934
935 template <typename T>
936 T map2_reduce_sum(sycl::buffer<T>& x,
937 sycl::buffer<T>& y)
938 {
939 std::size_t local = m_max_work_group_size;
940 std::size_t total_threads = m_total_threads;
941 std::size_t length = x.size();
942
943 T sum_init = 0;
944 sycl::buffer<T> sum{ &sum_init, 1 };
945
946 {
947 /* Each iteration of the do loop applies one level of reduction until
948 * the input is of length 1 (i.e. the reduction is complete). */
949 {
950 auto round_length = round_up(length, local);
951 // clang-format off
952 auto f0 = [length, round_length,total_threads, local, &x, &y, &sum](sycl::handler& h) mutable
953 {
954 sycl::nd_range<1> range{sycl::range<1>{std::min(total_threads,round_length)},
955 sycl::range<1>{local}};
956 auto access_x = x.template get_access<sycl::access::mode::read>(h);
957 auto access_y = y.template get_access<sycl::access::mode::read>(h);
958
959 sycl::accessor access_sum {sum, h};
960#ifdef USE_HIPSYCL
961 auto sumReduction = sycl::reduction(access_sum, sycl::plus<T>());
962#endif
963#ifdef USE_ONEAPI
964 auto sumReduction = sycl::reduction(sum, h, sycl::plus<T>());
965#endif
966#ifdef USE_ACPPSYCL
967 auto sumReduction = sycl::reduction(sum, h, sycl::plus<T>());
968#endif
969
970 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
971 // scratch(sycl::range<1>(local), h);
972 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
973
974 /* The parallel_for invocation chosen is the variant with an nd_item
975 * parameter, since the code requires barriers for correctness. */
976 h.parallel_for<class sycl_map2_reduction_sum0_T>(range,
977 sumReduction,
978 [access_x,access_y, scratch, local,total_threads,length](sycl::nd_item<1> id, auto &sum)
979 {
980 std::size_t globalid = id.get_global_id(0);
981 std::size_t localid = id.get_local_id(0);
982
983 /* All threads collectively read from global memory into local.
984 * The barrier ensures all threads' IO is resolved before
985 * execution continues (strictly speaking, all threads within
986 * a single work-group - there is no co-ordination between
987 * work-groups, only work-items). */
988 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
989 for (auto i = globalid+total_threads; i < length; i += total_threads)
990 scratch[localid] += access_x[i]*access_y[i];
991
992 id.barrier(sycl::access::fence_space::local_space);
993
994 /* Apply the reduction operation between the current local
995 * id and the one on the other half of the vector. */
996 //if (globalid < length)
997 {
998 //int min = (length < local) ? length : local;
999 std::size_t min = local ;
1000 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1001 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
1002 {
1003 if (localid < offset)
1004 {
1005 scratch[localid] += scratch[localid + offset];
1006 }
1007 id.barrier(sycl::access::fence_space::local_space);
1008 }
1009 /* The final result will be stored in local id 0. */
1010 if (localid == 0)
1011 {
1012 //access_w[id.get_group(0)] = scratch[localid];
1013 sum += scratch[localid];
1014 }
1015 }
1016 });
1017 };
1018 // clang-format on
1019 m_env->internal()->queue().submit(f0);
1020 }
1021 }
1022 auto h_sum = sum.get_host_access();
1023 return h_sum[0];
1024 }
1025
1026 template <typename T>
1028
1029 template <typename T>
1030 T map3_reduce_sum(sycl::buffer<T>& x,
1031 sycl::buffer<T>& y)
1032 {
1033 std::size_t local = m_max_work_group_size;
1034 std::size_t total_threads = m_total_threads;
1035 std::size_t length = x.size();
1036
1037 //std::vector<T> group_sum(m_max_num_groups) ;
1038 //sycl::buffer<T> group_sum{ m_max_num_groups };
1039 auto& group_sum = getWorkBuffer<T>(m_max_num_groups);
1040
1041 //T sum_init = 0;
1042 //sycl::buffer<T> sum{ &sum_init, 1 };
1043
1044 auto round_length = round_up(length, local);
1045 {
1046 /* Each iteration of the do loop applies one level of reduction until
1047 * the input is of length 1 (i.e. the reduction is complete). */
1048 //do
1049 {
1050 // clang-format off
1051 auto f0 = [total_threads,round_length, length, local, &x, &y,&group_sum](sycl::handler& h) mutable
1052 {
1053 sycl::nd_range<1> range{sycl::range<1>{std::min(total_threads,round_length)},
1054 sycl::range<1>{local}};
1055 auto access_x = x.template get_access<sycl::access::mode::read>(h);
1056 auto access_y = y.template get_access<sycl::access::mode::read>(h);
1057
1058 auto access_sum = sycl::accessor { group_sum, h, sycl::read_write, sycl::property::no_init{}};
1059
1060 //sycl::accessor access_sum {sum, h};
1061 //auto sumReduction = sycl::reduction(access_sum, sycl::plus<T>());
1062
1063 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
1064 // scratch(sycl::range<1>(local), h);
1065 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1066
1067 /* The parallel_for invocation chosen is the variant with an nd_item
1068 * parameter, since the code requires barriers for correctness. */
1069
1070 h.parallel_for<class sycl_map3_reduction_sum0_T>(range,
1071 //sumReduction,
1072 //[access_x,access_y,scratch,local,length,total_threads] (sycl::nd_item<1> id, auto& sum)
1073 [access_x,access_y,access_sum,scratch,local,length,total_threads] (sycl::nd_item<1> id)
1074 {
1075 std::size_t globalid = id.get_global_id(0);
1076 std::size_t localid = id.get_local_id(0);
1077
1078 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
1079
1080 for (auto i = globalid+total_threads; i < length; i += total_threads)
1081 scratch[localid] += access_x[i]*access_y[i];
1082
1083 id.barrier(sycl::access::fence_space::local_space);
1084
1085 /* Apply the reduction operation between the current local
1086 * id and the one on the other half of the vector. */
1087 {
1088 //int min = (length < local) ? length : local;
1089 std::size_t min = local ;
1090 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1091 {
1092 if (localid < offset)
1093 {
1094 scratch[localid] += scratch[localid + offset];
1095 }
1096 id.barrier(sycl::access::fence_space::local_space);
1097 }
1098 }
1099 /* The final result will be stored in local id 0. */
1100 if (localid == 0)
1101 {
1102 access_sum[id.get_group(0)] = scratch[localid];
1103 //sum += scratch[localid] ;
1104 }
1105 });
1106
1107 };
1108 // clang-format on
1109 m_env->internal()->queue().submit(f0);
1110
1111 } //while (length > 1);
1112 }
1113
1114 //return sum.get_host_access()[0];
1115 auto h_sum = group_sum.get_host_access();
1116 T sum = 0;
1117 for (std::size_t i = 0; i < std::min(total_threads, round_length) / local; ++i)
1118 sum += h_sum[i];
1119 return sum;
1120 }
1121
1122 template <typename T>
1124
1125 template <typename T>
1126 void asynch_map4_reduce_sum(sycl::buffer<T>& x,
1127 sycl::buffer<T>& y,
1128 sycl::buffer<T>& res,
1129 sycl::event& event)
1130 {
1131 std::size_t local = m_max_work_group_size;
1132 std::size_t total_threads = m_total_threads;
1133 std::size_t length = x.size();
1134
1135 //T sum_init = 0 ;
1136 //sycl::buffer<T> sum{&sum_init,1};
1137 {
1138 /* Each iteration of the do loop applies one level of reduction until
1139 * the input is of length 1 (i.e. the reduction is complete). */
1140 //do
1141 {
1142 auto round_length = round_up(length, local);
1143 // clang-format off
1144 auto f0 = [total_threads, round_length, length, local, &x, &y,&res](sycl::handler& h) mutable
1145 {
1146 sycl::nd_range<1> range{sycl::range<1>{std::min(total_threads,round_length)},
1147 sycl::range<1>{local}};
1148 auto access_x = x.template get_access<sycl::access::mode::read>(h);
1149 auto access_y = y.template get_access<sycl::access::mode::read>(h);
1150#ifdef USE_HIPSYCL
1151 sycl::accessor access_sum {res, h};
1152 auto sumReduction = sycl::reduction(access_sum, sycl::plus<T>());
1153#endif
1154#ifdef USE_ONEAPI
1155 auto sumReduction = sycl::reduction(res, h, sycl::plus<T>());
1156#endif
1157#ifdef USE_ACPPSYCL
1158 auto sumReduction = sycl::reduction(res, h, sycl::plus<T>());
1159#endif
1160 //auto access_sum = sycl::accessor<T,0,access::mode::write,access::target::global_buffer>(res, h);
1161 //auto access_sum = sycl::accessor { res, h, sycl::read_write, sycl::property::no_init{}};
1162
1163 //auto sumReduction = sycl::reduction(access_sum,sycl::plus<T>(), sycl::property::reduction::initialize_to_identity);
1164 //auto sumReduction = sycl::reduction(access_sum,sycl::plus<T>());
1165
1166 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
1167 // scratch(sycl::range<1>(local), h);
1168 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1169
1170 /* The parallel_for invocation chosen is the variant with an nd_item
1171 * parameter, since the code requires barriers for correctness. */
1172
1173 h.parallel_for<class map4_reduction_sum_T>(range,
1174 sumReduction,
1175 [access_x,access_y,scratch,local,length,total_threads] (sycl::nd_item<1> id, auto& sum)
1176 {
1177 std::size_t globalid = id.get_global_id(0);
1178 std::size_t localid = id.get_local_id(0);
1179
1180 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
1181
1182 for (auto i = globalid+total_threads; i < length; i += total_threads)
1183 scratch[localid] += access_x[i]*access_y[i];
1184
1185 id.barrier(sycl::access::fence_space::local_space);
1186
1187 /* Apply the reduction operation between the current local
1188 * id and the one on the other half of the vector. */
1189 //if (globalid < length)
1190 {
1191 //int min = (length < local) ? length : local;
1192 std::size_t min = local ;
1193 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1194 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
1195 {
1196 if (localid < offset)
1197 {
1198 scratch[localid] += scratch[localid + offset];
1199 }
1200 id.barrier(sycl::access::fence_space::local_space);
1201 }
1202 }
1203 /* The final result will be stored in local id 0. */
1204 if (localid == 0)
1205 {
1206 //access_sum[id.get_group(0)] = scratch[localid];
1207 sum += scratch[localid] ;
1208 }
1209 });
1210
1211 };
1212 // clang-format on
1213 event = m_env->internal()->queue().submit(f0);
1214 }
1215 }
1216 }
1217
1218 template <typename T>
1219 T end_map4_reduce_sum(sycl::event& event,
1220 sycl::buffer<T>& res,
1221 std::size_t num_blocks)
1222 {
1223 event.wait() ;
1224 auto h_access = res.get_host_access();
1225 return h_access[0];
1226 }
1227
1228 template <typename T>
1230
1231 template <typename T>
1233
1234 template <typename T>
1235 void asynch_map5_reduce_sum(sycl::buffer<T>& x,
1236 sycl::buffer<T>& y,
1237 sycl::buffer<T>& res,
1238 sycl::event& event)
1239 {
1240 std::size_t local = m_max_work_group_size;
1241 std::size_t total_threads = m_total_threads;
1242 std::size_t length = x.size();
1243
1244 int level = 0;
1245 {
1246 /* Each iteration of the do loop applies one level of reduction until
1247 * the input is of length 1 (i.e. the reduction is complete). */
1248 do {
1249 // clang-format off
1250 auto f0 = [total_threads, length, local, &x, &y,&res](sycl::handler& h) mutable
1251 {
1252 sycl::nd_range<1> range{sycl::range<1>{total_threads},
1253 sycl::range<1>{local}};
1254 auto access_x = x.template get_access<sycl::access::mode::read>(h);
1255 auto access_y = y.template get_access<sycl::access::mode::read>(h);
1256 auto access_sum = sycl::accessor { res, h, sycl::read_write, sycl::property::no_init{}};
1257
1258 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
1259 // scratch(sycl::range<1>(local), h);
1260 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1261
1262 /* The parallel_for invocation chosen is the variant with an nd_item
1263 * parameter, since the code requires barriers for correctness. */
1264
1265 h.parallel_for<class map5_reduction_sum_T>(range,
1266 [access_x,access_y,access_sum,scratch,local,length,total_threads] (sycl::nd_item<1> id)
1267 {
1268 std::size_t globalid = id.get_global_id(0);
1269 std::size_t localid = id.get_local_id(0);
1270
1271 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
1272
1273 for (auto i = globalid+total_threads; i < length; i += total_threads)
1274 scratch[localid] += access_x[i]*access_y[i];
1275
1276 id.barrier(sycl::access::fence_space::local_space);
1277
1278 /* Apply the reduction operation between the current local
1279 * id and the one on the other half of the vector. */
1280 if (globalid < length)
1281 {
1282 //int min = (length < local) ? length : local;
1283 for (std::size_t offset = local / 2; offset > 0; offset /= 2)
1284 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
1285 {
1286 if (localid < offset)
1287 {
1288 scratch[localid] += scratch[localid + offset];
1289 }
1290 id.barrier(sycl::access::fence_space::local_space);
1291 }
1292 }
1293 /* The final result will be stored in local id 0. */
1294 if (localid == 0)
1295 {
1296 access_sum[id.get_group(0)] = scratch[localid];
1297 }
1298
1299 });
1300
1301 };
1302 // clang-format on
1303 auto f1 = [length, local, &res](sycl::handler& h) mutable {
1304 sycl::nd_range<1> range{ sycl::range<1>{ local },
1305 sycl::range<1>{ local } };
1306 auto access_sum = res.template get_access<sycl::access::mode::read_write>(h);
1307
1308 //sycl::accessor<T, 1, sycl::access::mode::read_write, sycl::access::target::local>
1309 //scratch(sycl::range<1>(local), h);
1310 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1311
1312 /* The parallel_for invocation chosen is the variant with an nd_item
1313 * parameter, since the code requires barriers for correctness. */
1314 h.parallel_for<class map5_reduction_sum1_T>(range,
1315 [access_sum, scratch, local, length](sycl::nd_item<1> id) {
1316 //auto localid = id.get_id(0);
1317 std::size_t globalid = id.get_global_id(0);
1318 std::size_t localid = id.get_local_id(0);
1319
1320 /* All threads collectively read from global memory into local.
1321 * The barrier ensures all threads' IO is resolved before
1322 * execution continues (strictly speaking, all threads within
1323 * a single work-group - there is no co-ordination between
1324 * work-groups, only work-items). */
1325 scratch[localid] = (localid < length) ? access_sum[localid] : 0.;
1326 id.barrier(sycl::access::fence_space::local_space);
1327
1328 /* Apply the reduction operation between the current local
1329 * id and the one on the other half of the vector. */
1330 if (localid < length) {
1331 //int min = (length < local) ? length : local;
1332 std::size_t min = local;
1333 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1334 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
1335 {
1336 if (localid < offset) {
1337 scratch[localid] += scratch[localid + offset];
1338 }
1339 id.barrier(sycl::access::fence_space::local_space);
1340 }
1341 /* The final result will be stored in local id 0. */
1342 if (localid == 0) {
1343 access_sum[0] = scratch[localid];
1344 }
1345 }
1346 });
1347 };
1348 // clang-format on
1349 if (level == 0)
1350 event = m_env->internal()->queue().submit(f0);
1351 else
1352 event = m_env->internal()->queue().submit(f1);
1353
1354 /* At this point, you could queue::wait_and_throw() to ensure that
1355 * errors are caught quickly. However, this would likely impact
1356 * performance negatively. */
1357 //length = (length + local - 1) / local;
1358 length = (std::min(total_threads, length) + local - 1) / local;
1359 ++level;
1360 } while (length > 1);
1361 }
1362 //auto h_x = res.template get_access<sycl::access::mode::read>();
1363 //T sum = h_x[0] ;
1364 //T sum = 0;
1365 //for (int i = 0; i < length; ++i)
1366 // sum += h_x[i];
1367 //return sum;
1368 }
1369
1370
1371 template <typename T>
1372 T end_map5_reduce_sum(sycl::event& event,
1373 sycl::buffer<T>& res,
1374 std::size_t num_blocks)
1375 {
1376 event.wait() ;
1377 auto h_access = res.get_host_access();
1378 return h_access[0];
1379 }
1380
1381 template <typename T>
1382 T reduce_sum2(const std::vector<T>& x)
1383 {
1384 T value = 0;
1385 //for( auto v : x)
1386 // value += v ;
1387 //return value ;
1388
1389 {
1390 auto num_groups = m_env->internal()->queue().get_device().get_info<sycl::info::device::max_compute_units>();
1391 // getting the maximum work group size per thread
1392 auto work_group_size = m_env->internal()->queue().get_device().get_info<sycl::info::device::max_work_group_size>();
1393 // building the best number of global thread
1394 auto total_threads = num_groups * work_group_size;
1395
1396 //std::cout << "LOCAL =" << work_group_size << std::endl;
1397 //std::cout << "NB GROUPS =" << num_groups << std::endl;
1398 //std::cout << "NB THREADS=" << total_threads << std::endl;
1399
1400 /* The buffer is used to initialise the data on the device, but we don't
1401 * want to copy back and trash it. buffer::set_final_data() tells the
1402 * SYCL runtime where to put the data when the buffer is destroyed; nullptr
1403 * indicates not to copy back. The vector's length is used as the global
1404 * work size (unless that is too large). */
1405 auto device = m_env->internal()->queue().get_device();
1406 //std::size_t local = std::min(x.size(),device.get_info<sycl::info::device::max_work_group_size>());
1407 std::size_t local = device.get_info<sycl::info::device::max_work_group_size>();
1408
1409 std::size_t length = x.size();
1410
1411 sycl::buffer<T, 1> xbuf(x.data(), sycl::range<1>(x.size()));
1412 xbuf.set_final_data(nullptr);
1413
1414 int level = 0;
1415 {
1416 /* Each iteration of the do loop applies one level of reduction until
1417 * the input is of length 1 (i.e. the reduction is complete). */
1418 do {
1419
1420 auto round_length = round_up(length, local);
1421 std::cout << "LENGTH :" << level << " " << length << " " << round_length << " " << local << std::endl;
1422 // clang-format off
1423 auto f = [length,round_length,local, &xbuf](sycl::handler& h) mutable
1424 {
1425 //sycl::nd_range<1> r{sycl::range<1>{std::max(length, local)},
1426 // sycl::range<1>{std::min(length, local)}};
1427 sycl::nd_range<1> r{sycl::range<1>{round_length},
1428 sycl::range<1>{local}};
1429 /* Two accessors are used: one to the buffer that is being reduced,
1430 * and a second to local memory, used to store intermediate data. */
1431 auto x_access = xbuf.template get_access<sycl::access::mode::read_write>(h);
1432
1433 //sycl::accessor<T, 1, sycl::access::mode::read_write,sycl::access::target::local>
1434 // scratch(sycl::range<1>(local), h);
1435 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1436
1437 /* The parallel_for invocation chosen is the variant with an nd_item
1438 * parameter, since the code requires barriers for correctness. */
1439 h.parallel_for<class reduce_sum2>(r, [x_access, scratch, local, length](sycl::nd_item<1> id)
1440 {
1441 std::size_t globalid = id.get_global_id(0);
1442 std::size_t localid = id.get_local_id(0);
1443
1444 /* All threads collectively read from global memory into local.
1445 * The barrier ensures all threads' IO is resolved before
1446 * execution continues (strictly speaking, all threads within
1447 * a single work-group - there is no co-ordination between
1448 * work-groups, only work-items). */
1449 scratch[localid] = (globalid < length)? x_access[globalid] : 0. ;
1450 id.barrier(sycl::access::fence_space::local_space);
1451
1452 /* Apply the reduction operation between the current local
1453 * id and the one on the other half of the vector. */
1454 //#ifdef DEBUG
1455 if (globalid < length)
1456 {
1457 //int min = (length < local) ? length : local;
1458 int min = local ;
1459 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1460 //for (std::size_t offset = id.get_local_range(0) / 2; offset > 0; offset /= 2)
1461 {
1462 if (localid < offset)
1463 {
1464 scratch[localid] += scratch[localid + offset];
1465 }
1466 id.barrier(sycl::access::fence_space::local_space);
1467 }
1468 /* The final result will be stored in local id 0. */
1469 if (localid == 0)
1470 {
1471 x_access[id.get_group(0)] = scratch[localid];
1472 }
1473 }
1474 //#endif
1475 });
1476 };
1477 // clang-format on
1478 m_env->internal()->queue().submit(f);
1479 /* At this point, you could queue::wait_and_throw() to ensure that
1480 * errors are caught quickly. However, this would likely impact
1481 * performance negatively. */
1482 length = (length + local - 1) / local;
1483 std::cout << "AFTER LENGTH :" << level << " new length" << length << " " << local << std::endl;
1484 ++level;
1485 } while (length > 1);
1486 }
1487 /* It is always sensible to wrap host accessors in their own scope as
1488 * kernels using the buffers they access are blocked for the length
1489 * of the accessor's lifetime. */
1490 auto hI = xbuf.get_host_acces();
1491 value = hI[0];
1492 }
1493 //value = x[0] ;
1494 return value;
1495 }
1496
1497 template <typename T>
1498 T sycl_reduce_sum(sycl::buffer<T>& x,
1499 sycl::buffer<T>& y)
1500 {
1501 T sum_init = 0;
1502 sycl::buffer<T> sum_buff{ &sum_init, 1 };
1503
1504 // clang-format off
1505 m_env->internal()->queue().submit([&](sycl::handler &cgh)
1506 {
1507 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1508 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
1509#ifdef USE_HIPSYCL
1510 sycl::accessor sum_acc {sum_buff, cgh};
1511 auto sumReduction = sycl::reduction(sum_acc, sycl::plus<T>());
1512#endif
1513#ifdef USE_ONEAPI
1514 auto sumReduction = sycl::reduction(sum_buff, cgh, sycl::plus<T>());
1515#endif
1516#ifdef USE_ACPPSYCL
1517 auto sumReduction = sycl::reduction(sum_buff, cgh, sycl::plus<T>());
1518#endif
1519 cgh.parallel_for(sycl::range<1>{x.size()},
1520 sumReduction,
1521 [=](sycl::id<1> idx, auto &sum)
1522 {
1523 sum += access_x[idx]*access_y[idx];
1524 });
1525 });
1526 // clang-format on
1527
1528 return sum_buff.get_host_access()[0];
1529 }
1530
1531 /*
1532 namespace mi300 {
1533
1534 // Constantes tuned pour MI300 (gfx942)
1535 // - CU count : 228 CUs
1536 // - Wavefront : 64 threads (RDNA/CDNA)
1537 // - LDS/CU : 64 KB → 1024 threads × 8 B (double) = 8 KB/workgroup, safe
1538 inline constexpr std::size_t WG_SIZE = 1024; // 16 wavefronts/WG
1539 inline constexpr std::size_t ITEMS_PER_WI = 8; // unroll : chaque WI traite 8 éléments
1540
1541 // ---------------------------------------------------------------------------
1542 // dot_product_device
1543 // Calcule <x, y> = sum_i x[i]*y[i] sur GPU MI300.
1544 // x_buf, y_buf : buffers SYCL en lecture (taille >= n)
1545 // Retourne le scalaire double via un buffer résultat temporaire.
1546 // ---------------------------------------------------------------------------
1547 inline double dot_product(
1548 sycl::queue& q,
1549 sycl::buffer<double, 1>& x_buf,
1550 sycl::buffer<double, 1>& y_buf,
1551 std::size_t n)
1552 {
1553 if (n == 0) return 0.0;
1554
1555 // Nombre de work-groups : on arrondit au multiple de WG_SIZE*ITEMS_PER_WI
1556 const std::size_t stride = WG_SIZE * ITEMS_PER_WI;
1557 const std::size_t n_wg = (n + stride - 1) / stride;
1558 const std::size_t n_padded = n_wg * stride; // domaine de dispatch
1559
1560 // Buffer de réductions partielles (une valeur par WG)
1561 sycl::buffer<double, 1> partial_buf(n_wg);
1562
1563 // -------------------------------------------------------------------
1564 // Kernel 1 — réduction locale par work-group
1565 // Chaque WI charge ITEMS_PER_WI paires (x[i], y[i]) et accumule.
1566 // La réduction finale dans le WG utilise la LDS (sycl::local_accessor).
1567 // -------------------------------------------------------------------
1568 q.submit([&](sycl::handler& cgh) {
1569 auto x = x_buf.template get_access<sycl::access::mode::read>(cgh);
1570 auto y = y_buf.template get_access<sycl::access::mode::read>(cgh);
1571 auto out = partial_buf.template get_access<sycl::access::mode::discard_write>(cgh);
1572
1573 // LDS : WG_SIZE doubles par work-group
1574 sycl::local_accessor<double, 1> local_sum(WG_SIZE, cgh);
1575
1576 sycl::nd_range<1> nd_range{n_padded, WG_SIZE};
1577
1578 cgh.parallel_for(nd_range, [=](sycl::nd_item<1> item) {
1579 const std::size_t gid = item.get_global_id(0);
1580 const std::size_t lid = item.get_local_id(0);
1581 const std::size_t wg_id = item.get_group(0);
1582 const std::size_t base = wg_id * stride;
1583
1584 // Accumulation privée — ITEMS_PER_WI éléments par WI
1585 double acc = 0.0;
1586 #pragma unroll
1587 for (std::size_t k = 0; k < ITEMS_PER_WI; ++k) {
1588 const std::size_t idx = base + lid + k * WG_SIZE;
1589 if (idx < n) {
1590 acc += x[idx] * y[idx];
1591 }
1592 }
1593
1594 // Dépôt en LDS
1595 local_sum[lid] = acc;
1596 sycl::group_barrier(item.get_group());
1597
1598 // Réduction en arbre dans la LDS (log2(WG_SIZE) = 10 passes)
1599 for (std::size_t offset = WG_SIZE / 2; offset > 0; offset >>= 1) {
1600 if (lid < offset) {
1601 local_sum[lid] += local_sum[lid + offset];
1602 }
1603 sycl::group_barrier(item.get_group());
1604 }
1605
1606 // WI 0 écrit la somme partielle du WG
1607 if (lid == 0) {
1608 out[wg_id] = local_sum[0];
1609 }
1610 });
1611 });
1612
1613 // -------------------------------------------------------------------
1614 // Kernel 2 — réduction finale des n_wg partielles
1615 // Si n_wg <= WG_SIZE on fait tout en un seul WG.
1616 // Sinon on rappelle dot_product récursivement (rare pour n < 2^30).
1617 // -------------------------------------------------------------------
1618 double result = 0.0;
1619
1620 if (n_wg <= WG_SIZE) {
1621 sycl::buffer<double, 1> result_buf(&result, 1);
1622
1623 q.submit([&](sycl::handler& cgh) {
1624 auto src = partial_buf.template get_access<sycl::access::mode::read>(cgh);
1625 auto dst = result_buf.template get_access<sycl::access::mode::discard_write>(cgh);
1626 sycl::local_accessor<double, 1> lmem(WG_SIZE, cgh);
1627
1628 // On dispatch exactement WG_SIZE threads, padding avec 0
1629 cgh.parallel_for(sycl::nd_range<1>{WG_SIZE, WG_SIZE},
1630 [=](sycl::nd_item<1> item) {
1631 const std::size_t lid = item.get_local_id(0);
1632 lmem[lid] = (lid < n_wg) ? src[lid] : 0.0;
1633 sycl::group_barrier(item.get_group());
1634
1635 for (std::size_t offset = WG_SIZE / 2; offset > 0; offset >>= 1) {
1636 if (lid < offset)
1637 lmem[lid] += lmem[lid + offset];
1638 sycl::group_barrier(item.get_group());
1639 }
1640 if (lid == 0) dst[0] = lmem[0];
1641 });
1642 });
1643 // L'accès au buffer result_buf en sortie de scope force la synchronisation
1644 } else {
1645 // Chemin récursif (n > WG_SIZE² ~ 1M WGs, soit n > ~8G éléments)
1646 // Rare en pratique — on réduit les partielles via un second appel
1647 result = dot_product(q, partial_buf, partial_buf, n_wg);
1648 }
1649
1650 return result;
1651 }
1652
1653 // ---------------------------------------------------------------------------
1654 // Variante asynchrone : retourne un sycl::event + résultat via buffer externe
1655 // Utile pour masquer la latence dans une boucle itérative (ex: CG)
1656 // ---------------------------------------------------------------------------
1657 inline sycl::event dot_product_async(
1658 sycl::queue& q,
1659 sycl::buffer<double, 1>& x_buf,
1660 sycl::buffer<double, 1>& y_buf,
1661 sycl::buffer<double, 1>& result_buf, // taille >= 1
1662 std::size_t n)
1663 {
1664 if (n == 0) {
1665 return q.submit([&](sycl::handler& cgh) {
1666 auto r = result_buf.template get_access<sycl::access::mode::discard_write>(cgh);
1667 cgh.single_task([=]{ r[0] = 0.0; });
1668 });
1669 }
1670
1671 const std::size_t stride = WG_SIZE * ITEMS_PER_WI;
1672 const std::size_t n_wg = (n + stride - 1) / stride;
1673 const std::size_t n_padded = n_wg * stride;
1674
1675 // Shared ptr pour que le buffer partiel survive au scope du lambda
1676 auto partial = std::make_shared<sycl::buffer<double,1>>(n_wg);
1677
1678 q.submit([&, partial](sycl::handler& cgh) {
1679 auto x = x_buf.template get_access<sycl::access::mode::read>(cgh);
1680 auto y = y_buf.template get_access<sycl::access::mode::read>(cgh);
1681 auto out = partial->template get_access<sycl::access::mode::discard_write>(cgh);
1682 sycl::local_accessor<double, 1> local_sum(WG_SIZE, cgh);
1683
1684 cgh.parallel_for(sycl::nd_range<1>{n_padded, WG_SIZE},
1685 [=](sycl::nd_item<1> item) {
1686 const std::size_t lid = item.get_local_id(0);
1687 const std::size_t base = item.get_group(0) * stride;
1688 double acc = 0.0;
1689 #pragma unroll
1690 for (std::size_t k = 0; k < ITEMS_PER_WI; ++k) {
1691 const std::size_t idx = base + lid + k * WG_SIZE;
1692 if (idx < n) acc += x[idx] * y[idx];
1693 }
1694 local_sum[lid] = acc;
1695 sycl::group_barrier(item.get_group());
1696 for (std::size_t offset = WG_SIZE / 2; offset > 0; offset >>= 1) {
1697 if (lid < offset) local_sum[lid] += local_sum[lid + offset];
1698 sycl::group_barrier(item.get_group());
1699 }
1700 if (lid == 0) out[item.get_group(0)] = local_sum[0];
1701 });
1702 });
1703
1704 return q.submit([partial](sycl::handler& cgh) {
1705 auto src = partial->template get_access<sycl::access::mode::read>(cgh);
1706 // reduction via sycl::reduction (SYCL 2020)
1707 // Note : result_buf doit être capturé par référence externe
1708 // → on utilise un single_task pour la somme finale des partielles (n_wg petit)
1709 cgh.single_task([=](){
1710 // accessible depuis le lambda — n_wg connu via closure
1711 });
1712 });
1713 // NOTE : pour la variante async complète, préférez sycl::reduction (voir ci-dessous)
1714 }
1715 } */
1716
1717
1718 template <typename T>
1719 inline T dot_product_h100(sycl::buffer<T>& buf_x,
1720 sycl::buffer<T>& buf_y)
1721 {
1722 using namespace sycl;
1723 auto& q = m_env->internal()->queue();
1724 const size_t N = buf_x.size();
1725 assert(buf_y.size() == N);
1726
1727 auto dev = q.get_device();
1728 //const size_t num_sm = dev.get_info<info::device::max_compute_units>(); // 132
1729
1730 // Grid : TARGET_WAVES * num_sm blocs, chacun de WG_SIZE threads,
1731 // chaque thread traite ITEMS_PER_WI éléments → couverture totale
1732 const size_t blocks_needed = (N + WG_SIZE * ITEMS_PER_WI - 1)
1733 / (WG_SIZE * ITEMS_PER_WI);
1734 const size_t num_blocks = std::max(blocks_needed,
1735 m_max_num_groups * TARGET_WAVES);
1736 const size_t total_threads = num_blocks * WG_SIZE;
1737
1738 // Allocation USM pour les partiels (un T par bloc)
1739 //T* partials = malloc_device<T>(num_blocks, q);
1740 sycl::buffer<T> partials{num_blocks};
1741
1742 // ── Phase 1 : kernel de réduction par blocs ───────────────────────────
1743 q.submit(
1744 [&](handler& cgh)
1745 {
1746 auto ax = buf_x.template get_access<access::mode::read>(cgh);
1747 auto ay = buf_y.template get_access<access::mode::read>(cgh);
1748 auto ap = partials.template get_access<access::mode::read_write>(cgh);
1749 // LDS : 1 T par warp dans le bloc
1750 local_accessor<T,1> lds{WG_SIZE / WARP_SIZE, cgh};
1751
1752 //const T* px = ax.get_pointer(); // accès USM-like depuis accessor
1753 //const T* py = ay.get_pointer();
1754 //T* pp = partials;
1755 const size_t n = N;
1756
1757 cgh.parallel_for<class dot_h100_phase1>(
1758 nd_range<1>{{total_threads}, {WG_SIZE}},
1759 [=](nd_item<1> item)
1760 [[intel::reqd_sub_group_size(WARP_SIZE)]]
1761 {
1762 h100::dot_kernel_h100(item, ax, ay, ap, n, lds);
1763 });
1764 });
1765
1766 // ── Phase 2 : réduction finale des partiels sur le CPU ────────────────
1767 // Pour N = 100M Ts → num_blocks ≤ 2048 → 16 KB de partiels.
1768 // Un memcpy D2H de 16 KB est négligeable vs le kernel principal.
1769 // Alternative : second kernel de réduction si num_blocks > 10 000.
1770 //std::vector<T> h_partials(num_blocks);
1771 //q.memcpy(h_partials.data(), partials, num_blocks * sizeof(T)).wait();
1772 //free(partials, q);
1773 auto h_partials = partials.get_host_access();
1774
1775 // Kahan summation pour précision numérique sur la réduction finale
1776 T sum = 0.0, c = 0.0;
1777 for (size_t b = 0; b < num_blocks; ++b) {
1778 T z = h_partials[b] - c;
1779 T t = sum + z;
1780 c = (t - sum) - z;
1781 sum = t;
1782 }
1783 return sum;
1784 }
1785
1786 template <typename T>
1787 inline std::size_t asynch_dot_product_h100(sycl::buffer<T>& buf_x,
1788 sycl::buffer<T>& buf_y,
1789 sycl::buffer<T>& res,
1790 sycl::event& event)
1791 {
1792 using namespace sycl;
1793 auto& q = m_env->internal()->queue();
1794 const size_t N = buf_x.size();
1795 assert(buf_y.size() == N);
1796
1797 //auto dev = q.get_device();
1798 //const size_t num_sm = dev.get_info<info::device::max_compute_units>(); // 132
1799
1800 // Grid : TARGET_WAVES * num_sm blocs, chacun de WG_SIZE threads,
1801 // chaque thread traite ITEMS_PER_WI éléments → couverture totale
1802 const size_t blocks_needed = (N + WG_SIZE * ITEMS_PER_WI - 1)
1803 / (WG_SIZE * ITEMS_PER_WI);
1804 const size_t num_blocks = std::max(blocks_needed,m_max_num_groups * TARGET_WAVES);
1805 const size_t total_threads = num_blocks * WG_SIZE;
1806
1807 // Allocation USM pour les partiels (un T par bloc)
1808 //partials = malloc_device<T>(num_blocks, q);
1809
1810 // ── Phase 1 : kernel de réduction par blocs ───────────────────────────
1811 event = q.submit(
1812 [&](handler& cgh)
1813 {
1814 auto ax = buf_x.template get_access<access::mode::read>(cgh);
1815 auto ay = buf_y.template get_access<access::mode::read>(cgh);
1816 auto ap = res.template get_access<sycl::access::mode::read_write>(cgh);
1817 // LDS : 1 T par warp dans le bloc
1818 local_accessor<T,1> lds{WG_SIZE / WARP_SIZE, cgh};
1819
1820 //const T* px = ax.get_pointer(); // accès USM-like depuis accessor
1821 //const T* py = ay.get_pointer();
1822 //T* pp = ap.get_pointer();
1823 const size_t n = N;
1824
1825 cgh.parallel_for<class dot_h100_phase1>(
1826 nd_range<1>{{total_threads}, {WG_SIZE}},
1827 [=](nd_item<1> item)
1828 [[intel::reqd_sub_group_size(WARP_SIZE)]]
1829 {
1830 h100::dot_kernel_h100(item, ax, ay, ap, n, lds);
1831 });
1832 });
1833 return num_blocks;
1834 }
1835
1836
1837 template <typename T>
1838 inline T end_dot_product_h100(sycl::event& event,
1839 sycl::buffer<T>& res,
1840 std::size_t num_blocks)
1841 {
1842 event.wait() ;
1843 // ── Phase 2 : réduction finale des partiels sur le CPU ────────────────
1844 // Pour N = 100M Ts → num_blocks ≤ 2048 → 16 KB de partiels.
1845 // Un memcpy D2H de 16 KB est négligeable vs le kernel principal.
1846 // Alternative : second kernel de réduction si num_blocks > 10 000.
1847 //std::vector<T> h_partials(num_blocks);
1848 //m_env->internal()->queue().memcpy(h_partials.data(), partials, num_blocks * sizeof(T)).wait();
1849 //free(partials, q);
1850 auto h_partials = res.get_host_access();
1851
1852 // Kahan summation pour précision numérique sur la réduction finale
1853 T sum = 0.0, c = 0.0;
1854 for (size_t b = 0; b < num_blocks; ++b) {
1855 T z = h_partials[b] - c;
1856 T t = sum + z;
1857 c = (t - sum) - z;
1858 sum = t;
1859 }
1860 return sum;
1861 }
1862
1863 // ---------------------------------------------------------------------------
1864 // Variante moderne SYCL 2020 : sycl::reduction (plus concis, potentiellement
1865 // optimisé par le runtime AdaptiveCPP selon la cible)
1866 // ---------------------------------------------------------------------------
1867
1868 template<typename T>
1869 inline T dot_product_mi300(sycl::buffer<T, 1>& x_buf,
1870 sycl::buffer<T, 1>& y_buf)
1871 {
1872 using namespace mi300 ;
1873 std::cout<<" DOT PROD MI300 : "<<WG_SIZE<<" "<<ITEMS_PER_WI<<std::endl ;
1874
1875 bool use_doublebuf = false;
1876 auto& q = m_env->internal()->queue();
1877 /*
1878
1879 std::size_t n = x_buf.size() ;
1880 sycl::buffer<T, 1> result_buf(&result, 1);
1881
1882 const std::size_t stride = WG_SIZE * ITEMS_PER_WI;
1883 const std::size_t n_padded = ((n + stride - 1) / stride) * stride;
1884
1885 q.submit([&x_buf, &y_buf, &result_buf, n, n_padded, stride](sycl::handler& cgh) {
1886 auto x = x_buf.template get_access<sycl::access::mode::read>(cgh);
1887 auto y = y_buf.template get_access<sycl::access::mode::read>(cgh);
1888 auto r = result_buf.template get_access<sycl::access::mode::read_write>(cgh);
1889 //sycl::accessor r {result_buf, cgh};
1890 auto red = sycl::reduction(r, sycl::plus<T>{});
1891 cgh.parallel_for(
1892 sycl::nd_range<1>{n_padded, WG_SIZE}, red,
1893 [=](sycl::nd_item<1> item, auto& sum) {
1894 const std::size_t lid = item.get_local_id(0);
1895 const std::size_t base = item.get_group(0) * stride;
1896 T acc = 0.0;
1897 #pragma unroll
1898 for (std::size_t k = 0; k < ITEMS_PER_WI; ++k) {
1899 const std::size_t idx = base + lid + k * WG_SIZE;
1900 if (idx < n) acc += x[idx] * y[idx];
1901 }
1902 sum.combine(acc);
1903 });
1904 });
1905 */
1906
1907 std::size_t N = x_buf.size() ;
1908 // Grid : couvrir N en ITEMS_PER_WI éléments/WI
1909 const size_t blocks_needed = (N + WG_SIZE * ITEMS_PER_WI - 1)
1910 / (WG_SIZE * ITEMS_PER_WI);
1911 // Minimum : num_cu × TARGET_WAVES × (WG_SIZE/WAVEFRONT) blocs
1912 // = 228 × 4 × 4 = 3 648 — cohérent avec les profils rocprof
1913 const size_t num_blocks = std::max(blocks_needed,
1914 m_max_num_groups * TARGET_WAVES
1915 * (WG_SIZE / WAVEFRONT));
1916 const size_t total_threads = num_blocks * WG_SIZE;
1917 //std::cout<<" N = "<<N<<" "<<" num_blocks="<<num_blocks<<" nthreads="<<total_threads<<std::endl ;
1918 //double* partials = malloc_device<double>(num_blocks, q);
1919 sycl::buffer<T, 1> partials(num_blocks);
1920
1921 q.submit([&](sycl::handler& cgh)
1922 {
1923 auto ax = x_buf.template get_access<sycl::access::mode::read>(cgh);
1924 auto ay = y_buf.template get_access<sycl::access::mode::read>(cgh);
1925 auto ap = partials.template get_access<sycl::access::mode::read_write>(cgh);
1926 sycl::local_accessor<double,1> lds{N_WARPS_BLOC, cgh}; // 4 doubles = 32 B
1927
1928 //const double* px = ax.get_pointer();
1929 //const double* py = ay.get_pointer();
1930 //double* pp = partials;
1931 const size_t n = N;
1932 const bool db = use_doublebuf;
1933
1934 cgh.parallel_for<class dot_mi300_phase1>(
1935 sycl::nd_range<1>{{total_threads}, {WG_SIZE}},
1936 // ★ sub_group_size(64) — force WF=64 sur gfx942
1937 // Sans cette annotation, ACPP peut émettre WF=32
1938 // et la réduction sub_group ne couvrirait que 32 lanes
1939 [=](sycl::nd_item<1> item)
1940 [[intel::reqd_sub_group_size(WAVEFRONT)]]
1941 {
1942 if (db)
1943 mi300::dot_kernel_mi300_doublebuf(item, ax, ay, ap, n, lds);
1944 else
1945 mi300::dot_kernel_mi300(item, ax, ay, ap, n, lds);
1946 });
1947 }).wait();
1948
1949 // Phase 2 : réduction CPU des partiels
1950 //std::vector<ValueT> h_partials(num_blocks);
1951 //q.memcpy(h_partials.data(), partials, num_blocks * sizeof(double)).wait();
1952 //free(partials, q);
1953 auto h_partials = partials.get_host_access();
1954
1955 // Kahan summation — précision sur la somme des ~3 648 partiels
1956 T sum = 0, c = 0;
1957 for (std::size_t b = 0; b < num_blocks; ++b)
1958 {
1959 T z = h_partials[b] - c;
1960 double t = sum + z;
1961 c = (t - sum) - z;
1962 sum = t;
1963 }
1964 return sum;
1965 }
1966
1967 template <typename T>
1968 inline void asynch_dot_product_mi300(sycl::buffer<T, 1>& x_buf,
1969 sycl::buffer<T, 1>& y_buf,
1970 sycl::buffer<T, 1>& result_buf,
1971 sycl::event& event)
1972 {
1973 using namespace mi300 ;
1974
1975 std::size_t n = x_buf.size() ;
1976 auto& q = m_env->internal()->queue();
1977
1978 const std::size_t stride = WG_SIZE * ITEMS_PER_WI;
1979 const std::size_t n_padded = ((n + stride - 1) / stride) * stride;
1980
1981 event = q.submit([&](sycl::handler& cgh) {
1982 auto x = x_buf.template get_access<sycl::access::mode::read>(cgh);
1983 auto y = y_buf.template get_access<sycl::access::mode::read>(cgh);
1984 auto red = sycl::reduction(result_buf, cgh, sycl::plus<T>{});
1985
1986 cgh.parallel_for(
1987 sycl::nd_range<1>{n_padded, WG_SIZE}, red,
1988 [=](sycl::nd_item<1> item, auto& sum) {
1989 const std::size_t lid = item.get_local_id(0);
1990 const std::size_t base = item.get_group(0) * stride;
1991 T acc = 0.0;
1992 #pragma unroll
1993 for (std::size_t k = 0; k < ITEMS_PER_WI; ++k) {
1994 const std::size_t idx = base + lid + k * WG_SIZE;
1995 if (idx < n) acc += x[idx] * y[idx];
1996 }
1997 sum.combine(acc);
1998 });
1999 });
2000 }
2001
2002
2003
2004 template <typename T>
2005 T end_dot_product_mi300(sycl::event& event,
2006 sycl::buffer<T>& res,
2007 std::size_t num_blocks)
2008 {
2009 event.wait() ;
2010 auto h_access = res.get_host_access();
2011 return h_access[0];
2012 }
2013
2014 template <typename T>
2015 T dot(sycl::buffer<T>& x,
2016 sycl::buffer<T>& y)
2017 {
2018 switch (m_dot_algo) {
2019 case 0:
2020 return reduce_sum(x, y);
2021 case 1:
2022 return map_reduce_sum(x, y);
2023 case 2:
2024 return map2_reduce_sum(x, y); // with sycl_reduction
2025 case 3:
2026 return map3_reduce_sum(x, y);
2027 case 4: //H100
2028 return dot_product_h100(x, y);
2029 case 5: //MI300
2030 return dot_product_mi300(x, y);
2031 default:
2032 return sycl_reduce_sum(x, y);
2033 }
2034 }
2035
2036 template <typename T>
2037 void dot(sycl::buffer<T>& x,
2038 sycl::buffer<T>& y,
2039 Future<Real>& res)
2040{
2041 switch (m_dot_algo)
2042 {
2043 case 2:
2044 asynch_map4_reduce_sum(x, y, res.deviceValue(), res.event());
2045 res.setWaitFunction([=](sycl::event& event, sycl::buffer<double>& res, std::size_t num_blocks)
2046 {
2047 return this->end_map4_reduce_sum(event,res, num_blocks) ;
2048 }) ;
2049 break;
2050 case 4:
2051 {
2052 std::size_t num_blocks = asynch_dot_product_h100(x, y, res.deviceValue(), res.event());
2053 res.setWaitFunction([=](sycl::event& event, sycl::buffer<double>& res, std::size_t num_blocks)
2054 {
2055 return this->end_dot_product_h100(event, res, num_blocks) ;
2056 }) ;
2057 res.setNumBlocks(num_blocks) ;
2058 }
2059 break;
2060 case 5:
2061 asynch_dot_product_mi300(x, y, res.deviceValue(), res.event());
2062 res.setWaitFunction([=](sycl::event& event, sycl::buffer<T>& res, std::size_t num_blocks)
2063 {
2064 return this->end_dot_product_mi300(event,res, num_blocks) ;
2065 }) ;
2066 break;
2067 default:
2068 asynch_map5_reduce_sum(x, y, res.deviceValue(), res.event());
2069 res.setWaitFunction([=](sycl::event& event, sycl::buffer<T>& res, std::size_t num_blocks)
2070 {
2071 return this->end_map5_reduce_sum(event,res, num_blocks) ;
2072 }) ;
2073 break;
2074 }
2075 }
2076
2077 private:
2078 // clang-format off
2079 SYCLEnv* m_env = nullptr ;
2080 std::size_t m_max_num_groups = 0 ;
2081 std::size_t m_max_work_group_size = 0 ;
2082 std::size_t m_total_threads = 0 ;
2083 // clang-format on
2084
2085 template <typename T>
2086 sycl::buffer<T>& getWorkBuffer(std::size_t size);
2087
2088 mutable sycl::buffer<double>* m_double_work = nullptr;
2089};
2090
2091
2092/*---------------------------------------------------------------------------*/
2093
2094} // namespace Alien::SYCLInternal
2095
2096/*---------------------------------------------------------------------------*/