297 m_env = SYCLEnv::instance();
300 m_max_num_groups = m_env->maxNumGroups() ;
301 m_max_work_group_size = m_env->maxWorkGroupSize() ;
302 m_total_threads = m_env->maxNumThreads() ;
306 virtual ~KernelInternal() {}
308 void setDotAlgo(
int dot_algo)
310 m_dot_algo = dot_algo;
313 template <
typename T>
314 void assign(T
const a,
317 m_env->internal()->queue().submit(
318 [&](sycl::handler& cgh)
320 auto acc = y.template get_access<sycl::access::mode::discard_write>(cgh);
342 template <
typename T,
typename Lambda>
343 void apply(Lambda
const& lambda,
346 sycl::range<1> work_items{ m_total_threads };
349 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
351 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
352 auto y_length = y.size() ;
353 cgh.parallel_for<
class vector_apply>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
355 auto id = itemId.get_id(0);
356 for (
auto i =
id; i < y_length; i += itemId.get_range()[0])
357 access_y[i] = lambda(i);
365 template <
typename T>
369 sycl::range<1> work_items{ m_total_threads };
372 m_env->internal()->queue().submit([&](sycl::handler& cgh)
374 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
375 auto y_length = y.size() ;
376 cgh.parallel_for<
class vector_scal>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
378 auto id = itemId.get_id(0);
379 for (
auto i =
id; i < y_length; i += itemId.get_range()[0])
380 access_y[i] = a*access_y[i];
387 template <
typename T>
392 sycl::range<1> work_items{ m_total_threads };
395 m_env->internal()->queue().submit([&](sycl::handler& cgh)
397 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
398 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
399 auto y_length = y.size() ;
400 cgh.parallel_for<
class vector_axpy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
402 auto id = itemId.get_id(0);
403 for (
auto i =
id; i < y_length; i += itemId.get_range()[0])
404 access_y[i] += a * access_x[i];
411 template <
typename T>
415 using VecT = sycl::vec<T, 2>;
417 auto& queue = m_env->internal()->queue();
418 const size_t n = y.size();
419 const size_t n2 = n / 2;
420 const size_t tail = n % 2;
422 static constexpr size_t WG_SIZE = 256;
423 const size_t blocks = std::max((n2 + WG_SIZE - 1) / WG_SIZE,
424 m_max_num_groups * 4UL);
425 const size_t total = blocks * WG_SIZE;
430 sycl::buffer<VecT> y2{ y.template reinterpret<VecT>(sycl::range<1>{n2}) };
432 queue.submit([&](sycl::handler& cgh) {
433 auto ay = y2.template get_access<sycl::access::mode::read_write>(cgh);
435 cgh.parallel_for<
class vector_xcal_vec>(
436 sycl::nd_range<1>{ {total}, {WG_SIZE} },
437 [=](sycl::nd_item<1> item) {
438 const size_t stride = item.get_global_range()[0];
439 for (
size_t i = item.get_global_id(0); i < n2; i += stride) {
441 ay[i] = VecT{a, a} * ay[i];
448 sycl::buffer<T> yt{ y.template reinterpret<T>(sycl::range<1>{n}) };
449 queue.submit([&](sycl::handler& cgh) {
450 auto ay = yt.template get_access<sycl::access::mode::read_write>(cgh);
451 cgh.single_task([=]{ ay[n-1] = a * ay[n-1]; });
457 template <
typename T>
462 using VecT = sycl::vec<T, 2>;
464 auto& queue = m_env->internal()->queue();
465 const size_t n = y.size();
466 const size_t n2 = n / 2;
467 const size_t tail = n % 2;
469 static constexpr size_t WG_SIZE = 256;
470 const size_t blocks = std::max((n2 + WG_SIZE - 1) / WG_SIZE,
471 m_max_num_groups * 4UL);
472 const size_t total = blocks * WG_SIZE;
477 sycl::buffer<VecT> x2{ x.template reinterpret<VecT>(sycl::range<1>{n2}) };
478 sycl::buffer<VecT> y2{ y.template reinterpret<VecT>(sycl::range<1>{n2}) };
480 queue.submit([&](sycl::handler& cgh) {
481 auto ax = x2.template get_access<sycl::access::mode::read>(cgh);
482 auto ay = y2.template get_access<sycl::access::mode::read_write>(cgh);
484 cgh.parallel_for<
class vector_axpy_vec>(
485 sycl::nd_range<1>{ {total}, {WG_SIZE} },
486 [=](sycl::nd_item<1> item) {
487 const size_t stride = item.get_global_range()[0];
488 for (
size_t i = item.get_global_id(0); i < n2; i += stride) {
490 ay[i] = ay[i] + VecT{a, a} * ax[i];
497 sycl::buffer<T> xt{ x.template reinterpret<T>(sycl::range<1>{n}) };
498 sycl::buffer<T> yt{ y.template reinterpret<T>(sycl::range<1>{n}) };
499 queue.submit([&](sycl::handler& cgh) {
500 auto ax = xt.template get_access<sycl::access::mode::read>(cgh);
501 auto ay = yt.template get_access<sycl::access::mode::read_write>(cgh);
502 cgh.single_task([=]{ ay[n-1] += a * ax[n-1]; });
507 template <
typename T>
514 sycl::range<1> work_items{ m_total_threads };
517 m_env->internal()->queue().submit([&](sycl::handler& cgh)
519 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
520 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
521 auto x_length = x.size()/stride_x ;
522 cgh.parallel_for<
class vector_axpy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
524 auto id = itemId.get_id(0);
525 for (
auto i =
id; i < x_length; i += itemId.get_range()[0])
526 access_y[i*stride_y] += a * access_x[i*stride_x];
532 template <
typename T>
533 void pointwiseMult(sycl::buffer<T>& x,
538 sycl::range<1> work_items{ m_total_threads };
541 m_env->internal()->queue().submit([&](sycl::handler& cgh)
543 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
544 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
545 auto access_z = z.template get_access<sycl::access::mode::read_write>(cgh);
546 auto y_length = y.size() ;
547 cgh.parallel_for<
class vector_pointwizemult>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
549 auto id = itemId.get_id(0);
550 for (
auto i =
id; i < y_length; i += itemId.get_range()[0])
551 access_z[i] = access_x[i] * access_y[i];
557 using VecT = sycl::vec<T, 2>;
559 auto& queue = m_env->internal()->queue();
560 const size_t n = y.size();
561 const size_t n2 = n / 2;
562 const size_t tail = n % 2;
564 static constexpr size_t WG_SIZE = 256;
565 const size_t blocks = std::max((n2 + WG_SIZE - 1) / WG_SIZE,
566 m_max_num_groups * 4UL);
567 const size_t total = blocks * WG_SIZE;
572 sycl::buffer<VecT> x2{ x.template reinterpret<VecT>(sycl::range<1>{n2}) };
573 sycl::buffer<VecT> y2{ y.template reinterpret<VecT>(sycl::range<1>{n2}) };
574 sycl::buffer<VecT> z2{ z.template reinterpret<VecT>(sycl::range<1>{n2}) };
576 queue.submit([&](sycl::handler& cgh) {
577 auto ax = x2.template get_access<sycl::access::mode::read>(cgh);
578 auto ay = y2.template get_access<sycl::access::mode::read>(cgh);
579 auto az = z2.template get_access<sycl::access::mode::read_write>(cgh);
581 cgh.parallel_for<
class vector_pointwizemult>(
582 sycl::nd_range<1>{ {total}, {WG_SIZE} },
583 [=](sycl::nd_item<1> item) {
584 const size_t stride = item.get_global_range()[0];
585 for (
size_t i = item.get_global_id(0); i < n2; i += stride) {
587 az[i] = ax[i] * ay[i];
594 sycl::buffer<T> xt{ x.template reinterpret<T>(sycl::range<1>{n}) };
595 sycl::buffer<T> yt{ y.template reinterpret<T>(sycl::range<1>{n}) };
596 sycl::buffer<T> zt{ z.template reinterpret<T>(sycl::range<1>{n}) };
597 queue.submit([&](sycl::handler& cgh) {
598 auto ax = xt.template get_access<sycl::access::mode::read>(cgh);
599 auto ay = yt.template get_access<sycl::access::mode::read>(cgh);
600 auto az = zt.template get_access<sycl::access::mode::read_write>(cgh);
601 cgh.single_task([=]{ az[n-1] = ax[n-1] * ay[n-1]; });
605#ifdef PRINT_DEBUG_INFO
607 sycl::host_accessor<T, 1, sycl::access::mode::read> x_acc(x);
608 sycl::host_accessor<T, 1, sycl::access::mode::read> y_acc(y);
609 sycl::host_accessor<T, 1, sycl::access::mode::read> z_acc(z);
610 for(
int il=0;il<x.size();++il)
612 std::cout<<
"X Y Z ["<<il<<
"] : "<<x_acc[il]<<
"*"<<y_acc[il]<<
"="<<z_acc[il]<<std::endl ;
618 template <
typename T>
619 void copy(sycl::buffer<T>& x,
625 m_env->internal()->queue().submit(
626 [&](sycl::handler& cgh)
628 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
629 auto access_y = y.template get_access<sycl::access::mode::discard_write>(cgh);
630 cgh.copy(access_x,access_y) ;
645 template <
typename T>
646 void copy(sycl::buffer<T>& x,
651 sycl::range<1> work_items{ m_total_threads };
654 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
656 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
657 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
658 auto x_length = x.size()/stride_x ;
659 cgh.parallel_for<
class vector_copy>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
661 auto id = itemId.get_id(0);
662 for (
auto i =
id; i < x_length; i += itemId.get_range()[0])
663 access_y[i*stride_y] = access_x[i*stride_x];
670 template <
typename T>
676 template <
typename index_t>
677 inline index_t round_up(
const index_t x,
const index_t y)
679 return ((x + y - 1) / y) * y;
682 template <
typename T>
685 template <
typename T>
686 T reduce_sum(sycl::buffer<T>& x,
690 auto& w = getWorkBuffer<T>(x.size());
693 m_env->internal()->queue().submit( [&](sycl::handler& cgh)
695 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
696 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
698 auto access_w = sycl::accessor { w, cgh, sycl::write_only, sycl::property::no_init{}};
700 auto y_length = y.size() ;
701 cgh.parallel_for<
class vector_dot>(sycl::range<1>{m_total_threads}, [=] (sycl::item<1> itemId)
703 auto id = itemId.get_id(0);
704 for (
auto i =
id; i < y_length; i += itemId.get_range()[0])
705 access_w[i] = access_x[i]*access_y[i];
710 std::size_t local = m_max_work_group_size;
711 std::size_t length = x.size();
718 auto round_length = round_up(length, local);
720 auto f = [length, round_length, local, &w](sycl::handler& h)
mutable
722 sycl::nd_range<1> range{sycl::range<1>{round_length},
723 sycl::range<1>{local}};
724 auto access_w = w.template get_access<sycl::access::mode::read_write>(h);
728 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
732 h.parallel_for<
class sycl_reduction_sum_T>(range,
733 [access_w, scratch, local, length](sycl::nd_item<1> id)
735 std::size_t globalid =
id.get_global_id(0);
736 std::size_t localid =
id.get_local_id(0);
743 scratch[localid] = (globalid < length)? access_w[globalid] : 0. ;
744 id.barrier(sycl::access::fence_space::local_space);
748 if (globalid < length)
751 std::size_t min = local ;
752 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
755 if (localid < offset)
757 scratch[localid] += scratch[localid + offset];
759 id.barrier(sycl::access::fence_space::local_space);
764 access_w[
id.get_group(0)] = scratch[localid];
770 m_env->internal()->queue().submit(f);
774 length = (length + local - 1) / local;
776 }
while (length > 1);
779 auto h_w = w.get_host_access();
784 template <
typename T>
787 template <
typename T>
790 template <
typename T>
791 T map_reduce_sum(sycl::buffer<T>& x,
794 auto& w = getWorkBuffer<T>(x.size());
796 std::size_t local = m_max_work_group_size;
797 std::size_t length = x.size();
805 auto round_length = round_up(length, local);
807 auto f0 = [length, round_length, local, &x,&y, &w](sycl::handler& h)
mutable
809 sycl::nd_range<1> range{sycl::range<1>{round_length},
810 sycl::range<1>{local}};
811 auto access_x = x.template get_access<sycl::access::mode::read>(h);
812 auto access_y = y.template get_access<sycl::access::mode::read>(h);
814 auto access_w = sycl::accessor { w, h, sycl::read_write, sycl::property::no_init{}};
818 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
822 h.parallel_for<
class sycl_map_reduction_sum0_T>(range,
823 [access_x,access_y,access_w, scratch, local, length](sycl::nd_item<1> id)
825 std::size_t globalid =
id.get_global_id(0);
826 std::size_t localid =
id.get_local_id(0);
833 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
835 id.barrier(sycl::access::fence_space::local_space);
839 if (globalid < length)
842 std::size_t min = local ;
843 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
846 if (localid < offset)
848 scratch[localid] += scratch[localid + offset];
850 id.barrier(sycl::access::fence_space::local_space);
855 access_w[
id.get_group(0)] = scratch[localid];
863 auto f1 = [length, round_length, local, &w](sycl::handler& h)
mutable
865 sycl::nd_range<1> range{sycl::range<1>{round_length},
866 sycl::range<1>{local}};
867 auto access_w = w.template get_access<sycl::access::mode::read_write>(h);
871 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
875 h.parallel_for<
class sycl_map_reduction_sum_T>(range,
876 [access_w, scratch, local, length](sycl::nd_item<1> id)
878 std::size_t globalid =
id.get_global_id(0);
879 std::size_t localid =
id.get_local_id(0);
886 scratch[localid] = (globalid < length)? access_w[globalid] : 0. ;
887 id.barrier(sycl::access::fence_space::local_space);
891 if (globalid < length)
894 std::size_t min = local ;
895 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
898 if (localid < offset)
900 scratch[localid] += scratch[localid + offset];
902 id.barrier(sycl::access::fence_space::local_space);
907 access_w[
id.get_group(0)] = scratch[localid];
914 m_env->internal()->queue().submit(f0);
916 m_env->internal()->queue().submit(f1);
920 length = (length + local - 1) / local;
924 auto h_x = w.get_host_access();
927 for (std::size_t i = 0; i < length; ++i)
932 template <
typename T>
935 template <
typename T>
936 T map2_reduce_sum(sycl::buffer<T>& x,
939 std::size_t local = m_max_work_group_size;
940 std::size_t total_threads = m_total_threads;
941 std::size_t length = x.size();
944 sycl::buffer<T> sum{ &sum_init, 1 };
950 auto round_length = round_up(length, local);
952 auto f0 = [length, round_length,total_threads, local, &x, &y, &sum](sycl::handler& h)
mutable
954 sycl::nd_range<1> range{sycl::range<1>{std::min(total_threads,round_length)},
955 sycl::range<1>{local}};
956 auto access_x = x.template get_access<sycl::access::mode::read>(h);
957 auto access_y = y.template get_access<sycl::access::mode::read>(h);
959 sycl::accessor access_sum {sum, h};
961 auto sumReduction = sycl::reduction(access_sum, sycl::plus<T>());
964 auto sumReduction = sycl::reduction(sum, h, sycl::plus<T>());
967 auto sumReduction = sycl::reduction(sum, h, sycl::plus<T>());
972 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
976 h.parallel_for<
class sycl_map2_reduction_sum0_T>(range,
978 [access_x,access_y, scratch, local,total_threads,length](sycl::nd_item<1> id, auto &sum)
980 std::size_t globalid =
id.get_global_id(0);
981 std::size_t localid =
id.get_local_id(0);
988 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
989 for (
auto i = globalid+total_threads; i < length; i += total_threads)
990 scratch[localid] += access_x[i]*access_y[i];
992 id.barrier(sycl::access::fence_space::local_space);
999 std::size_t min = local ;
1000 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1003 if (localid < offset)
1005 scratch[localid] += scratch[localid + offset];
1007 id.barrier(sycl::access::fence_space::local_space);
1013 sum += scratch[localid];
1019 m_env->internal()->queue().submit(f0);
1022 auto h_sum = sum.get_host_access();
1026 template <
typename T>
1029 template <
typename T>
1030 T map3_reduce_sum(sycl::buffer<T>& x,
1033 std::size_t local = m_max_work_group_size;
1034 std::size_t total_threads = m_total_threads;
1035 std::size_t length = x.size();
1039 auto& group_sum = getWorkBuffer<T>(m_max_num_groups);
1044 auto round_length = round_up(length, local);
1051 auto f0 = [total_threads,round_length, length, local, &x, &y,&group_sum](sycl::handler& h)
mutable
1053 sycl::nd_range<1> range{sycl::range<1>{std::min(total_threads,round_length)},
1054 sycl::range<1>{local}};
1055 auto access_x = x.template get_access<sycl::access::mode::read>(h);
1056 auto access_y = y.template get_access<sycl::access::mode::read>(h);
1058 auto access_sum = sycl::accessor { group_sum, h, sycl::read_write, sycl::property::no_init{}};
1065 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1070 h.parallel_for<
class sycl_map3_reduction_sum0_T>(range,
1073 [access_x,access_y,access_sum,scratch,local,length,total_threads] (sycl::nd_item<1> id)
1075 std::size_t globalid =
id.get_global_id(0);
1076 std::size_t localid =
id.get_local_id(0);
1078 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
1080 for (
auto i = globalid+total_threads; i < length; i += total_threads)
1081 scratch[localid] += access_x[i]*access_y[i];
1083 id.barrier(sycl::access::fence_space::local_space);
1089 std::size_t min = local ;
1090 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1092 if (localid < offset)
1094 scratch[localid] += scratch[localid + offset];
1096 id.barrier(sycl::access::fence_space::local_space);
1102 access_sum[
id.get_group(0)] = scratch[localid];
1109 m_env->internal()->queue().submit(f0);
1115 auto h_sum = group_sum.get_host_access();
1117 for (std::size_t i = 0; i < std::min(total_threads, round_length) / local; ++i)
1122 template <
typename T>
1125 template <
typename T>
1126 void asynch_map4_reduce_sum(sycl::buffer<T>& x,
1128 sycl::buffer<T>& res,
1131 std::size_t local = m_max_work_group_size;
1132 std::size_t total_threads = m_total_threads;
1133 std::size_t length = x.size();
1142 auto round_length = round_up(length, local);
1144 auto f0 = [total_threads, round_length, length, local, &x, &y,&res](sycl::handler& h)
mutable
1146 sycl::nd_range<1> range{sycl::range<1>{std::min(total_threads,round_length)},
1147 sycl::range<1>{local}};
1148 auto access_x = x.template get_access<sycl::access::mode::read>(h);
1149 auto access_y = y.template get_access<sycl::access::mode::read>(h);
1151 sycl::accessor access_sum {res, h};
1152 auto sumReduction = sycl::reduction(access_sum, sycl::plus<T>());
1155 auto sumReduction = sycl::reduction(res, h, sycl::plus<T>());
1158 auto sumReduction = sycl::reduction(res, h, sycl::plus<T>());
1168 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1173 h.parallel_for<
class map4_reduction_sum_T>(range,
1175 [access_x,access_y,scratch,local,length,total_threads] (sycl::nd_item<1> id, auto& sum)
1177 std::size_t globalid =
id.get_global_id(0);
1178 std::size_t localid =
id.get_local_id(0);
1180 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
1182 for (
auto i = globalid+total_threads; i < length; i += total_threads)
1183 scratch[localid] += access_x[i]*access_y[i];
1185 id.barrier(sycl::access::fence_space::local_space);
1192 std::size_t min = local ;
1193 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1196 if (localid < offset)
1198 scratch[localid] += scratch[localid + offset];
1200 id.barrier(sycl::access::fence_space::local_space);
1207 sum += scratch[localid] ;
1213 event = m_env->internal()->queue().submit(f0);
1218 template <
typename T>
1219 T end_map4_reduce_sum(sycl::event& event,
1220 sycl::buffer<T>& res,
1221 std::size_t num_blocks)
1224 auto h_access = res.get_host_access();
1228 template <
typename T>
1231 template <
typename T>
1234 template <
typename T>
1235 void asynch_map5_reduce_sum(sycl::buffer<T>& x,
1237 sycl::buffer<T>& res,
1240 std::size_t local = m_max_work_group_size;
1241 std::size_t total_threads = m_total_threads;
1242 std::size_t length = x.size();
1250 auto f0 = [total_threads, length, local, &x, &y,&res](sycl::handler& h)
mutable
1252 sycl::nd_range<1> range{sycl::range<1>{total_threads},
1253 sycl::range<1>{local}};
1254 auto access_x = x.template get_access<sycl::access::mode::read>(h);
1255 auto access_y = y.template get_access<sycl::access::mode::read>(h);
1256 auto access_sum = sycl::accessor { res, h, sycl::read_write, sycl::property::no_init{}};
1260 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1265 h.parallel_for<
class map5_reduction_sum_T>(range,
1266 [access_x,access_y,access_sum,scratch,local,length,total_threads] (sycl::nd_item<1> id)
1268 std::size_t globalid =
id.get_global_id(0);
1269 std::size_t localid =
id.get_local_id(0);
1271 scratch[localid] = (globalid < length)? access_x[globalid]*access_y[globalid] : 0. ;
1273 for (
auto i = globalid+total_threads; i < length; i += total_threads)
1274 scratch[localid] += access_x[i]*access_y[i];
1276 id.barrier(sycl::access::fence_space::local_space);
1280 if (globalid < length)
1283 for (std::size_t offset = local / 2; offset > 0; offset /= 2)
1286 if (localid < offset)
1288 scratch[localid] += scratch[localid + offset];
1290 id.barrier(sycl::access::fence_space::local_space);
1296 access_sum[
id.get_group(0)] = scratch[localid];
1303 auto f1 = [length, local, &res](sycl::handler& h)
mutable {
1304 sycl::nd_range<1> range{ sycl::range<1>{ local },
1305 sycl::range<1>{ local } };
1306 auto access_sum = res.template get_access<sycl::access::mode::read_write>(h);
1310 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1314 h.parallel_for<
class map5_reduction_sum1_T>(range,
1315 [access_sum, scratch, local, length](sycl::nd_item<1> id) {
1317 std::size_t globalid =
id.get_global_id(0);
1318 std::size_t localid =
id.get_local_id(0);
1325 scratch[localid] = (localid < length) ? access_sum[localid] : 0.;
1326 id.barrier(sycl::access::fence_space::local_space);
1330 if (localid < length) {
1332 std::size_t min = local;
1333 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1336 if (localid < offset) {
1337 scratch[localid] += scratch[localid + offset];
1339 id.barrier(sycl::access::fence_space::local_space);
1343 access_sum[0] = scratch[localid];
1350 event = m_env->internal()->queue().submit(f0);
1352 event = m_env->internal()->queue().submit(f1);
1358 length = (std::min(total_threads, length) + local - 1) / local;
1360 }
while (length > 1);
1371 template <
typename T>
1372 T end_map5_reduce_sum(sycl::event& event,
1373 sycl::buffer<T>& res,
1374 std::size_t num_blocks)
1377 auto h_access = res.get_host_access();
1381 template <
typename T>
1382 T reduce_sum2(
const std::vector<T>& x)
1390 auto num_groups = m_env->internal()->queue().get_device().get_info<sycl::info::device::max_compute_units>();
1392 auto work_group_size = m_env->internal()->queue().get_device().get_info<sycl::info::device::max_work_group_size>();
1394 auto total_threads = num_groups * work_group_size;
1405 auto device = m_env->internal()->queue().get_device();
1407 std::size_t local = device.get_info<sycl::info::device::max_work_group_size>();
1409 std::size_t length = x.size();
1411 sycl::buffer<T, 1> xbuf(x.data(), sycl::range<1>(x.size()));
1412 xbuf.set_final_data(
nullptr);
1420 auto round_length = round_up(length, local);
1421 std::cout <<
"LENGTH :" << level <<
" " << length <<
" " << round_length <<
" " << local << std::endl;
1423 auto f = [length,round_length,local, &xbuf](sycl::handler& h)
mutable
1427 sycl::nd_range<1> r{sycl::range<1>{round_length},
1428 sycl::range<1>{local}};
1431 auto x_access = xbuf.template get_access<sycl::access::mode::read_write>(h);
1435 sycl::local_accessor<T> scratch{sycl::range<1>(local), h};
1439 h.parallel_for<
class reduce_sum2>(r, [x_access, scratch, local, length](sycl::nd_item<1> id)
1441 std::size_t globalid =
id.get_global_id(0);
1442 std::size_t localid =
id.get_local_id(0);
1449 scratch[localid] = (globalid < length)? x_access[globalid] : 0. ;
1450 id.barrier(sycl::access::fence_space::local_space);
1455 if (globalid < length)
1459 for (std::size_t offset = min / 2; offset > 0; offset /= 2)
1462 if (localid < offset)
1464 scratch[localid] += scratch[localid + offset];
1466 id.barrier(sycl::access::fence_space::local_space);
1471 x_access[
id.get_group(0)] = scratch[localid];
1478 m_env->internal()->queue().submit(f);
1482 length = (length + local - 1) / local;
1483 std::cout <<
"AFTER LENGTH :" << level <<
" new length" << length <<
" " << local << std::endl;
1485 }
while (length > 1);
1490 auto hI = xbuf.get_host_acces();
1497 template <
typename T>
1498 T sycl_reduce_sum(sycl::buffer<T>& x,
1502 sycl::buffer<T> sum_buff{ &sum_init, 1 };
1505 m_env->internal()->queue().submit([&](sycl::handler &cgh)
1507 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1508 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
1510 sycl::accessor sum_acc {sum_buff, cgh};
1511 auto sumReduction = sycl::reduction(sum_acc, sycl::plus<T>());
1514 auto sumReduction = sycl::reduction(sum_buff, cgh, sycl::plus<T>());
1517 auto sumReduction = sycl::reduction(sum_buff, cgh, sycl::plus<T>());
1519 cgh.parallel_for(sycl::range<1>{x.size()},
1521 [=](sycl::id<1> idx,
auto &sum)
1523 sum += access_x[idx]*access_y[idx];
1528 return sum_buff.get_host_access()[0];
1718 template <
typename T>
1719 inline T dot_product_h100(sycl::buffer<T>& buf_x,
1720 sycl::buffer<T>& buf_y)
1722 using namespace sycl;
1723 auto& q = m_env->internal()->queue();
1724 const size_t N = buf_x.size();
1725 assert(buf_y.size() == N);
1727 auto dev = q.get_device();
1732 const size_t blocks_needed = (N + WG_SIZE * ITEMS_PER_WI - 1)
1733 / (WG_SIZE * ITEMS_PER_WI);
1734 const size_t num_blocks = std::max(blocks_needed,
1735 m_max_num_groups * TARGET_WAVES);
1736 const size_t total_threads = num_blocks * WG_SIZE;
1740 sycl::buffer<T> partials{num_blocks};
1746 auto ax = buf_x.template get_access<access::mode::read>(cgh);
1747 auto ay = buf_y.template get_access<access::mode::read>(cgh);
1748 auto ap = partials.template get_access<access::mode::read_write>(cgh);
1750 local_accessor<T,1> lds{WG_SIZE / WARP_SIZE, cgh};
1757 cgh.parallel_for<
class dot_h100_phase1>(
1758 nd_range<1>{{total_threads}, {WG_SIZE}},
1759 [=](nd_item<1> item)
1760 [[intel::reqd_sub_group_size(WARP_SIZE)]]
1762 h100::dot_kernel_h100(item, ax, ay, ap, n, lds);
1773 auto h_partials = partials.get_host_access();
1776 T sum = 0.0, c = 0.0;
1777 for (
size_t b = 0; b < num_blocks; ++b) {
1778 T z = h_partials[b] - c;
1786 template <
typename T>
1787 inline std::size_t asynch_dot_product_h100(sycl::buffer<T>& buf_x,
1788 sycl::buffer<T>& buf_y,
1789 sycl::buffer<T>& res,
1792 using namespace sycl;
1793 auto& q = m_env->internal()->queue();
1794 const size_t N = buf_x.size();
1795 assert(buf_y.size() == N);
1802 const size_t blocks_needed = (N + WG_SIZE * ITEMS_PER_WI - 1)
1803 / (WG_SIZE * ITEMS_PER_WI);
1804 const size_t num_blocks = std::max(blocks_needed,m_max_num_groups * TARGET_WAVES);
1805 const size_t total_threads = num_blocks * WG_SIZE;
1814 auto ax = buf_x.template get_access<access::mode::read>(cgh);
1815 auto ay = buf_y.template get_access<access::mode::read>(cgh);
1816 auto ap = res.template get_access<sycl::access::mode::read_write>(cgh);
1818 local_accessor<T,1> lds{WG_SIZE / WARP_SIZE, cgh};
1825 cgh.parallel_for<
class dot_h100_phase1>(
1826 nd_range<1>{{total_threads}, {WG_SIZE}},
1827 [=](nd_item<1> item)
1828 [[intel::reqd_sub_group_size(WARP_SIZE)]]
1830 h100::dot_kernel_h100(item, ax, ay, ap, n, lds);
1837 template <
typename T>
1838 inline T end_dot_product_h100(sycl::event& event,
1839 sycl::buffer<T>& res,
1840 std::size_t num_blocks)
1850 auto h_partials = res.get_host_access();
1853 T sum = 0.0, c = 0.0;
1854 for (
size_t b = 0; b < num_blocks; ++b) {
1855 T z = h_partials[b] - c;
1868 template<
typename T>
1869 inline T dot_product_mi300(sycl::buffer<T, 1>& x_buf,
1870 sycl::buffer<T, 1>& y_buf)
1872 using namespace mi300 ;
1873 std::cout<<
" DOT PROD MI300 : "<<WG_SIZE<<
" "<<ITEMS_PER_WI<<std::endl ;
1875 bool use_doublebuf =
false;
1876 auto& q = m_env->internal()->queue();
1907 std::size_t N = x_buf.size() ;
1909 const size_t blocks_needed = (N + WG_SIZE * ITEMS_PER_WI - 1)
1910 / (WG_SIZE * ITEMS_PER_WI);
1913 const size_t num_blocks = std::max(blocks_needed,
1914 m_max_num_groups * TARGET_WAVES
1915 * (WG_SIZE / WAVEFRONT));
1916 const size_t total_threads = num_blocks * WG_SIZE;
1919 sycl::buffer<T, 1> partials(num_blocks);
1921 q.submit([&](sycl::handler& cgh)
1923 auto ax = x_buf.template get_access<sycl::access::mode::read>(cgh);
1924 auto ay = y_buf.template get_access<sycl::access::mode::read>(cgh);
1925 auto ap = partials.template get_access<sycl::access::mode::read_write>(cgh);
1926 sycl::local_accessor<double,1> lds{N_WARPS_BLOC, cgh};
1932 const bool db = use_doublebuf;
1934 cgh.parallel_for<
class dot_mi300_phase1>(
1935 sycl::nd_range<1>{{total_threads}, {WG_SIZE}},
1939 [=](sycl::nd_item<1> item)
1940 [[intel::reqd_sub_group_size(WAVEFRONT)]]
1943 mi300::dot_kernel_mi300_doublebuf(item, ax, ay, ap, n, lds);
1945 mi300::dot_kernel_mi300(item, ax, ay, ap, n, lds);
1953 auto h_partials = partials.get_host_access();
1957 for (std::size_t b = 0; b < num_blocks; ++b)
1959 T z = h_partials[b] - c;
1967 template <
typename T>
1968 inline void asynch_dot_product_mi300(sycl::buffer<T, 1>& x_buf,
1969 sycl::buffer<T, 1>& y_buf,
1970 sycl::buffer<T, 1>& result_buf,
1973 using namespace mi300 ;
1975 std::size_t n = x_buf.size() ;
1976 auto& q = m_env->internal()->queue();
1978 const std::size_t stride = WG_SIZE * ITEMS_PER_WI;
1979 const std::size_t n_padded = ((n + stride - 1) / stride) * stride;
1981 event = q.submit([&](sycl::handler& cgh) {
1982 auto x = x_buf.template get_access<sycl::access::mode::read>(cgh);
1983 auto y = y_buf.template get_access<sycl::access::mode::read>(cgh);
1984 auto red = sycl::reduction(result_buf, cgh, sycl::plus<T>{});
1987 sycl::nd_range<1>{n_padded, WG_SIZE}, red,
1988 [=](sycl::nd_item<1> item,
auto& sum) {
1989 const std::size_t lid = item.get_local_id(0);
1990 const std::size_t base = item.get_group(0) * stride;
1993 for (std::size_t k = 0; k < ITEMS_PER_WI; ++k) {
1994 const std::size_t idx = base + lid + k * WG_SIZE;
1995 if (idx < n) acc += x[idx] * y[idx];
2004 template <
typename T>
2005 T end_dot_product_mi300(sycl::event& event,
2006 sycl::buffer<T>& res,
2007 std::size_t num_blocks)
2010 auto h_access = res.get_host_access();
2014 template <
typename T>
2015 T dot(sycl::buffer<T>& x,
2018 switch (m_dot_algo) {
2020 return reduce_sum(x, y);
2022 return map_reduce_sum(x, y);
2024 return map2_reduce_sum(x, y);
2026 return map3_reduce_sum(x, y);
2028 return dot_product_h100(x, y);
2030 return dot_product_mi300(x, y);
2032 return sycl_reduce_sum(x, y);
2036 template <
typename T>
2037 void dot(sycl::buffer<T>& x,
2044 asynch_map4_reduce_sum(x, y, res.deviceValue(), res.event());
2045 res.setWaitFunction([=](sycl::event& event, sycl::buffer<double>& res, std::size_t num_blocks)
2047 return this->end_map4_reduce_sum(event,res, num_blocks) ;
2052 std::size_t num_blocks = asynch_dot_product_h100(x, y, res.deviceValue(), res.event());
2053 res.setWaitFunction([=](sycl::event& event, sycl::buffer<double>& res, std::size_t num_blocks)
2055 return this->end_dot_product_h100(event, res, num_blocks) ;
2057 res.setNumBlocks(num_blocks) ;
2061 asynch_dot_product_mi300(x, y, res.deviceValue(), res.event());
2062 res.setWaitFunction([=](sycl::event& event, sycl::buffer<T>& res, std::size_t num_blocks)
2064 return this->end_dot_product_mi300(event,res, num_blocks) ;
2068 asynch_map5_reduce_sum(x, y, res.deviceValue(), res.event());
2069 res.setWaitFunction([=](sycl::event& event, sycl::buffer<T>& res, std::size_t num_blocks)
2071 return this->end_map5_reduce_sum(event,res, num_blocks) ;
2079 SYCLEnv* m_env = nullptr ;
2080 std::size_t m_max_num_groups = 0 ;
2081 std::size_t m_max_work_group_size = 0 ;
2082 std::size_t m_total_threads = 0 ;
2085 template <
typename T>
2086 sycl::buffer<T>& getWorkBuffer(std::size_t size);
2088 mutable sycl::buffer<double>* m_double_work =
nullptr;