12#ifndef ARCCORE_ACCELERATOR_SCANIMPL_H
13#define ARCCORE_ACCELERATOR_SCANIMPL_H
18#include "arccore/base/FatalErrorException.h"
20#include "arccore/common/NumArray.h"
21#include "arccore/common/accelerator/RunQueue.h"
22#include "arccore/common/accelerator/RunCommandLaunchInfo.h"
24#include "arccore/accelerator/CommonUtils.h"
30namespace Arcane::Accelerator::impl
36#if defined(ARCCORE_COMPILING_SYCL)
46template <
bool IsExclusive,
typename DataType,
typename Operator>
53 DataType _getInputValue(Int32 index)
const
55 DataType local_value = identity;
56 if constexpr (IsExclusive) {
58 local_value = init_value;
60 local_value = ((index - 1) < nb_value) ? input_values[index - 1] : identity;
63 local_value = (index < nb_value) ? input_values[index] : identity;
69 SmallSpan<const DataType> input_values;
70 DataType identity = {};
71 DataType init_value = {};
77 void doScan(RunQueue& rq, SmallSpan<const DataType> input, SmallSpan<DataType> output, DataType init_value)
79 DataType identity = Operator::defaultValue();
80 sycl::queue q = Impl::SyclUtils::toNativeStream(&rq);
82 NumArray<DataType, MDDim1> tmp;
84 NumArray<DataType, MDDim1> tmp2;
85 Int32 nb_item = input.size();
86 Int32 block_size = 256;
87 Int32 nb_block = (nb_item / block_size);
88 if ((nb_item % block_size) != 0)
91 tmp2.resize(nb_block);
93 input_info.nb_value = nb_item;
94 input_info.init_value = init_value;
95 input_info.identity = identity;
96 input_info.input_values = input;
98 std::cout <<
"DO_SCAN nb_item=" << nb_item <<
" nb_block=" << nb_block <<
"\n";
99 doscan1(q, input_info, tmp.to1DSpan(), nb_item, block_size);
101 for (
int i = 0; i < nb_block; ++i)
102 std::cout <<
"DO_SCAN_X1 i=" << i <<
" tmp[i]=" << tmp[i] <<
"\n";
103 doscan2(q, tmp.to1DSpan(), nb_block, block_size, identity);
105 for (
int i = 0; i < nb_block; ++i)
106 std::cout <<
"DO_SCAN_X2 i=" << i <<
" tmp[i]=" << tmp[i] <<
"\n";
107 doscan2_bis(q, tmp.to1DSpan(), tmp2.to1DSpan(), nb_block, block_size, identity);
109 for (
int i = 0; i < nb_block; ++i)
110 std::cout <<
"DO_SCAN_X2_BIS i=" << i <<
" tmp[i]=" << tmp[i] <<
" tmp2[i]=" << tmp2[i] <<
"\n";
111 doscan3(q, input_info, output, tmp2, nb_item, block_size);
116 void doscan1(sycl::queue& q,
const InputInfo& input_info, Span<DataType> tmp,
117 int nb_value,
int block_size)
120 std::cout <<
"DO_SCAN1 nb_value=" << nb_value <<
" L=" << block_size <<
"\n";
123 q.submit([&](sycl::handler& h) {
124 auto local = sycl::local_accessor<DataType, 1>(block_size, h);
125 h.parallel_for(_getNDRange(nb_value, block_size), [=](sycl::nd_item<1> it) {
126 const int i =
static_cast<int>(it.get_global_id(0));
127 const int li =
static_cast<int>(it.get_local_id(0));
128 const int gid =
static_cast<int>(it.get_group(0));
129 const int local_range0 =
static_cast<int>(it.get_local_range()[0]);
131 DataType local_value = input_info._getInputValue(i);
132 local[li] = sycl::inclusive_scan_over_group(it.get_group(), local_value, scan_op.syclFunctor());
134 if (li == local_range0 - 1)
135 tmp[gid] = local[li];
141 void doscan2(sycl::queue& q, Span<DataType> tmp,
int nb_block,
const int block_size, DataType identity)
144 std::cout <<
"DO_SCAN2 nb_block=" << nb_block <<
" block_size=" << block_size <<
"\n";
147 q.submit([&](sycl::handler& h) {
148 auto local = sycl::local_accessor<DataType, 1>(block_size, h);
149 h.parallel_for(_getNDRange(nb_block, block_size), [=](sycl::nd_item<1> it) {
150 int i =
static_cast<int>(it.get_global_id(0));
151 int li =
static_cast<int>(it.get_local_id(0));
153 DataType local_value = (i < nb_block) ? tmp[i] : identity;
154 local[li] = sycl::inclusive_scan_over_group(it.get_group(), local_value, scan_op.syclFunctor());
163 void doscan2_bis(sycl::queue& q, Span<const DataType> tmp, Span<DataType> tmp2,
int nb_block,
int block_size, DataType identity)
166 std::cout <<
"DO_SCAN2_bis nb_block=" << nb_block <<
" L=" << block_size <<
"\n";
168 q.parallel_for(_getNDRange(nb_block, block_size), [=](sycl::nd_item<1> it) {
169 const int g =
static_cast<int>(it.get_group(0));
170 const int i =
static_cast<int>(it.get_global_id(0));
172 DataType init_value = identity;
173 for (
int j = 1; j <= g; ++j)
174 init_value = scan_op(init_value, tmp[(j * block_size) - 1]);
175 tmp2[i] = scan_op(init_value, tmp[i]);
181 void doscan3(sycl::queue& q,
const InputInfo& input_info, SmallSpan<DataType> output, SmallSpan<DataType> tmp2,
int nb_value,
int block_size)
184 std::cout <<
"DO_SCAN3 nb_value=" << nb_value <<
" L=" << block_size <<
"\n";
187 q.parallel_for(_getNDRange(nb_value, block_size), [=](sycl::nd_item<1> it) {
188 const int i =
static_cast<int>(it.get_global_id(0));
189 const int g =
static_cast<int>(it.get_group(0));
190 DataType local_value = input_info._getInputValue(i);
191 DataType output_value = sycl::inclusive_scan_over_group(it.get_group(), local_value, scan_op.syclFunctor());
193 output[i] = (g > 0) ? scan_op(output_value, tmp2[g - 1]) : output_value;
201 bool m_is_verbose =
false;
206 sycl::nd_range<1> _getNDRange(Int32 nb_value, Int32 block_size)
208 int x = nb_value / block_size;
209 if ((nb_value % block_size) != 0)
212 return sycl::nd_range<1>(x, block_size);
Types et macros pour gérer les boucles sur les accélérateurs.
Types et fonctions associés aux classes ArrayView et ConstArrayView.