Arcane  4.1.12.0
User documentation
Loading...
Searching...
No Matches
MultiThreadAlgo.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* MultiThreadAlgo.h (C) 2000-2026 */
9/* */
10/* Implementation of accelerator algorithms in multi-thread mode. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ACCELERATOR_MULTITHREADALGO_H
13#define ARCCORE_ACCELERATOR_MULTITHREADALGO_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "arccore/common/SmallArray.h"
18
19#include "arccore/base/ForLoopRunInfo.h"
20#include "arccore/concurrency/ParallelFor.h"
21
22#include "arccore/accelerator/AcceleratorGlobal.h"
23
24/*---------------------------------------------------------------------------*/
25/*---------------------------------------------------------------------------*/
26
27namespace Arcane::Accelerator::Impl
28{
29
30/*---------------------------------------------------------------------------*/
31/*---------------------------------------------------------------------------*/
32
33/*!
34 * \brief Advanced algorithms in multi-thread mode.
35 *
36 * Currently, only the Scan operation is implemented.
37 */
39{
40 public:
41
42 /*!
43 * \brief Multi-thread scan algorithm.
44 *
45 * \note This class is internal to Arcane. The public API version
46 * is accessible via the GenericScanner class.
47 *
48 * This basic algorithm uses two passes for calculation.
49 * The iteration interval is divided into N blocks. We take N = 2*nb_thread.
50 * - the first pass calculates the scan result in parallel for all
51 * elements of a block.
52 * - the second pass calculates the final value.
53 *
54 * The calculation always yields the same value for a given number of blocks.
55 *
56 * TODO: Use padding to maintain partial values per block.
57 * TODO: Create specialized versions if DataType is a base type
58 * such as 'Int32', 'Int64', 'float', or 'double'.
59 */
60 template <bool IsExclusive, typename DataType, typename Operator,
61 typename InputIterator, typename OutputIterator>
62 void doScan(ForLoopRunInfo run_info, Int32 nb_value,
63 InputIterator input, OutputIterator output,
64 DataType init_value, Operator op)
65 {
66 //std::cout << "DO_SCAN MULTI_THREAD nb_value=" << nb_value << " init_value=" << init_value << "\n";
67 auto multiple_getter_func = [=](Int32 input_index, Int32 nb_value) -> DataType {
68 DataType partial_value = Operator::defaultValue();
69 for (Int32 x = 0; x < nb_value; ++x)
70 partial_value = op(input[x + input_index], partial_value);
71 return partial_value;
72 };
73
74 auto multiple_setter_func = [=](DataType previous_sum, Int32 input_index, Int32 nb_value) {
75 for (Int32 x = 0; x < nb_value; ++x) {
76 if constexpr (IsExclusive) {
77 output[x + input_index] = previous_sum;
78 previous_sum = op(input[x + input_index], previous_sum);
79 }
80 else {
81 previous_sum = op(input[x + input_index], previous_sum);
82 output[x + input_index] = previous_sum;
83 }
84 }
85 };
86 // TODO: automatically calculate this value.
87 const Int32 nb_block = 10;
88
89 // Array to store partial values of the blocks.
90 // TODO: Use padding to avoid cache conflicts between threads.
91 SmallArray<DataType> partial_values(nb_block);
92 Span<DataType> out_partial_values = partial_values;
93
94 auto partial_value_func = [=](Int32 a, Int32 n) {
95 for (Int32 i = 0; i < n; ++i) {
96 Int32 interval_index = i + a;
97
98 Int32 input_index = 0;
99 Int32 nb_value_in_interval = 0;
100 _subInterval<Int32>(nb_value, interval_index, nb_block, &input_index, &nb_value_in_interval);
101
102 DataType partial_value = multiple_getter_func(input_index, nb_value_in_interval);
103
104 out_partial_values[interval_index] = partial_value;
105 }
106 };
107
108 ParallelLoopOptions loop_options(run_info.options().value_or(ParallelLoopOptions{}));
109 loop_options.setGrainSize(1);
110 run_info.addOptions(loop_options);
111
112 // Calculates partial sums for nb_block
113 Arcane::arccoreParallelFor(0, nb_block, run_info, partial_value_func);
114
115 auto final_sum_func = [=](Int32 a, Int32 n) {
116 for (Int32 i = 0; i < n; ++i) {
117 Int32 interval_index = i + a;
118
119 DataType previous_sum = init_value;
120 for (Int32 z = 0; z < interval_index; ++z)
121 previous_sum = op(out_partial_values[z], previous_sum);
122
123 Int32 input_index = 0;
124 Int32 nb_value_in_interval = 0;
125 _subInterval<Int32>(nb_value, interval_index, nb_block, &input_index, &nb_value_in_interval);
126
127 multiple_setter_func(previous_sum, input_index, nb_value_in_interval);
128 }
129 };
130
131 // Calculates the final values
132 Arcane::arccoreParallelFor(0, nb_block, run_info, final_sum_func);
133 }
134
135 template <bool InPlace, typename InputIterator, typename OutputIterator, typename SelectLambda>
136 Int32 doFilter(ForLoopRunInfo run_info, Int32 nb_value,
137 InputIterator input, OutputIterator output,
138 SelectLambda select_lambda)
139 {
140 // Index type.
141 using IndexType = Int32;
142
143 UniqueArray<bool> select_flags(nb_value);
144 Span<bool> select_flags_view = select_flags;
145 //std::cout << "DO_FILTER MULTI_THREAD nb_value=" << nb_value << "\n";
146 auto multiple_getter_func = [=](Int32 input_index, Int32 nb_value) -> IndexType {
147 IndexType partial_value = 0;
148 for (Int32 x = 0; x < nb_value; ++x) {
149 const Int32 index = x + input_index;
150 bool is_select = select_lambda(input[index]);
151 select_flags_view[index] = is_select;
152 if (is_select)
153 ++partial_value;
154 }
155 return partial_value;
156 };
157
158 auto multiple_setter_func = [=](IndexType partial_value, Int32 input_index, Int32 nb_value) {
159 for (Int32 x = 0; x < nb_value; ++x) {
160 const Int32 index = x + input_index;
161 if (select_flags_view[index]) {
162 output[partial_value] = input[index];
163 ++partial_value;
164 }
165 }
166 };
167
168 // TODO: automatically calculate this value.
169 const Int32 nb_block = 10;
170
171 // Array to store partial values of the blocks.
172 // TODO: Use padding to avoid cache conflicts between threads.
173 SmallArray<Int32> partial_values(nb_block, 0);
174 Span<Int32> out_partial_values = partial_values;
175
176 auto partial_value_func = [=](Int32 a, Int32 n) {
177 for (Int32 i = 0; i < n; ++i) {
178 Int32 interval_index = i + a;
179
180 Int32 input_index = 0;
181 Int32 nb_value_in_interval = 0;
182 _subInterval<Int32>(nb_value, interval_index, nb_block, &input_index, &nb_value_in_interval);
183
184 out_partial_values[interval_index] = multiple_getter_func(input_index, nb_value_in_interval);
185 }
186 };
187
188 ParallelLoopOptions loop_options(run_info.options().value_or(ParallelLoopOptions{}));
189 loop_options.setGrainSize(1);
190 run_info.addOptions(loop_options);
191
192 // Calculates partial sums for nb_block
193 Arcane::arccoreParallelFor(0, nb_block, run_info, partial_value_func);
194
195 // Calculates the number of filtered values
196 // Also calculates the accumulated value of partial_values
197 Int32 nb_filter = 0;
198 for (Int32 i = 0; i < nb_block; ++i) {
199 Int32 x = partial_values[i];
200 nb_filter += x;
201 partial_values[i] = nb_filter;
202 }
203
204 auto filter_func = [=](Int32 a, Int32 n) {
205 for (Int32 i = 0; i < n; ++i) {
206 Int32 interval_index = i + a;
207
208 IndexType partial_value = 0;
209 if (interval_index > 0)
210 partial_value = out_partial_values[interval_index - 1];
211
212 Int32 input_index = 0;
213 Int32 nb_value_in_interval = 0;
214 _subInterval<Int32>(nb_value, interval_index, nb_block, &input_index, &nb_value_in_interval);
215
216 multiple_setter_func(partial_value, input_index, nb_value_in_interval);
217 }
218 };
219
220 // If the input and output are the same, the filling is done sequentially.
221 // TODO: do it in parallel.
222 if (InPlace)
223 filter_func(0, nb_block);
224 else
225 Arcane::arccoreParallelFor(0, nb_block, run_info, filter_func);
226
227 return nb_filter;
228 }
229
230 private:
231
232 template <typename SizeType>
233 static void _subInterval(SizeType size, SizeType interval_index, SizeType nb_interval,
234 SizeType* out_begin_index, SizeType* out_interval_size)
235 {
236 *out_begin_index = 0;
237 *out_interval_size = 0;
238 if (nb_interval <= 0)
239 return;
240 if (interval_index < 0 || interval_index >= nb_interval)
241 return;
242 SizeType isize = size / nb_interval;
243 SizeType ibegin = interval_index * isize;
244 // For the last interval, take the remaining elements
245 if ((interval_index + 1) == nb_interval)
246 isize = size - ibegin;
247 *out_begin_index = ibegin;
248 *out_interval_size = isize;
249 }
250};
251
252/*---------------------------------------------------------------------------*/
253/*---------------------------------------------------------------------------*/
254
255} // namespace Arcane::Accelerator::Impl
256
257/*---------------------------------------------------------------------------*/
258/*---------------------------------------------------------------------------*/
259
260#endif
261
262/*---------------------------------------------------------------------------*/
263/*---------------------------------------------------------------------------*/
Advanced algorithms in multi-thread mode.
void doScan(ForLoopRunInfo run_info, Int32 nb_value, InputIterator input, OutputIterator output, DataType init_value, Operator op)
Multi-thread scan algorithm.
Loop execution information.
Execution options for a parallel loop in multi-threading.
void setGrainSize(Integer v)
Sets the size (approximate) of an iteration interval.
1D data array with pre-allocated stack buffer.
View of an array of elements of type T.
Definition Span.h:635
1D data vector with value semantics (STL style).
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applies the lambda function lambda_function concurrently over the iteration interval given by loop_ra...
Definition ParallelFor.h:87
std::int32_t Int32
Signed integer type of 32 bits.