Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
AcceleratorMemoryCopier.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* AcceleratorMemoryCopier.h (C) 2000-2026 */
9/* */
10/* Implementation of memory copy functions on accelerators. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ACCELERATOR_INTERNAL_ACCELERATORMEMORYCOPIER_H
13#define ARCCORE_ACCELERATOR_INTERNAL_ACCELERATORMEMORYCOPIER_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16
17#include "arccore/base/Ref.h"
18#include "arccore/base/FixedArray.h"
19#include "arccore/base/NotSupportedException.h"
20
21#include "arccore/common/accelerator/RunQueue.h"
22#include "arccore/common/internal/SpecificMemoryCopyList.h"
23
25
26/*---------------------------------------------------------------------------*/
27/*---------------------------------------------------------------------------*/
28
29namespace Arcane::Accelerator::Impl
30{
31
32using IndexedMemoryCopyArgs = Arcane::Impl::IndexedMemoryCopyArgs;
33using IndexedMultiMemoryCopyArgs = Arcane::Impl::IndexedMultiMemoryCopyArgs;
34
35/*---------------------------------------------------------------------------*/
36/*---------------------------------------------------------------------------*/
37
38template <typename DataType, typename Extent>
40: public Arcane::Impl::SpecificMemoryCopyBase<DataType, Extent>
41{
43 using BaseClass::_toTrueType;
44
45 public:
46
47 using BaseClass::m_extent;
48
49 public:
50
51 void copyFrom(const IndexedMemoryCopyArgs& args) override
52 {
53 _copyFrom(args.m_queue, args.m_indexes, _toTrueType(args.m_source), _toTrueType(args.m_destination));
54 }
55
56 void copyTo(const IndexedMemoryCopyArgs& args) override
57 {
58 _copyTo(args.m_queue, args.m_indexes, _toTrueType(args.m_source), _toTrueType(args.m_destination));
59 }
60
61 void fill(const IndexedMemoryCopyArgs& args) override
62 {
63 _fill(args.m_queue, args.m_indexes, _toTrueType(args.m_source), _toTrueType(args.m_destination));
64 }
65
66 void copyFrom(const IndexedMultiMemoryCopyArgs& args) override
67 {
68 _copyFrom(args.m_queue, args.m_indexes, args.m_multi_memory, _toTrueType(args.m_source_buffer));
69 }
70
71 void copyTo(const IndexedMultiMemoryCopyArgs& args) override
72 {
73 _copyTo(args.m_queue, args.m_indexes, args.m_const_multi_memory, _toTrueType(args.m_destination_buffer));
74 }
75
76 void fill(const IndexedMultiMemoryCopyArgs& args) override
77 {
78 _fill(args.m_queue, args.m_indexes, args.m_multi_memory, _toTrueType(args.m_source_buffer));
79 }
80
81 public:
82
83 void _copyFrom(const RunQueue* queue, SmallSpan<const Int32> indexes,
84 Span<const DataType> source, Span<DataType> destination)
85 {
87
88 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, indexes.data());
89 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, source.data());
90 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, destination.data());
91
92 Int32 nb_index = indexes.size();
93 const auto extent = m_extent;
94
95 auto command = makeCommand(queue);
96 command << RUNCOMMAND_LOOP1(iter, nb_index)
97 {
98 Int32 i = iter;
99 Int64 zindex = i * extent.size();
100 Int64 zci = indexes[i] * extent.size();
101 for (Int32 z = 0; z < extent.v; ++z)
102 destination[zindex + z] = source[zci + z];
103 };
104 }
105
106 void _copyFrom(const RunQueue* queue, SmallSpan<const Int32> indexes, SmallSpan<Span<std::byte>> multi_views,
108 {
110 if (arccoreIsCheck()) {
111 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, indexes.data());
112 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, source.data());
113 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, multi_views.data());
114 // Ideally, we should test the values of the elements of multi_views
115 // but if we do that, we can potentially perform transfers
116 // between the accelerator and the CPU.
117 }
118 const Int32 nb_index = indexes.size() / 2;
119 const auto extent = m_extent;
120
121 auto command = makeCommand(queue);
122 command << RUNCOMMAND_LOOP1(iter, nb_index)
123 {
124 auto [i] = iter();
125 Int32 index0 = indexes[i * 2];
126 Int64 index1 = indexes[(i * 2) + 1];
127 Span<std::byte> orig_view_bytes = multi_views[index0];
128 auto* orig_view_data = reinterpret_cast<DataType*>(orig_view_bytes.data());
129 // Uses a span to test array overflows but
130 // could directly use 'orig_view_data' for better performance
131 Span<DataType> orig_view = { orig_view_data, orig_view_bytes.size() / (Int64)sizeof(DataType) };
132 Int64 zci = index1 * extent.v;
133 Int64 z_index = i * extent.size();
134 for (Int32 z = 0, n = extent.v; z < n; ++z)
135 orig_view[zci + z] = source[z_index + z];
136 };
137 }
138
144 void _fill(const RunQueue* queue, SmallSpan<const Int32> indexes, Span<const DataType> source,
145 Span<DataType> destination)
146 {
148
149 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, indexes.data());
150 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, destination.data());
151 ARCCORE_CHECK_ACCESSIBLE_POINTER(eExecutionPolicy::Sequential, source.data());
152
153 Int32 nb_index = indexes.size();
154 const auto extent = m_extent;
155 constexpr Int32 max_size = 24;
156
157 // For now, we limit the size of DataType hardcoded.
158 // In the future, we should allocate on the device and deallocate at the end
159 // of execution (via cudaMallocAsync/cudaFreeAsync to manage asynchronous operations)
160 if (extent.v > max_size)
161 ARCCORE_THROW(NotSupportedException, "sizeof(type) is too big (v={0} max={1})",
162 sizeof(DataType) * extent.v, sizeof(DataType) * max_size);
164 for (Int32 z = 0; z < extent.v; ++z)
165 local_source[z] = source[z];
166 for (Int32 z = extent.v; z < max_size; ++z)
167 local_source[z] = {};
168
169 auto command = makeCommand(queue);
170 // If \a nb_index is 0, we fill all elements
171 if (nb_index == 0) {
172 Int32 nb_value = CheckedConvert::toInt32(destination.size() / extent.v);
173 command << RUNCOMMAND_LOOP1(iter, nb_value)
174 {
175 auto [i] = iter();
176 Int64 zci = i * extent.size();
177 for (Int32 z = 0; z < extent.v; ++z)
178 destination[zci + z] = local_source[z];
179 };
180 }
181 else {
182 command << RUNCOMMAND_LOOP1(iter, nb_index)
183 {
184 auto [i] = iter();
185 Int64 zci = indexes[i] * extent.size();
186 for (Int32 z = 0; z < extent.v; ++z)
187 destination[zci + z] = local_source[z];
188 };
189 }
190 }
191
192 void _fill(const RunQueue* queue, SmallSpan<const Int32> indexes, SmallSpan<Span<std::byte>> multi_views,
194 {
196
197 if (arccoreIsCheck()) {
198 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, indexes.data());
199 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(eExecutionPolicy::Sequential, source.data());
200 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, multi_views.data());
201 // Ideally, we should test the values of the elements of multi_views
202 // but if we do that, we can potentially perform transfers
203 // between the accelerator and the CPU.
204 }
205 const Int32 nb_index = indexes.size() / 2;
206 const auto extent = m_extent;
207 constexpr Int32 max_size = 24;
208
209 // For now, we limit the size of DataType hardcoded.
210 // In the future, we should allocate on the device and deallocate at the end
211 // of execution (via cudaMallocAsync/cudaFreeAsync to manage asynchronous operations)
212 if (extent.v > max_size)
213 ARCCORE_THROW(NotSupportedException, "sizeof(type) is too big (v={0} max={1})",
214 sizeof(DataType) * extent.v, sizeof(DataType) * max_size);
216 for (Int32 z = 0; z < extent.v; ++z)
217 local_source[z] = source[z];
218 for (Int32 z = extent.v; z < max_size; ++z)
219 local_source[z] = {};
220
221 if (nb_index == 0) {
222 // Fills all values of the array with the source.
223 // Since the number of elements in the second dimension depends on the first,
224 // we use a kernel per dimension.
225 RunQueue q(*queue);
227 const Int32 nb_dim1 = multi_views.size();
228 for (Int32 zz = 0; zz < nb_dim1; ++zz) {
229 Span<DataType> orig_view = Arccore::asSpan<DataType>(multi_views[zz]);
230 Int32 nb_value = CheckedConvert::toInt32(orig_view.size());
231 auto command = makeCommand(queue);
232 command << RUNCOMMAND_LOOP1(iter, nb_value)
233 {
234 auto [i] = iter();
235 orig_view[i] = local_source[i % extent.v];
236 };
237 }
238 }
239 else {
240 auto command = makeCommand(queue);
241 command << RUNCOMMAND_LOOP1(iter, nb_index)
242 {
243 auto [i] = iter();
244 Int32 index0 = indexes[i * 2];
245 Int64 index1 = indexes[(i * 2) + 1];
246 Span<std::byte> orig_view_bytes = multi_views[index0];
247 auto* orig_view_data = reinterpret_cast<DataType*>(orig_view_bytes.data());
248 // Uses a span to test array overflows but
249 // could directly use 'orig_view_data' for better performance
250 Span<DataType> orig_view = { orig_view_data, orig_view_bytes.size() / (Int64)sizeof(DataType) };
251 Int64 zci = index1 * extent.v;
252 for (Int32 z = 0, n = extent.v; z < n; ++z)
253 orig_view[zci + z] = local_source[z];
254 };
255 }
256 }
257
258 void _copyTo(const RunQueue* queue, SmallSpan<const Int32> indexes, Span<const DataType> source,
259 Span<DataType> destination)
260 {
262
263 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, indexes.data());
264 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, source.data());
265 ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, destination.data());
266
267 Int32 nb_index = indexes.size();
268 const auto extent = m_extent;
269
270 auto command = makeCommand(queue);
271 command << RUNCOMMAND_LOOP1(iter, nb_index)
272 {
273 auto [i] = iter();
274 Int64 zindex = i * extent.size();
275 Int64 zci = indexes[i] * extent.size();
276 for (Int32 z = 0; z < extent.v; ++z)
277 destination[zci + z] = source[zindex + z];
278 };
279 }
280 void _copyTo(const RunQueue* queue, SmallSpan<const Int32> indexes, SmallSpan<const Span<const std::byte>> multi_views,
281 Span<DataType> destination)
282 {
284
285 if (arccoreIsCheck()) {
286 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, indexes.data());
287 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, destination.data());
288 ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, multi_views.data());
289 // Ideally, we should test the values of the elements of multi_views
290 // but if we do that, we can potentially perform transfers
291 // between the accelerator and the CPU.
292 }
293
294 const Int32 nb_index = indexes.size() / 2;
295 const auto extent = m_extent;
296
297 auto command = makeCommand(queue);
298 command << RUNCOMMAND_LOOP1(iter, nb_index)
299 {
300 auto [i] = iter();
301 Int32 index0 = indexes[i * 2];
302 Int64 index1 = indexes[(i * 2) + 1];
303 Span<const std::byte> orig_view_bytes = multi_views[index0];
304 auto* orig_view_data = reinterpret_cast<const DataType*>(orig_view_bytes.data());
305 // Uses a span to test array overflows but
306 // could directly use 'orig_view_data' for better performance
307 Span<const DataType> orig_view = { orig_view_data, orig_view_bytes.size() / (Int64)sizeof(DataType) };
308 Int64 zci = index1 * extent.v;
309 Int64 z_index = i * extent.size();
310 for (Int32 z = 0, n = extent.v; z < n; ++z)
311 destination[z_index + z] = orig_view[zci + z];
312 };
313 }
314};
315
316/*---------------------------------------------------------------------------*/
317/*---------------------------------------------------------------------------*/
318
320{
321 public:
322
323 using InterfaceType = Arcane::Impl::ISpecificMemoryCopy;
324 template <typename DataType, typename Extent> using SpecificType = AcceleratorSpecificMemoryCopy<DataType, Extent>;
326};
327
328/*---------------------------------------------------------------------------*/
329/*---------------------------------------------------------------------------*/
330
334class AcceleratorSpecificMemoryCopyList
335: public Arcane::Impl::SpecificMemoryCopyList<AcceleratorIndexedCopyTraits>
336{
337 public:
338
339 AcceleratorSpecificMemoryCopyList();
340
345};
346
347/*---------------------------------------------------------------------------*/
348/*---------------------------------------------------------------------------*/
349
350} // namespace Arcane::Accelerator::Impl
351
352/*---------------------------------------------------------------------------*/
353/*---------------------------------------------------------------------------*/
354
355#endif
#define ARCCORE_THROW(exception_class,...)
Macro to throw an exception with formatting.
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
Types and macros for managing loops on accelerators.
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
1D loop on accelerator with additional arguments.
Management of references to a C++ class.
void _fill(const RunQueue *queue, SmallSpan< const Int32 > indexes, Span< const DataType > source, Span< DataType > destination)
Fills the values at indices specified by indexes.
Interface of a specialized memory copier for a given data size.
List of specialized ISpecificMemoryCopy instances.
View of an array of elements of type T.
Definition Span.h:805
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
Definition Span.h:539
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:327
View of an array of elements of type T.
Definition Span.h:635
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
std::int64_t Int64
Signed integer type of 64 bits.
bool arccoreIsCheck()
True if in check mode.
std::int32_t Int32
Signed integer type of 32 bits.
Span< DataType > asSpan(Span< std::byte, Extent > bytes)
Converts a Span<std::byte> into a Span<DataType>.
Definition Span.h:1126