da/dc0/AcceleratorMemoryCopier_8h_source.html

// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-

//-----------------------------------------------------------------------------

// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)

// See the top-level COPYRIGHT file for details.

// SPDX-License-Identifier: Apache-2.0

//-----------------------------------------------------------------------------

/*---------------------------------------------------------------------------*/

/* AcceleratorMemoryCopier.h                                   (C) 2000-2026 */

/*                                                                           */

/* Implementation of memory copy functions on accelerators.                  */

/*---------------------------------------------------------------------------*/

#ifndef ARCCORE_ACCELERATOR_INTERNAL_ACCELERATORMEMORYCOPIER_H

#define ARCCORE_ACCELERATOR_INTERNAL_ACCELERATORMEMORYCOPIER_H

/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#include "arccore/base/Ref.h"

#include "arccore/base/FixedArray.h"

#include "arccore/base/NotSupportedException.h"


#include "arccore/common/accelerator/RunQueue.h"

#include "arccore/common/internal/SpecificMemoryCopyList.h"


#include "arccore/accelerator/RunCommandLoop.h"


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


namespace Arcane::Accelerator::Impl

{


// Local shorthands for the argument structures declared in Arcane::Impl.
// They are the parameter types of the copyFrom()/copyTo()/fill() overrides below.
using IndexedMemoryCopyArgs = Arcane::Impl::IndexedMemoryCopyArgs;

using IndexedMultiMemoryCopyArgs = Arcane::Impl::IndexedMultiMemoryCopyArgs;


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*!
 * \brief Indexed memory copier whose operations are executed as 1D loops
 * submitted to a RunQueue.
 *
 * An "element" is a group of `m_extent.v` consecutive values of type
 * \a DataType. Every operation addresses whole elements: the element of
 * rank `k` in a buffer starts at offset `k * m_extent.size()`.
 *
 * Two families of operations are provided:
 * - the "indexed" ones (IndexedMemoryCopyArgs) use one index per element
 *   to address a single contiguous buffer;
 * - the "multi" ones (IndexedMultiMemoryCopyArgs) use a pair of indexes
 *   per element: the first one selects a view in a list of byte views, the
 *   second one selects the element inside that view.
 *
 * All element offsets are computed with 64-bit arithmetic so that large
 * buffers cannot overflow a 32-bit intermediate product.
 */
template <typename DataType, typename Extent>
class AcceleratorSpecificMemoryCopy
: public Arcane::Impl::SpecificMemoryCopyBase<DataType, Extent>
{
  using BaseClass = Arcane::Impl::SpecificMemoryCopyBase<DataType, Extent>;
  using BaseClass::_toTrueType;

 public:

  using BaseClass::m_extent;

 public:

  //! Indexed gather: see _copyFrom(queue, indexes, source, destination).
  void copyFrom(const IndexedMemoryCopyArgs& args) override
  {
    _copyFrom(args.m_queue, args.m_indexes, _toTrueType(args.m_source), _toTrueType(args.m_destination));
  }

  //! Indexed scatter: see _copyTo(queue, indexes, source, destination).
  void copyTo(const IndexedMemoryCopyArgs& args) override
  {
    _copyTo(args.m_queue, args.m_indexes, _toTrueType(args.m_source), _toTrueType(args.m_destination));
  }

  //! Indexed fill: see _fill(queue, indexes, source, destination).
  void fill(const IndexedMemoryCopyArgs& args) override
  {
    _fill(args.m_queue, args.m_indexes, _toTrueType(args.m_source), _toTrueType(args.m_destination));
  }

  //! Multi-view scatter: see _copyFrom(queue, indexes, multi_views, source).
  void copyFrom(const IndexedMultiMemoryCopyArgs& args) override
  {
    _copyFrom(args.m_queue, args.m_indexes, args.m_multi_memory, _toTrueType(args.m_source_buffer));
  }

  //! Multi-view gather: see _copyTo(queue, indexes, multi_views, destination).
  void copyTo(const IndexedMultiMemoryCopyArgs& args) override
  {
    _copyTo(args.m_queue, args.m_indexes, args.m_const_multi_memory, _toTrueType(args.m_destination_buffer));
  }

  //! Multi-view fill: see _fill(queue, indexes, multi_views, source).
  void fill(const IndexedMultiMemoryCopyArgs& args) override
  {
    _fill(args.m_queue, args.m_indexes, args.m_multi_memory, _toTrueType(args.m_source_buffer));
  }

 public:

  // NOTE(review): the helpers below are public although their names start
  // with '_'. Presumably this is required because they contain accelerator
  // lambdas (nvcc rejects extended lambdas inside private/protected member
  // functions) -- confirm before changing their access.

  //! Maximum number of values per element supported by the fill operations.
  static constexpr Int32 MAX_FILL_EXTENT = 24;

  /*!
   * \brief Builds the fixed-size value captured by the fill kernels.
   *
   * Copies the first `m_extent.v` values of \a source (read on the host)
   * into a FixedArray and value-initializes the remaining entries.
   *
   * For now the maximum size is hard-coded. In the future, we should
   * allocate on the device and deallocate at the end of execution (via
   * cudaMallocAsync/cudaFreeAsync to manage asynchronous operations).
   *
   * \throws NotSupportedException if `m_extent.v` is greater than
   * MAX_FILL_EXTENT.
   */
  FixedArray<DataType, MAX_FILL_EXTENT> _makeFillPattern(Span<const DataType> source) const
  {
    const auto extent = m_extent;
    if (extent.v > MAX_FILL_EXTENT)
      ARCCORE_THROW(NotSupportedException, "sizeof(type) is too big (v={0} max={1})",
                    sizeof(DataType) * extent.v, sizeof(DataType) * MAX_FILL_EXTENT);
    FixedArray<DataType, MAX_FILL_EXTENT> pattern;
    for (Int32 z = 0; z < extent.v; ++z)
      pattern[z] = source[z];
    for (Int32 z = extent.v; z < MAX_FILL_EXTENT; ++z)
      pattern[z] = {};
    return pattern;
  }

  /*!
   * \brief Gathers indexed elements of \a source into \a destination.
   *
   * For each `i` in `[0, indexes.size())`, element `i` of \a destination
   * receives element `indexes[i]` of \a source.
   *
   * \a queue must not be null. The three buffers are checked for
   * accessibility from \a queue.
   */
  void _copyFrom(const RunQueue* queue, SmallSpan<const Int32> indexes,
                 Span<const DataType> source, Span<DataType> destination)
  {
    ARCCORE_CHECK_POINTER(queue);

    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, indexes.data());
    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, source.data());
    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, destination.data());

    Int32 nb_index = indexes.size();
    // Local copy used inside the kernel (presumably so that the lambda
    // captures the value and not 'this').
    const auto extent = m_extent;

    auto command = makeCommand(queue);
    command << RUNCOMMAND_LOOP1(iter, nb_index)
    {
      auto [i] = iter();
      // Widen before multiplying so the offset is computed on 64 bits.
      const Int64 zindex = static_cast<Int64>(i) * extent.size();
      const Int64 zci = static_cast<Int64>(indexes[i]) * extent.size();
      for (Int32 z = 0; z < extent.v; ++z)
        destination[zindex + z] = source[zci + z];
    };
  }

  /*!
   * \brief Scatters the elements of \a source into the views of \a multi_views.
   *
   * \a indexes holds `indexes.size() / 2` pairs `(index0, index1)`. For the
   * pair of rank `i`, element `i` of \a source is written to element
   * `index1` of the view `multi_views[index0]` (the byte view being
   * reinterpreted as an array of \a DataType).
   */
  void _copyFrom(const RunQueue* queue, SmallSpan<const Int32> indexes, SmallSpan<Span<std::byte>> multi_views,
                 Span<const DataType> source)
  {
    ARCCORE_CHECK_POINTER(queue);
    if (arccoreIsCheck()) {
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, indexes.data());
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, source.data());
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, multi_views.data());
      // Ideally, we should test the values of the elements of multi_views
      // but if we do that, we can potentially perform transfers
      // between the accelerator and the CPU.
    }
    const Int32 nb_index = indexes.size() / 2;
    const auto extent = m_extent;

    auto command = makeCommand(queue);
    command << RUNCOMMAND_LOOP1(iter, nb_index)
    {
      auto [i] = iter();
      Int32 index0 = indexes[i * 2];
      Int64 index1 = indexes[(i * 2) + 1];
      Span<std::byte> orig_view_bytes = multi_views[index0];
      auto* orig_view_data = reinterpret_cast<DataType*>(orig_view_bytes.data());
      // Uses a span to test array overflows but
      // could directly use 'orig_view_data' for better performance
      Span<DataType> orig_view = { orig_view_data, orig_view_bytes.size() / static_cast<Int64>(sizeof(DataType)) };
      const Int64 zci = index1 * extent.v;
      const Int64 z_index = static_cast<Int64>(i) * extent.size();
      for (Int32 z = 0, n = extent.v; z < n; ++z)
        orig_view[zci + z] = source[z_index + z];
    };
  }

  /*!
   * \brief Fills elements of \a destination with the element stored in \a source.
   *
   * The first `m_extent.v` values of \a source are read on the host (hence
   * the accessibility check against eExecutionPolicy::Sequential) and
   * captured by value by the kernel.
   *
   * If \a indexes is empty, the `destination.size() / m_extent.v` elements
   * of \a destination are all filled. Otherwise only the elements whose
   * rank is listed in \a indexes are filled.
   *
   * \throws NotSupportedException if `m_extent.v` is greater than
   * MAX_FILL_EXTENT.
   */
  void _fill(const RunQueue* queue, SmallSpan<const Int32> indexes, Span<const DataType> source,
             Span<DataType> destination)
  {
    ARCCORE_CHECK_POINTER(queue);

    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, indexes.data());
    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, destination.data());
    ARCCORE_CHECK_ACCESSIBLE_POINTER(eExecutionPolicy::Sequential, source.data());

    Int32 nb_index = indexes.size();
    const auto extent = m_extent;

    FixedArray<DataType, MAX_FILL_EXTENT> local_source = _makeFillPattern(source);

    auto command = makeCommand(queue);
    // If \a nb_index is 0, we fill all elements
    if (nb_index == 0) {
      Int32 nb_value = CheckedConvert::toInt32(destination.size() / extent.v);
      command << RUNCOMMAND_LOOP1(iter, nb_value)
      {
        auto [i] = iter();
        const Int64 zci = static_cast<Int64>(i) * extent.size();
        for (Int32 z = 0; z < extent.v; ++z)
          destination[zci + z] = local_source[z];
      };
    }
    else {
      command << RUNCOMMAND_LOOP1(iter, nb_index)
      {
        auto [i] = iter();
        const Int64 zci = static_cast<Int64>(indexes[i]) * extent.size();
        for (Int32 z = 0; z < extent.v; ++z)
          destination[zci + z] = local_source[z];
      };
    }
  }

  /*!
   * \brief Fills elements of the views of \a multi_views with the element
   * stored in \a source.
   *
   * The first `m_extent.v` values of \a source are read on the host and
   * captured by value by the kernels.
   *
   * If \a indexes holds no pair, every value of every view is filled (one
   * kernel per view). Otherwise \a indexes holds `indexes.size() / 2` pairs
   * `(index0, index1)` and, for each pair, element `index1` of the view
   * `multi_views[index0]` is filled.
   *
   * \throws NotSupportedException if `m_extent.v` is greater than
   * MAX_FILL_EXTENT.
   */
  void _fill(const RunQueue* queue, SmallSpan<const Int32> indexes, SmallSpan<Span<std::byte>> multi_views,
             Span<const DataType> source)
  {
    ARCCORE_CHECK_POINTER(queue);

    if (arccoreIsCheck()) {
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, indexes.data());
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(eExecutionPolicy::Sequential, source.data());
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, multi_views.data());
      // Ideally, we should test the values of the elements of multi_views
      // but if we do that, we can potentially perform transfers
      // between the accelerator and the CPU.
    }
    const Int32 nb_index = indexes.size() / 2;
    const auto extent = m_extent;

    FixedArray<DataType, MAX_FILL_EXTENT> local_source = _makeFillPattern(source);

    if (nb_index == 0) {
      // Fills all values of the array with the source.
      // Since the number of elements in the second dimension depends on the first,
      // we use a kernel per dimension.
      // NOTE(review): ScopedAsync is applied to the copy 'q' while the
      // commands below are created from 'queue'. This presumably relies on
      // 'q' sharing its state with '*queue' -- confirm.
      RunQueue q(*queue);
      RunQueue::ScopedAsync sc(&q);
      const Int32 nb_dim1 = multi_views.size();
      for (Int32 zz = 0; zz < nb_dim1; ++zz) {
        // The view descriptor 'multi_views[zz]' is read here, outside of any kernel.
        Span<DataType> orig_view = Arccore::asSpan<DataType>(multi_views[zz]);
        Int32 nb_value = CheckedConvert::toInt32(orig_view.size());
        auto command = makeCommand(queue);
        command << RUNCOMMAND_LOOP1(iter, nb_value)
        {
          auto [i] = iter();
          // One loop iteration per value (not per element): the component
          // inside the element is recovered with the modulo.
          orig_view[i] = local_source[i % extent.v];
        };
      }
    }
    else {
      auto command = makeCommand(queue);
      command << RUNCOMMAND_LOOP1(iter, nb_index)
      {
        auto [i] = iter();
        Int32 index0 = indexes[i * 2];
        Int64 index1 = indexes[(i * 2) + 1];
        Span<std::byte> orig_view_bytes = multi_views[index0];
        auto* orig_view_data = reinterpret_cast<DataType*>(orig_view_bytes.data());
        // Uses a span to test array overflows but
        // could directly use 'orig_view_data' for better performance
        Span<DataType> orig_view = { orig_view_data, orig_view_bytes.size() / static_cast<Int64>(sizeof(DataType)) };
        const Int64 zci = index1 * extent.v;
        for (Int32 z = 0, n = extent.v; z < n; ++z)
          orig_view[zci + z] = local_source[z];
      };
    }
  }

  /*!
   * \brief Scatters the elements of \a source into \a destination.
   *
   * For each `i` in `[0, indexes.size())`, element `indexes[i]` of
   * \a destination receives element `i` of \a source.
   *
   * \a queue must not be null. The three buffers are checked for
   * accessibility from \a queue.
   */
  void _copyTo(const RunQueue* queue, SmallSpan<const Int32> indexes, Span<const DataType> source,
               Span<DataType> destination)
  {
    ARCCORE_CHECK_POINTER(queue);

    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, indexes.data());
    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, source.data());
    ARCCORE_CHECK_ACCESSIBLE_POINTER(queue, destination.data());

    Int32 nb_index = indexes.size();
    const auto extent = m_extent;

    auto command = makeCommand(queue);
    command << RUNCOMMAND_LOOP1(iter, nb_index)
    {
      auto [i] = iter();
      // Widen before multiplying so the offset is computed on 64 bits.
      const Int64 zindex = static_cast<Int64>(i) * extent.size();
      const Int64 zci = static_cast<Int64>(indexes[i]) * extent.size();
      for (Int32 z = 0; z < extent.v; ++z)
        destination[zci + z] = source[zindex + z];
    };
  }

  /*!
   * \brief Gathers elements of the views of \a multi_views into \a destination.
   *
   * \a indexes holds `indexes.size() / 2` pairs `(index0, index1)`. For the
   * pair of rank `i`, element `i` of \a destination receives element
   * `index1` of the view `multi_views[index0]` (the byte view being
   * reinterpreted as an array of \a DataType).
   */
  void _copyTo(const RunQueue* queue, SmallSpan<const Int32> indexes, SmallSpan<const Span<const std::byte>> multi_views,
               Span<DataType> destination)
  {
    ARCCORE_CHECK_POINTER(queue);

    if (arccoreIsCheck()) {
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, indexes.data());
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, destination.data());
      ARCCORE_CHECK_ACCESSIBLE_POINTER_ALWAYS(queue, multi_views.data());
      // Ideally, we should test the values of the elements of multi_views
      // but if we do that, we can potentially perform transfers
      // between the accelerator and the CPU.
    }

    const Int32 nb_index = indexes.size() / 2;
    const auto extent = m_extent;

    auto command = makeCommand(queue);
    command << RUNCOMMAND_LOOP1(iter, nb_index)
    {
      auto [i] = iter();
      Int32 index0 = indexes[i * 2];
      Int64 index1 = indexes[(i * 2) + 1];
      Span<const std::byte> orig_view_bytes = multi_views[index0];
      auto* orig_view_data = reinterpret_cast<const DataType*>(orig_view_bytes.data());
      // Uses a span to test array overflows but
      // could directly use 'orig_view_data' for better performance
      Span<const DataType> orig_view = { orig_view_data, orig_view_bytes.size() / static_cast<Int64>(sizeof(DataType)) };
      const Int64 zci = index1 * extent.v;
      const Int64 z_index = static_cast<Int64>(i) * extent.size();
      for (Int32 z = 0, n = extent.v; z < n; ++z)
        destination[z_index + z] = orig_view[zci + z];
    };
  }
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Traits class grouping the types used to instantiate the generic copier
// list for the accelerator implementation. It is the template argument of
// Arcane::Impl::SpecificMemoryCopyList (see AcceleratorSpecificMemoryCopyList)
// and of Arcane::Impl::SpecificMemoryCopyRef.
class AcceleratorIndexedCopyTraits

{

 public:


  // Interface type of a copier.
  using InterfaceType = Arcane::Impl::ISpecificMemoryCopy;

  // Concrete copier type for a given (DataType, Extent) pair.
  template <typename DataType, typename Extent> using SpecificType = AcceleratorSpecificMemoryCopy<DataType, Extent>;

  // Reference-to-copier type parameterized by these traits.
  using RefType = Arcane::Impl::SpecificMemoryCopyRef<AcceleratorIndexedCopyTraits>;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Copier list specialized with AcceleratorIndexedCopyTraits, i.e. whose
// copiers are AcceleratorSpecificMemoryCopy instances.
class AcceleratorSpecificMemoryCopyList

: public Arcane::Impl::SpecificMemoryCopyList<AcceleratorIndexedCopyTraits>

{

 public:


  // Defined outside of this header.
  AcceleratorSpecificMemoryCopyList();


  // The four methods below are only declared here; according to the
  // generated documentation each one is defined in its own source file
  // (MemoryCopierTpl1.cc ... MemoryCopierTpl4.cc).
  // NOTE(review): presumably each one registers a subset of the explicit
  // copier instantiations so that they are compiled in separate translation
  // units -- confirm against those files.
  void addExplicitTemplate1();

  void addExplicitTemplate2();

  void addExplicitTemplate3();

  void addExplicitTemplate4();

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


} // namespace Arcane::Accelerator::Impl


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#endif

ARCCORE_THROW
#define ARCCORE_THROW(exception_class,...)
Macro to throw an exception with formatting.
Definition ArccoreGlobal.h:524

ARCCORE_CHECK_POINTER
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
Definition ArccoreGlobal.h:797

RunCommandLoop.h
Types and macros for managing loops on accelerators.

RUNCOMMAND_LOOP1
#define RUNCOMMAND_LOOP1(iter_name, x1,...)
1D loop on accelerator with additional arguments.
Definition arccore/src/accelerator/arccore/accelerator/RunCommandLoop.h:474

Ref.h
Management of references to a C++ class.

Arcane::Accelerator::Impl::AcceleratorIndexedCopyTraits
Definition AcceleratorMemoryCopier.h:320

Arcane::Accelerator::Impl::AcceleratorSpecificMemoryCopyList::addExplicitTemplate3
void addExplicitTemplate3()
Definition MemoryCopierTpl3.cc:23

Arcane::Accelerator::Impl::AcceleratorSpecificMemoryCopyList::addExplicitTemplate1
void addExplicitTemplate1()
Definition MemoryCopierTpl1.cc:23

Arcane::Accelerator::Impl::AcceleratorSpecificMemoryCopyList::addExplicitTemplate4
void addExplicitTemplate4()
Definition MemoryCopierTpl4.cc:23

Arcane::Accelerator::Impl::AcceleratorSpecificMemoryCopyList::addExplicitTemplate2
void addExplicitTemplate2()
Definition MemoryCopierTpl2.cc:26

Arcane::Accelerator::Impl::AcceleratorSpecificMemoryCopy
Definition AcceleratorMemoryCopier.h:41

Arcane::Accelerator::Impl::AcceleratorSpecificMemoryCopy::_fill
void _fill(const RunQueue *queue, SmallSpan< const Int32 > indexes, Span< const DataType > source, Span< DataType > destination)
Fills the values at indices specified by indexes.
Definition AcceleratorMemoryCopier.h:144

Arcane::Accelerator::RunQueue::ScopedAsync
Definition arccore/src/common/arccore/common/accelerator/RunQueue.h:71

Arcane::Accelerator::RunQueue
Execution queue for an accelerator.
Definition arccore/src/common/arccore/common/accelerator/RunQueue.h:53

Arcane::FixedArray
Fixed-size 1D array.
Definition arccore/src/base/arccore/base/FixedArray.h:46

Arcane::Impl::ISpecificMemoryCopy
Interface of a specialized memory copier for a given data size.
Definition SpecificMemoryCopyList.h:111

Arcane::Impl::SpecificMemoryCopyBase
Definition SpecificMemoryCopyList.h:280

Arcane::Impl::SpecificMemoryCopyList
List of specialized ISpecificMemoryCopy instances.
Definition SpecificMemoryCopyList.h:159

Arcane::Impl::SpecificMemoryCopyRef
Reference to a copier.
Definition SpecificMemoryCopyList.h:314

Arcane::NotSupportedException
Exception when an operation is not supported.
Definition arccore/src/base/arccore/base/NotSupportedException.h:34

Arcane::SmallSpan
View of an array of elements of type T.
Definition Span.h:805

Arcane::SpanImpl::data
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
Definition Span.h:539

Arcane::SpanImpl::size
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:327

Arcane::Span
View of an array of elements of type T.
Definition Span.h:635

Arcane::Accelerator::makeCommand
RunCommand makeCommand(const RunQueue &run_queue)
Creates a command associated with the queue run_queue.
Definition arccore/src/common/arccore/common/accelerator/RunQueue.h:289

Arcane::Accelerator::eExecutionPolicy::Sequential
@ Sequential
Sequential execution policy.
Definition CommonAcceleratorGlobal.h:94

Arcane::Int64
std::int64_t Int64
Signed integer type of 64 bits.
Definition ArccoreGlobal.h:235

Arcane::arccoreIsCheck
bool arccoreIsCheck()
True if in check mode.
Definition ArccoreGlobal.cc:80

Arcane::Int32
std::int32_t Int32
Signed integer type of 32 bits.
Definition ArccoreGlobal.h:233

Arccore::asSpan
Span< DataType > asSpan(Span< std::byte, Extent > bytes)
Converts a Span<std::byte> into a Span<DataType>.
Definition Span.h:1126