db/ddc/RunCommandLaunchImpl_8h_source.html

// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-

//-----------------------------------------------------------------------------

// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)

// See the top-level COPYRIGHT file for details.

// SPDX-License-Identifier: Apache-2.0

//-----------------------------------------------------------------------------

/*---------------------------------------------------------------------------*/

/* RunCommandLaunchImpl.h                                      (C) 2000-2026 */

/*                                                                           */

/* Implémentation d'une RunCommand pour le parallélisme hiérarchique.        */

/*---------------------------------------------------------------------------*/

#ifndef ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H

#define ARCCORE_ACCELERATOR_RUNCOMMANDLAUNCHIMPL_H

/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#include "AcceleratorGlobal.h"

#include "arccore/common/SequentialFor.h"

#include "arccore/common/StridedLoopRanges.h"

#include "arccore/common/accelerator/RunCommand.h"

#include "arccore/concurrency/ParallelFor.h"


#include "arccore/accelerator/WorkGroupLoopRange.h"

#include "arccore/accelerator/CooperativeWorkGroupLoopRange.h"

#include "arccore/accelerator/KernelLauncher.h"


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


namespace Arcane::Accelerator::Impl

{


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*!
 * \brief Host-side description of a loop decomposed into blocks.
 *
 * Exposes the total number of elements, the block size, the number of
 * blocks and the size of the last block, together with an optional
 * pointer to a grid synchronizer.
 */
template <typename IndexType_>
class HostLaunchLoopRangeBase
{
 public:

  using IndexType = IndexType_;

 public:

  //! Constructor (only declared here; carries ARCCORE_ACCELERATOR_EXPORT).
  ARCCORE_ACCELERATOR_EXPORT
  HostLaunchLoopRangeBase(IndexType total_size, Int32 nb_group, IndexType block_size);

 public:

  //! Number of elements to process.
  constexpr IndexType nbElement() const { return m_total_size; }

  //! Size of a block.
  constexpr IndexType blockSize() const { return m_block_size; }

  //! Number of blocks.
  constexpr Int32 nbBlock() const { return m_nb_block; }

  //! Number of elements of the last block.
  constexpr IndexType lastBlockSize() const { return m_last_block_size; }

  //! Number of active elements for the i-th block.
  constexpr IndexType nbActiveItem(Int32 i) const
  {
    // Only the block whose index is (nbBlock() - 1) uses the last block size.
    const bool is_last_block = ((i + 1) == m_nb_block);
    return is_last_block ? m_last_block_size : m_block_size;
  }

  //! Grid synchronizer (null unless one was set with setThreadGridSynchronizer()).
  ThreadGridSynchronizer* threadGridSynchronizer() const
  {
    return m_thread_grid_synchronizer;
  }

  //! Sets the grid synchronizer. The pointer is stored as is (no ownership taken).
  void setThreadGridSynchronizer(ThreadGridSynchronizer* v)
  {
    m_thread_grid_synchronizer = v;
  }

 private:

  ThreadGridSynchronizer* m_thread_grid_synchronizer = nullptr;
  IndexType m_total_size = 0;
  IndexType m_block_size = 0;
  IndexType m_last_block_size = 0;
  Int32 m_nb_block = 0;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*!
 * \brief Host loop range built from a work-group loop range.
 *
 * The base class is initialized with the number of elements, the number
 * of blocks and the block size reported by the given loop range.
 */
template <typename WorkGroupLoopRangeType_>
class HostLaunchLoopRange
: public HostLaunchLoopRangeBase<typename WorkGroupLoopRangeType_::IndexType>
{
 public:

  using WorkGroupLoopRangeType = WorkGroupLoopRangeType_;
  using IndexType = typename WorkGroupLoopRangeType::IndexType;
  using BaseClass = HostLaunchLoopRangeBase<IndexType>;

 public:

  //! Builds the host range from the values exposed by \a bounds.
  explicit HostLaunchLoopRange(const WorkGroupLoopRangeType& bounds)
  : BaseClass(bounds.nbElement(), bounds.nbBlock(), bounds.blockSize())
  {}
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*!
 * \brief Factory for the loop context objects handed to the user lambda
 * by the hierarchical kernels.
 *
 * Each build() overload returns the context type matching the kind of
 * loop range (plain or cooperative). The overloads only exist when the
 * corresponding backend (CUDA/HIP or SYCL) is being compiled; otherwise
 * this class is empty.
 */
class WorkGroupLoopContextBuilder
{
 public:

#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)

  //! CUDA/HIP: context for a WorkGroupLoopRange, built from its number of elements.
  template <typename IndexType_> static constexpr ARCCORE_HOST_DEVICE WorkGroupLoopContext<IndexType_>
  build(const WorkGroupLoopRange<IndexType_>& loop_range)
  {
    return WorkGroupLoopContext<IndexType_>(loop_range.nbElement());
  }

  //! CUDA/HIP: context for a CooperativeWorkGroupLoopRange, built from its number of elements.
  template <typename IndexType_> static constexpr ARCCORE_HOST_DEVICE CooperativeWorkGroupLoopContext<IndexType_>
  build(const CooperativeWorkGroupLoopRange<IndexType_>& loop_range)
  {
    return CooperativeWorkGroupLoopContext<IndexType_>(loop_range.nbElement());
  }

#endif

#if defined(ARCCORE_COMPILING_SYCL)

  //! SYCL: context for a WorkGroupLoopRange, built from the nd_item and the number of elements.
  template <typename IndexType_> static SyclWorkGroupLoopContext<IndexType_>
  build(const WorkGroupLoopRange<IndexType_>& loop_range, sycl::nd_item<1> id)
  {
    return SyclWorkGroupLoopContext<IndexType_>(id, loop_range.nbElement());
  }

  //! SYCL: context for a CooperativeWorkGroupLoopRange, built from the nd_item and the number of elements.
  template <typename IndexType_> static SyclCooperativeWorkGroupLoopContext<IndexType_>
  build(const CooperativeWorkGroupLoopRange<IndexType_>& loop_range, sycl::nd_item<1> id)
  {
    return SyclCooperativeWorkGroupLoopContext<IndexType_>(id, loop_range.nbElement());
  }
#endif
};


#if defined(ARCCORE_COMPILING_SYCL)

// Indicates that sycl::nd_item must always be used (and never sycl::id)
// as the kernel argument with 'WorkGroupLoopRange'.
template <typename IndexType_>
class IsAlwaysUseSyclNdItem<StridedLoopRanges<WorkGroupLoopRange<IndexType_>>>
: public std::true_type
{
};
// Indicates that sycl::nd_item must always be used (and never sycl::id)
// as the kernel argument with 'CooperativeWorkGroupLoopRange'.
template <typename IndexType_>
class IsAlwaysUseSyclNdItem<StridedLoopRanges<CooperativeWorkGroupLoopRange<IndexType_>>>
: public std::true_type
{
};

#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


class WorkGroupSequentialForHelper

{

 public:


  template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> static void


  apply(Int32 begin_index, Int32 nb_loop, HostLaunchLoopRange<LoopBoundType> bounds,

        const Lambda& func, RemainingArgs... remaining_args)

  {

    using LoopIndexType = LoopBoundType::LoopIndexType;

    ::Arcane::Impl::HostKernelRemainingArgsHelper::applyAtBegin(remaining_args...);

    const Int32 group_size = bounds.blockSize();

    Int32 loop_index = begin_index * group_size;

    for (Int32 i = begin_index; i < (begin_index + nb_loop); ++i) {

      // Pour la dernière itération de la boucle, le nombre d'éléments actifs peut-être

      // inférieur à la taille d'un groupe si \a total_nb_element n'est pas

      // un multiple de \a group_size.

      Int32 nb_active = bounds.nbActiveItem(i);

      LoopIndexType li(loop_index, i, group_size, nb_active, bounds.nbElement(), bounds.nbBlock(), bounds.threadGridSynchronizer());

      func(li, remaining_args...);

      loop_index += group_size;

    }


    ::Arcane::Impl::HostKernelRemainingArgsHelper::applyAtEnd(remaining_args...);

  }


};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#if defined(ARCCORE_COMPILING_CUDA_OR_HIP)

/*!
 * \brief CUDA/HIP kernel used for hierarchical launches.
 *
 * Each thread computes its global index, notifies the additional
 * arguments at the beginning and at the end of the kernel, and calls
 * \a func with a context built by WorkGroupLoopContextBuilder when its
 * index is lower than bounds.nbOriginalElement().
 *
 * NOTE(review): the previous comment mentioned finding
 * 'arcaneGetLoopIndexCudaHip' through argument dependent lookup, but this
 * kernel does not call that function.
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> __global__ static void
doHierarchicalLaunchCudaHip(LoopBoundType bounds, Lambda func, RemainingArgs... remaining_args)
{
  Int32 i = blockDim.x * blockIdx.x + threadIdx.x;

  CudaHipKernelRemainingArgsHelper::applyAtBegin(i, remaining_args...);
  // TODO: check whether this test is needed
  if (i < bounds.nbOriginalElement()) {
    func(WorkGroupLoopContextBuilder::build(bounds.originalLoop()), remaining_args...);
  }
  CudaHipKernelRemainingArgsHelper::applyAtEnd(i, remaining_args...);
}

#endif


#if defined(ARCCORE_COMPILING_SYCL)

/*!
 * \brief SYCL functor used for hierarchical launches.
 *
 * The call operator notifies the additional arguments at the beginning
 * and at the end of the kernel and, in between, calls \a func with a
 * context built by WorkGroupLoopContextBuilder when the global index of
 * the work-item is lower than bounds.nbOriginalElement().
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs>
class doHierarchicalLaunchSycl
{
 public:

  void operator()(sycl::nd_item<1> x, SmallSpan<std::byte> shared_memory,
                  LoopBoundType bounds, Lambda func,
                  RemainingArgs... remaining_args) const
  {
    const Int32 global_index = static_cast<Int32>(x.get_global_id(0));
    SyclKernelRemainingArgsHelper::applyAtBegin(x, shared_memory, remaining_args...);

    // TODO: check whether this test is needed
    const bool is_in_range = (global_index < bounds.nbOriginalElement());
    if (is_in_range)
      func(WorkGroupLoopContextBuilder::build(bounds.originalLoop(), x), remaining_args...);

    SyclKernelRemainingArgsHelper::applyAtEnd(x, shared_memory, remaining_args...);
  }
};

#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*!
 * \brief Launches the functor \a func over \a bounds using the execution
 * policy of \a command.
 *
 * Does nothing when \a bounds contains no element. Otherwise the launch is
 * dispatched on the execution policy: CUDA, HIP and SYCL go through the
 * matching ARCCORE_KERNEL_*_FUNC macro with the strided loop range, while
 * Sequential and Thread build a HostLaunchLoopRange and call
 * arccoreSequentialFor() / arccoreParallelFor(). Any other policy raises a
 * fatal error through ARCCORE_FATAL.
 *
 * \param command command supplying the execution policy and launch settings.
 * \param bounds loop range (taken by value because its block size may be set here).
 * \param func functor to execute.
 * \param other_args additional arguments forwarded to the kernel or host loop.
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
_doHierarchicalLaunch(RunCommand& command, LoopBoundType bounds,
                      const Lambda& func, const RemainingArgs&... other_args)
{
  Int64 nb_orig_element = bounds.nbElement();
  // Nothing to launch for an empty range.
  if (nb_orig_element == 0)
    return;
  const eExecutionPolicy exec_policy = command.executionPolicy();
  // In cooperative mode, setBlockSize() must always be called to make sure
  // the block size is consistent on the host
  // (in sequential mode only one block is needed in that case).
  if ((bounds.blockSize() == 0) || bounds.isCooperativeLaunch())
    bounds.setBlockSize(command);
  // The strided range is built after the block size has been settled above.
  using TrueLoopBoundType = StridedLoopRanges<LoopBoundType>;
  TrueLoopBoundType bounds2(bounds);
  // Accelerator only: register the block size on the command and take the
  // number of strides from the command.
  if (isAcceleratorPolicy(exec_policy)) {
    command.addNbThreadPerBlock(bounds.blockSize());
    bounds2.setNbStride(command.nbStride());
  }

  using HostLoopBoundType = HostLaunchLoopRange<LoopBoundType>;

  Impl::RunCommandLaunchInfo launch_info(command, bounds2.strideValue(), bounds.isCooperativeLaunch());
  launch_info.beginExecute();
  switch (exec_policy) {
  case eExecutionPolicy::CUDA:
    ARCCORE_KERNEL_CUDA_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
                             launch_info, func, bounds2, other_args...);
    break;
  case eExecutionPolicy::HIP:
    ARCCORE_KERNEL_HIP_FUNC((Impl::doHierarchicalLaunchCudaHip<TrueLoopBoundType, Lambda, RemainingArgs...>),
                            launch_info, func, bounds2, other_args...);
    break;
  case eExecutionPolicy::SYCL:
    ARCCORE_KERNEL_SYCL_FUNC((Impl::doHierarchicalLaunchSycl<TrueLoopBoundType, Lambda, RemainingArgs...>{}),
                             launch_info, func, bounds2, other_args...);
    break;
  case eExecutionPolicy::Sequential: {
    // Host sequential execution: all blocks are processed by the calling thread.
    HostLoopBoundType host_bounds(bounds);
    arccoreSequentialFor(host_bounds, func, other_args...);
  } break;
  case eExecutionPolicy::Thread: {
    // Host multi-thread execution, driven by the loop run info of the launch.
    HostLoopBoundType host_bounds(bounds);
    arccoreParallelFor(host_bounds, launch_info.loopRunInfo(), func, other_args...);
  } break;
  default:
    ARCCORE_FATAL("Invalid execution policy '{0}'", exec_policy);
  }
  launch_info.endExecute();
}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*!
 * \brief Bundles a RunCommand, a loop range and optional additional
 * arguments for a hierarchical launch.
 *
 * Instances are consumed by the operator<< taking an rvalue of this class
 * and a functor, which performs the launch.
 */
template <typename LoopBoundType, typename... RemainingArgs>
class ExtendedLaunchRunCommand
{
 public:

  //! Builds an instance without explicit additional arguments (the tuple is default-initialized).
  ExtendedLaunchRunCommand(RunCommand& command, const LoopBoundType& bounds)
  : m_command(command)
  , m_bounds(bounds)
  {
  }
  //! Builds an instance with the additional arguments copied from \a args.
  ExtendedLaunchRunCommand(RunCommand& command, const LoopBoundType& bounds, const std::tuple<RemainingArgs...>& args)
  : m_command(command)
  , m_bounds(bounds)
  , m_remaining_args(args)
  {
  }
  //! Command used for the launch (held by reference).
  RunCommand& m_command;
  //! Copy of the loop range.
  LoopBoundType m_bounds;
  //! Additional arguments forwarded to the launch.
  std::tuple<RemainingArgs...> m_remaining_args;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*!
 * \brief Class used to manage the launch of a hierarchical compute kernel.
 *
 * Holds a copy of the loop range and a tuple with a copy of each
 * additional argument.
 */
template <typename LoopBoundType, typename... RemainingArgs>
class ExtendedLaunchLoop
{
 public:

  //! Type of the tuple holding the additional arguments.
  using RemainingArgsTuple = std::tuple<RemainingArgs...>;

 public:

  //! Copies \a bounds and stores every additional argument in the tuple.
  ExtendedLaunchLoop(const LoopBoundType& bounds, RemainingArgs... args)
  : m_bounds(bounds)
  , m_remaining_args(args...)
  {}

 public:

  //! Loop range.
  LoopBoundType m_bounds;
  //! Additional arguments.
  RemainingArgsTuple m_remaining_args;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*!
 * \brief Creates an ExtendedLaunchLoop from a loop range and optional
 * additional arguments.
 *
 * \param bounds loop range, copied into the returned object.
 * \param args additional arguments, copied into the returned object.
 */
template <typename LoopBoundType, typename... RemainingArgs>
ExtendedLaunchLoop<LoopBoundType, RemainingArgs...>
makeLaunch(const LoopBoundType& bounds, RemainingArgs... args)
{
  using LaunchLoopType = ExtendedLaunchLoop<LoopBoundType, RemainingArgs...>;
  return LaunchLoopType(bounds, args...);
}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


/*!
 * \brief Executes the functor \a f with the command, loop range and
 * additional arguments held by \a nr.
 *
 * When additional arguments are present, the tuple is expanded so that each
 * element is passed as a separate argument to _doHierarchicalLaunch().
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
operator<<(ExtendedLaunchRunCommand<LoopBoundType, RemainingArgs...>&& nr, const Lambda& f)
{
  if constexpr (sizeof...(RemainingArgs) == 0) {
    // No additional argument: direct launch.
    _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f);
  }
  else {
    // The tuple elements are received by value, as separate arguments.
    auto launch_with_args = [&](auto... vs) {
      _doHierarchicalLaunch(nr.m_command, nr.m_bounds, f, vs...);
    };
    std::apply(launch_with_args, nr.m_remaining_args);
  }
}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*!
 * \brief Runs \a func sequentially over every block of \a bounds.
 *
 * Delegates to WorkGroupSequentialForHelper::apply() with the block range
 * [0, bounds.nbBlock()).
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
arccoreSequentialFor(HostLaunchLoopRange<LoopBoundType> bounds, const Lambda& func, const RemainingArgs&... remaining_args)
{
  const Int32 first_block = 0;
  const Int32 nb_block = bounds.nbBlock();
  WorkGroupSequentialForHelper::apply(first_block, nb_block, bounds, func, remaining_args...);
}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*!
 * \brief Runs \a func over the blocks of \a bounds through
 * ::Arcane::arccoreParallelFor().
 *
 * A ThreadGridSynchronizer local to this call is created from the maximum
 * number of threads of \a run_info and attached to \a bounds before the
 * block range is dispatched. Each sub-range of blocks is then processed by
 * WorkGroupSequentialForHelper::apply().
 *
 * \note value() is called unconditionally on run_info.options(), so the
 * options are expected to be set by the caller.
 */
template <typename LoopBoundType, typename Lambda, typename... RemainingArgs> void
arccoreParallelFor(HostLaunchLoopRange<LoopBoundType> bounds, ForLoopRunInfo run_info,
                   const Lambda& func, const RemainingArgs&... remaining_args)
{
  Int32 max_thread = run_info.options().value().maxThread();
  ThreadGridSynchronizer grid_sync(max_thread);

  // Must be done before the lambda below copies 'bounds'.
  bounds.setThreadGridSynchronizer(&grid_sync);

  // Processes 'nb_loop' consecutive blocks starting at 'begin_index'.
  auto block_range_func = [=](Int32 begin_index, Int32 nb_loop) {
    Impl::WorkGroupSequentialForHelper::apply(begin_index, nb_loop, bounds, func, remaining_args...);
  };

  ::Arcane::arccoreParallelFor(0, bounds.nbBlock(), run_info, block_range_func);
}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


} // namespace Arcane::Accelerator::Impl


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

ARCCORE_FATAL
#define ARCCORE_FATAL(...)
Macro envoyant une exception FatalErrorException.
Definition ArccoreGlobal.h:532

Arcane::Accelerator::CooperativeWorkGroupLoopContext
Contexte d'exécution d'une commande sur un ensemble de blocs.
Definition CooperativeWorkGroupLoopRange.h:122

Arcane::Accelerator::CooperativeWorkGroupLoopRange
Intervalle d'itération d'une boucle utilisant le parallélisme hiérarchique collaboratif.
Definition CooperativeWorkGroupLoopRange.h:253

Arcane::Accelerator::Impl::CudaHipKernelRemainingArgsHelper::applyAtEnd
static ARCCORE_DEVICE void applyAtEnd(Int32 index, RemainingArgs &... remaining_args)
Applique les fonctors des arguments additionnels en fin de kernel.
Definition arccore/src/accelerator/arccore/accelerator/KernelLauncher.h:75

Arcane::Accelerator::Impl::CudaHipKernelRemainingArgsHelper::applyAtBegin
static ARCCORE_DEVICE void applyAtBegin(Int32 index, RemainingArgs &... remaining_args)
Applique les fonctors des arguments additionnels en début de kernel.
Definition arccore/src/accelerator/arccore/accelerator/KernelLauncher.h:68

Arcane::Accelerator::Impl::ExtendedLaunchLoop
Classe pour gérer le lancement d'un noyau de calcul hiérarchique.
Definition RunCommandLaunchImpl.h:342

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase< typename WorkGroupLoopRangeType_::IndexType >::m_thread_grid_synchronizer
ThreadGridSynchronizer * m_thread_grid_synchronizer
Definition RunCommandLaunchImpl.h:79

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase::nbActiveItem
constexpr IndexType nbActiveItem(Int32 i) const
Nombre d'éléments actifs pour le i-ème bloc.
Definition RunCommandLaunchImpl.h:62

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase::threadGridSynchronizer
ThreadGridSynchronizer * threadGridSynchronizer() const
Synchronizer de la grille (non nul uniquement en multi-thread coopératif)
Definition RunCommandLaunchImpl.h:67

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase::nbElement
constexpr IndexType nbElement() const
Nombre d'éléments à traiter.
Definition RunCommandLaunchImpl.h:54

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase::lastBlockSize
constexpr IndexType lastBlockSize() const
Nombre d'éléments du dernier bloc.
Definition RunCommandLaunchImpl.h:60

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase::blockSize
constexpr IndexType blockSize() const
Taille d'un bloc.
Definition RunCommandLaunchImpl.h:56

Arcane::Accelerator::Impl::HostLaunchLoopRangeBase::nbBlock
constexpr Int32 nbBlock() const
Nombre de blocs.
Definition RunCommandLaunchImpl.h:58

Arcane::Accelerator::Impl::HostLaunchLoopRange
Definition RunCommandLaunchImpl.h:92

Arcane::Accelerator::Impl::IsAlwaysUseSyclNdItem
Template pour savoir si un type utilisé comme boucle dans les kernels nécessite toujours sycl::nb_ite...
Definition arccore/src/accelerator/arccore/accelerator/AcceleratorGlobal.h:50

Arcane::Accelerator::Impl::StridedLoopRanges
Classe pour gérer la décomposition d'une boucle en plusieurs parties.
Definition StridedLoopRanges.h:84

Arcane::Accelerator::Impl::ThreadGridSynchronizer
Classe pour gérer la synchronisation de grille en multi-thread.
Definition arccore/src/accelerator/arccore/accelerator/WorkGroupLoopRange.h:43

Arcane::Accelerator::Impl::WorkGroupLoopContextBuilder
Definition RunCommandLaunchImpl.h:111

Arcane::Accelerator::Impl::WorkGroupSequentialForHelper
Classe pour exécuter en séquentiel sur l'hôte une partie de la boucle.
Definition RunCommandLaunchImpl.h:173

Arcane::Accelerator::Impl::WorkGroupSequentialForHelper::apply
static void apply(Int32 begin_index, Int32 nb_loop, HostLaunchLoopRange< LoopBoundType > bounds, const Lambda &func, RemainingArgs... remaining_args)
Applique le fonctor func sur une boucle séquentielle.
Definition RunCommandLaunchImpl.h:178

Arcane::Accelerator::RunCommand
Gestion d'une commande sur accélérateur.
Definition arccore/src/common/arccore/common/accelerator/RunCommand.h:46

Arcane::Accelerator::SyclCooperativeWorkGroupLoopContext
Definition arccore/src/accelerator/arccore/accelerator/WorkGroupLoopRange.h:101

Arcane::Accelerator::SyclWorkGroupLoopContext
Definition arccore/src/accelerator/arccore/accelerator/WorkGroupLoopRange.h:99

Arcane::Accelerator::WorkGroupLoopContext
Contexte d'exécution d'une commande sur un ensemble de blocs.
Definition arccore/src/accelerator/arccore/accelerator/WorkGroupLoopRange.h:540

Arcane::Accelerator::WorkGroupLoopRangeBase::nbElement
constexpr IndexType nbElement() const
Nombre d'éléments à traiter.
Definition arccore/src/accelerator/arccore/accelerator/WorkGroupLoopRange.h:787

Arcane::Accelerator::WorkGroupLoopRange
Intervalle d'itération d'une boucle utilisant le parallélisme hiérarchique.
Definition arccore/src/accelerator/arccore/accelerator/WorkGroupLoopRange.h:828

Arcane::Impl::HostKernelRemainingArgsHelper::applyAtEnd
static void applyAtEnd(RemainingArgs &... remaining_args)
Applique les functors des arguments additionnels à la fin de l'itération.
Definition HostKernelRemainingArgsHelper.h:45

Arcane::Impl::HostKernelRemainingArgsHelper::applyAtBegin
static void applyAtBegin(RemainingArgs &... remaining_args)
Applique les functors des arguments additionnels au début de l'itération.
Definition HostKernelRemainingArgsHelper.h:38

Arcane::Accelerator::eExecutionPolicy
eExecutionPolicy
Politique d'exécution pour un Runner.
Definition CommonAcceleratorGlobal.h:88

Arcane::Accelerator::eExecutionPolicy::SYCL
@ SYCL
Politique d'exécution utilisant l'environnement SYCL.
Definition CommonAcceleratorGlobal.h:100

Arcane::Accelerator::eExecutionPolicy::HIP
@ HIP
Politique d'exécution utilisant l'environnement HIP.
Definition CommonAcceleratorGlobal.h:98

Arcane::Accelerator::eExecutionPolicy::CUDA
@ CUDA
Politique d'exécution utilisant l'environnement CUDA.
Definition CommonAcceleratorGlobal.h:96

Arcane::Accelerator::eExecutionPolicy::Sequential
@ Sequential
Politique d'exécution séquentielle.
Definition CommonAcceleratorGlobal.h:92

Arcane::Accelerator::eExecutionPolicy::Thread
@ Thread
Politique d'exécution multi-thread.
Definition CommonAcceleratorGlobal.h:94

Arcane::Accelerator::isAcceleratorPolicy
bool isAcceleratorPolicy(eExecutionPolicy exec_policy)
Indique si exec_policy correspond à un accélérateur.
Definition CommonAcceleratorGlobal.h:191

Arcane::Int64
std::int64_t Int64
Type entier signé sur 64 bits.
Definition ArccoreGlobal.h:227

Arcane::arccoreParallelFor
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applique en concurrence la fonction lambda lambda_function sur l'intervalle d'itération donné par loo...
Definition ParallelFor.h:85

Arcane::Int32
std::int32_t Int32
Type entier signé sur 32 bits.
Definition ArccoreGlobal.h:225