// da/dca/CudaAcceleratorRuntime_8cc_source.html

// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-

//-----------------------------------------------------------------------------

// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)

// See the top-level COPYRIGHT file for details.

// SPDX-License-Identifier: Apache-2.0

//-----------------------------------------------------------------------------

/*---------------------------------------------------------------------------*/

/* CudaAcceleratorRuntime.cc                                   (C) 2000-2026 */

/*                                                                           */

/* Runtime for 'Cuda'.                                                       */

/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#include "arccore/accelerator_native/CudaAccelerator.h"


#include "arccore/base/CheckedConvert.h"

#include "arccore/base/FatalErrorException.h"


#include "arccore/common/internal/MemoryUtilsInternal.h"

#include "arccore/common/internal/IMemoryResourceMngInternal.h"


#include "arccore/common/accelerator/RunQueueBuildInfo.h"

#include "arccore/common/accelerator/Memory.h"

#include "arccore/common/accelerator/DeviceInfoList.h"

#include "arccore/common/accelerator/KernelLaunchArgs.h"

#include "arccore/common/accelerator/RunQueue.h"

#include "arccore/common/accelerator/DeviceMemoryInfo.h"

#include "arccore/common/accelerator/NativeStream.h"

#include "arccore/common/accelerator/internal/IRunnerRuntime.h"

#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"

#include "arccore/common/accelerator/internal/RunCommandImpl.h"

#include "arccore/common/accelerator/internal/IRunQueueStream.h"

#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"

#include "arccore/common/accelerator/internal/AcceleratorMemoryAllocatorBase.h"


#include "arccore/accelerator_native/runtime/Cupti.h"


#include <sstream>

#include <unordered_map>

#include <mutex>

#include <algorithm>

#include <iostream>


#include <cuda.h>


// For std::memset

#include <cstring>


#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT

#include <nvtx3/nvToolsExt.h>

#endif


namespace Arcane::Accelerator::Cuda

{

using Impl::KernelLaunchArgs;


// File-local state used by the CUPTI profiling hooks of this runtime.
namespace

{

  // When strictly positive, CudaRunQueueStream::barrier() calls
  // global_cupti_info.flush() after synchronizing its stream.
  // NOTE(review): no assignment is visible in this part of the file --
  // confirm where (or whether) this value is changed.
  Int32 global_cupti_flush = 0;

  // CUPTI wrapper driven by CudaRunnerRuntime::startProfiling(),
  // stopProfiling() and isProfilingActive().
  CuptiInfo global_cupti_info;

} // namespace


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Starting from CUDA 13, there is a new cudaMemLocation type

// for methods such as cudaMemAdvise or cudaMemPrefetchAsync

#if defined(ARCCORE_USING_CUDA13_OR_GREATER)
// Builds the cudaMemLocation matching 'device_id'.
//
// The 'id' field always receives 'device_id'. The 'type' field is
// cudaMemLocationTypeHost when 'device_id' is cudaCpuDeviceId and
// cudaMemLocationTypeDevice otherwise.
inline cudaMemLocation
_getMemoryLocation(int device_id)
{
  const bool is_host = (device_id == cudaCpuDeviceId);
  cudaMemLocation location;
  location.id = device_id;
  location.type = is_host ? cudaMemLocationTypeHost : cudaMemLocationTypeDevice;
  return location;
}
#else
// Without ARCCORE_USING_CUDA13_OR_GREATER the location is the plain device
// index, so the value is returned unchanged.
inline int
_getMemoryLocation(int device_id)
{
  const int location = device_id;
  return location;
}
#endif


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Abstract interface for the low-level CUDA allocation primitives.
//
// Each implementation provides one way of obtaining and releasing memory and
// reports the outcome as a cudaError_t.
class ConcreteAllocator
{
 public:

  virtual ~ConcreteAllocator() = default;

  // Allocates 'new_size' bytes and stores the resulting address in '*ptr'.
  virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;

  // Releases memory previously obtained through _allocate().
  virtual cudaError_t _deallocate(void* ptr) = 0;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Adapter exposing a concrete CUDA allocator through the
// AcceleratorMemoryAllocatorBase::IUnderlyingAllocator interface.
//
// ConcreteAllocatorType must provide _allocate(), _deallocate() and a const
// memoryResource() method, since those are the members called below.
template <typename ConcreteAllocatorType>
class UnderlyingAllocator
: public AcceleratorMemoryAllocatorBase::IUnderlyingAllocator
{
 public:

  UnderlyingAllocator() = default;

  // Allocates 'size' bytes with the concrete allocator; the returned status
  // is passed to ARCCORE_CHECK_CUDA.
  void* allocateMemory(Int64 size) final
  {
    void* allocated_ptr = nullptr;
    cudaError_t status = m_concrete_allocator._allocate(&allocated_ptr, size);
    ARCCORE_CHECK_CUDA(status);
    return allocated_ptr;
  }

  // Releases 'ptr'; the status goes through the non-throwing check macro.
  // The size argument is not needed by the CUDA release functions.
  void freeMemory(void* ptr, [[maybe_unused]] Int64 size) final
  {
    cudaError_t status = m_concrete_allocator._deallocate(ptr);
    ARCCORE_CHECK_CUDA_NOTHROW(status);
  }

  // Copies 'size' bytes with cudaMemcpy(), letting CUDA infer the direction
  // (cudaMemcpyDefault).
  void doMemoryCopy(void* destination, const void* source, Int64 size) final
  {
    cudaError_t status = cudaMemcpy(destination, source, size, cudaMemcpyDefault);
    ARCCORE_CHECK_CUDA(status);
  }

  // Memory resource reported by the concrete allocator.
  eMemoryResource memoryResource() const final
  {
    return m_concrete_allocator.memoryResource();
  }

 public:

  ConcreteAllocatorType m_concrete_allocator;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Concrete allocator for unified (managed) memory.
//
// Two environment variables are read at construction:
// - ARCANE_CUDA_USE_ALLOC_ATS: when non-zero, memory comes from
//   ::aligned_alloc()/::free() instead of cudaMallocManaged()/cudaFree().
// - ARCANE_CUDA_MEMORY_HINT_ON_DEVICE: when non-zero, each managed allocation
//   is advised to prefer the current device while staying accessible from
//   the CPU.
class UnifiedMemoryConcreteAllocator
: public ConcreteAllocator
{
 public:

  UnifiedMemoryConcreteAllocator()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true))
      m_use_ats = (v.value() != 0);
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MEMORY_HINT_ON_DEVICE", true))
      m_use_hint_as_mainly_device = (v.value() != 0);
  }

  // Releases 'ptr' with the function matching the one used by _allocate().
  cudaError_t _deallocate(void* ptr) final
  {
    if (m_use_ats) {
      ::free(ptr);
      return cudaSuccess;
    }
    //std::cout << "CUDA_MANAGED_FREE ptr=" << ptr << "\n";
    return ::cudaFree(ptr);
  }

  // Allocates 'new_size' bytes and stores the address in '*ptr'.
  //
  // Returns cudaSuccess on success, cudaErrorMemoryAllocation when the host
  // allocation of the ATS path fails, or the status of cudaMallocManaged().
  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    if (m_use_ats) {
      *ptr = ::aligned_alloc(128, new_size);
      // Report a failed host allocation instead of returning success with a
      // null pointer (same policy as DeviceConcreteAllocator). A null result
      // for a zero-sized request is not treated as an error.
      if (!(*ptr) && new_size != 0)
        return cudaErrorMemoryAllocation;
      return cudaSuccess;
    }

    auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
    //std::cout << "CUDA_MANAGED_MALLOC ptr=" << (*ptr) << " size=" << new_size << "\n";
    //if (new_size < 4000)
    //std::cout << "STACK=" << platform::getStackTrace() << "\n";

    if (r != cudaSuccess)
      return r;

    // If requested, indicates that we prefer to allocate on the GPU.
    // NOTE: In this case, we retrieve the current device to position the
    // preferred location. If we use MemoryPool, this allocation will only
    // be performed once. If the default device for a thread changes during
    // computation, there will be an inconsistency. To avoid this, we could
    // call cudaMemAdvise() for each allocation (via _applyHint()) but these
    // operations are quite costly and if there are many allocations, a
    // performance loss may result.
    if (m_use_hint_as_mainly_device) {
      int device_id = 0;
      void* p = *ptr;
      // The device index feeds the advice below, so a failure to query it
      // must not be silently replaced by device 0.
      ARCCORE_CHECK_CUDA(cudaGetDevice(&device_id));
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, _getMemoryLocation(device_id)));
      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, _getMemoryLocation(cudaCpuDeviceId)));
    }

    return cudaSuccess;
  }

  constexpr eMemoryResource memoryResource() const { return eMemoryResource::UnifiedMemory; }

 public:

  // True when host allocation (aligned_alloc/free) replaces managed memory.
  bool m_use_ats = false;
  // True when managed allocations are advised to reside on the current device.
  bool m_use_hint_as_mainly_device = false;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


class UnifiedMemoryCudaMemoryAllocator

: public AcceleratorMemoryAllocatorBase

{

 public:

 public:


  UnifiedMemoryCudaMemoryAllocator()

  : AcceleratorMemoryAllocatorBase("UnifiedMemoryCudaMemory", new UnderlyingAllocator<UnifiedMemoryConcreteAllocator>())

  {

    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_MALLOC_TRACE", true))

      _setTraceLevel(v.value());

  }


  void initialize()

  {

    _doInitializeUVM(true);

  }


 public:


  void notifyMemoryArgsChanged([[maybe_unused]] MemoryAllocationArgs old_args,

                               MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final

  {

    void* p = ptr.baseAddress();

    Int64 s = ptr.capacity();

    if (p && s > 0)

      _applyHint(ptr.baseAddress(), ptr.size(), new_args);

  }


 protected:


  void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args)

  {

    eMemoryLocationHint hint = args.memoryLocationHint();

    // Uses the active device to position the GPU by default

    // We only do this if the hint requires it to avoid calling

    // cudaGetDevice() every time.

    int device_id = 0;

    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {

      cudaGetDevice(&device_id);

    }

    auto device_memory_location = _getMemoryLocation(device_id);

    auto cpu_memory_location = _getMemoryLocation(cudaCpuDeviceId);


    //std::cout << "SET_MEMORY_HINT name=" << args.arrayName() << " size=" << new_size << " hint=" << (int)hint << "\n";

    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {

      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_memory_location));

      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cpu_memory_location));

    }

    if (hint == eMemoryLocationHint::MainlyHost) {

      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cpu_memory_location));

      //ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, 0));

    }

    if (hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {

      ARCCORE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_memory_location));

    }

  }

  void _removeHint(void* p, size_t size, MemoryAllocationArgs args)

  {

    eMemoryLocationHint hint = args.memoryLocationHint();

    if (hint == eMemoryLocationHint::None)

      return;

    int device_id = 0;

    ARCCORE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, _getMemoryLocation(device_id)));

  }


 private:


  bool m_use_ats = false;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Concrete allocator for host memory obtained with cudaMallocHost() and
// released with cudaFreeHost().
class HostPinnedConcreteAllocator
: public ConcreteAllocator
{
 public:

  // Allocates 'new_size' bytes and stores the address in '*ptr'.
  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    cudaError_t status = ::cudaMallocHost(ptr, new_size);
    return status;
  }

  // Releases memory obtained through _allocate().
  cudaError_t _deallocate(void* ptr) final
  {
    cudaError_t status = ::cudaFreeHost(ptr);
    return status;
  }

  constexpr eMemoryResource memoryResource() const
  {
    return eMemoryResource::HostPinned;
  }
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Allocator for host-pinned memory, built on HostPinnedConcreteAllocator.
class HostPinnedCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  HostPinnedCudaMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("HostPinnedCudaMemory", new UnderlyingAllocator<HostPinnedConcreteAllocator>())
  {}

  // Delegates to the base-class host-pinned initialization.
  void initialize()
  {
    _doInitializeHostPinned(true);
  }
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Concrete allocator for device memory.
//
// Uses cudaMalloc()/cudaFree(), unless the ARCANE_CUDA_USE_ALLOC_ATS
// environment variable is non-zero, in which case memory comes from
// std::aligned_alloc()/std::free().
class DeviceConcreteAllocator
: public ConcreteAllocator
{
 public:

  DeviceConcreteAllocator()
  {
    auto env_value = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUDA_USE_ALLOC_ATS", true);
    if (env_value)
      m_use_ats = (env_value.value() != 0);
  }

  // Allocates 'new_size' bytes and stores the address in '*ptr'.
  cudaError_t _allocate(void** ptr, size_t new_size) final
  {
    if (!m_use_ats) {
      cudaError_t status = ::cudaMalloc(ptr, new_size);
      //std::cout << "ALLOCATE_DEVICE ptr=" << (*ptr) << " size=" << new_size << " r=" << (int)status << "\n";
      return status;
    }
    // FIXME: it does not work on WIN32
    void* host_ptr = std::aligned_alloc(128, new_size);
    *ptr = host_ptr;
    return (host_ptr != nullptr) ? cudaSuccess : cudaErrorMemoryAllocation;
  }

  // Releases memory with the function matching the one used by _allocate().
  cudaError_t _deallocate(void* ptr) final
  {
    if (!m_use_ats) {
      //std::cout << "FREE_DEVICE ptr=" << ptr << "\n";
      return ::cudaFree(ptr);
    }
    std::free(ptr);
    return cudaSuccess;
  }

  constexpr eMemoryResource memoryResource() const
  {
    return eMemoryResource::Device;
  }

 private:

  // True when host allocation replaces cudaMalloc()/cudaFree().
  bool m_use_ats = false;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Allocator for device memory, built on DeviceConcreteAllocator.
class DeviceCudaMemoryAllocator
: public AcceleratorMemoryAllocatorBase
{
 public:

  DeviceCudaMemoryAllocator()
  : AcceleratorMemoryAllocatorBase("DeviceCudaMemoryAllocator", new UnderlyingAllocator<DeviceConcreteAllocator>())
  {}

  // Delegates to the base-class device initialization.
  void initialize()
  {
    _doInitializeDevice(true);
  }
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// File-local allocator instances, one per kind of memory. They are
// initialized by initializeCudaMemoryAllocators() and finalized by
// finalizeCudaMemoryAllocators().
namespace

{

  // Unified memory allocator.
  UnifiedMemoryCudaMemoryAllocator unified_memory_cuda_memory_allocator;

  // Host-pinned memory allocator.
  HostPinnedCudaMemoryAllocator host_pinned_cuda_memory_allocator;

  // Device memory allocator.
  DeviceCudaMemoryAllocator device_cuda_memory_allocator;

} // namespace


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Calls initialize() on the three file-local allocators, in the order
// unified memory, device, host-pinned.
void initializeCudaMemoryAllocators()

{

  unified_memory_cuda_memory_allocator.initialize();

  device_cuda_memory_allocator.initialize();

  host_pinned_cuda_memory_allocator.initialize();

}


// Calls finalize(tm) on the three file-local allocators, in the same order
// as initializeCudaMemoryAllocators(). 'tm' is forwarded unchanged.
void finalizeCudaMemoryAllocators(ITraceMng* tm)

{

  unified_memory_cuda_memory_allocator.finalize(tm);

  device_cuda_memory_allocator.finalize(tm);

  host_pinned_cuda_memory_allocator.finalize(tm);

}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


void arcaneCheckCudaErrors(const TraceInfo& ti, CUresult e)

{

  if (e == CUDA_SUCCESS)

    return;

  const char* error_name = nullptr;

  CUresult e2 = cuGetErrorName(e, &error_name);

  if (e2 != CUDA_SUCCESS)

    error_name = "Unknown";


  const char* error_message = nullptr;

  CUresult e3 = cuGetErrorString(e, &error_message);

  if (e3 != CUDA_SUCCESS)

    error_message = "Unknown";


  ARCCORE_FATAL("CUDA Error trace={0} e={1} name={2} message={3}",

                ti, e, error_name, error_message);

}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Cache of the block size suggested by the CUDA occupancy API, per kernel.
//
// Access to the cache is serialized with a mutex.
class OccupancyMap
{
 public:

  // Returns the block size computed by cudaOccupancyMaxPotentialBlockSize()
  // for 'kernel_ptr', or 0 when that query fails.
  //
  // The value is computed on the first request for a given kernel and then
  // served from the cache. The first request also prints one diagnostic line
  // on std::cout.
  Int32 getNbThreadPerBlock(const void* kernel_ptr)
  {
    std::scoped_lock lock(m_mutex);
    auto x = m_nb_thread_per_block_map.find(kernel_ptr);
    if (x != m_nb_thread_per_block_map.end())
      return x->second;

    bool has_failed_query = false;

    int min_grid_size = 0;
    int computed_block_size = 0;
    int wanted_shared_memory = 0;
    cudaError_t r = cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &computed_block_size, kernel_ptr, wanted_shared_memory);
    if (r != cudaSuccess) {
      computed_block_size = 0;
      has_failed_query = true;
    }

    // The values below are only used for the diagnostic line, so a failure
    // just leaves the corresponding value at zero.
    int num_block_0 = 0;
    if (cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_0, kernel_ptr, 256, wanted_shared_memory) != cudaSuccess)
      has_failed_query = true;
    int num_block_1 = 0;
    if (cudaOccupancyMaxActiveBlocksPerMultiprocessor(&num_block_1, kernel_ptr, 1024, wanted_shared_memory) != cudaSuccess)
      has_failed_query = true;

    // Value-initialized so that 'numRegs' is well defined even if the query fails.
    cudaFuncAttributes func_attr{};
    if (cudaFuncGetAttributes(&func_attr, kernel_ptr) != cudaSuccess)
      has_failed_query = true;

    m_nb_thread_per_block_map[kernel_ptr] = computed_block_size;
    std::cout << "ComputedBlockSize=" << computed_block_size << " n0=" << num_block_0 << " n1=" << num_block_1
              << " min_grid_size=" << min_grid_size << " nb_reg=" << func_attr.numRegs;

#if CUDART_VERSION >= 12040
    // cudaFuncGetName is only available in 12.4
    const char* func_name = nullptr;
    if (cudaFuncGetName(&func_name, kernel_ptr) != cudaSuccess)
      has_failed_query = true;
    // Streaming a null 'const char*' is undefined behavior.
    else if (func_name)
      std::cout << " name=" << func_name;
#endif
    // Always terminate the line, whatever the CUDA version.
    std::cout << "\n";

    // These queries are best-effort. Clear the error they may have recorded so
    // that a later cudaGetLastError() (used to validate kernel launches) does
    // not report it as a launch failure.
    if (has_failed_query)
      (void)cudaGetLastError();

    return computed_block_size;
  }

 private:

  // Block size per kernel address (0 when the occupancy query failed).
  std::unordered_map<const void*, Int32> m_nb_thread_per_block_map;
  // Protects m_nb_thread_per_block_map.
  std::mutex m_mutex;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


class CudaRunQueueStream

: public Impl::IRunQueueStream

{

 public:


  CudaRunQueueStream(Impl::IRunnerRuntime* runtime, const RunQueueBuildInfo& bi)

  : m_runtime(runtime)

  {

    if (bi.isDefault())

      ARCCORE_CHECK_CUDA(cudaStreamCreate(&m_cuda_stream));

    else {

      int priority = bi.priority();

      ARCCORE_CHECK_CUDA(cudaStreamCreateWithPriority(&m_cuda_stream, cudaStreamDefault, priority));

    }

  }

  ~CudaRunQueueStream() override

  {

    ARCCORE_CHECK_CUDA_NOTHROW(cudaStreamDestroy(m_cuda_stream));

  }


 public:


  void notifyBeginLaunchKernel([[maybe_unused]] Impl::RunCommandImpl& c) override

  {

#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT

    auto kname = c.kernelName();

    if (kname.empty())

      nvtxRangePush(c.traceInfo().name());

    else

      nvtxRangePush(kname.localstr());

#endif

    return m_runtime->notifyBeginLaunchKernel();

  }


  void notifyEndLaunchKernel(Impl::RunCommandImpl&) override

  {

#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT

    nvtxRangePop();

#endif

    return m_runtime->notifyEndLaunchKernel();

  }


  void barrier() override

  {

    ARCCORE_CHECK_CUDA(cudaStreamSynchronize(m_cuda_stream));

    if (global_cupti_flush > 0)

      global_cupti_info.flush();

  }


  bool _barrierNoException() override

  {

    return (cudaStreamSynchronize(m_cuda_stream) != cudaSuccess);

  }


  void copyMemory(const MemoryCopyArgs& args) override

  {

    auto source_bytes = args.source().bytes();

    auto r = cudaMemcpyAsync(args.destination().data(), source_bytes.data(),

                             source_bytes.size(), cudaMemcpyDefault, m_cuda_stream);

    ARCCORE_CHECK_CUDA(r);

    if (!args.isAsync())

      barrier();

  }


  void prefetchMemory(const MemoryPrefetchArgs& args) override

  {

    auto src = args.source().bytes();

    if (src.size() == 0)

      return;

    DeviceId d = args.deviceId();

    int device = cudaCpuDeviceId;

    if (!d.isHost())

      device = d.asInt32();

    //std::cout << "PREFETCH device=" << device << " host(id)=" << cudaCpuDeviceId

    //          << " size=" << args.source().size() << " data=" << src.data() << "\n";

    auto mem_location = _getMemoryLocation(device);

#if defined(ARCCORE_USING_CUDA13_OR_GREATER)

    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, 0, m_cuda_stream);

#else

    auto r = cudaMemPrefetchAsync(src.data(), src.size(), mem_location, m_cuda_stream);

#endif

    ARCCORE_CHECK_CUDA(r);

    if (!args.isAsync())

      barrier();

  }


  Impl::NativeStream nativeStream() override

  {

    return Impl::NativeStream(&m_cuda_stream);

  }


 public:


  cudaStream_t trueStream() const

  {

    return m_cuda_stream;

  }


 private:


  Impl::IRunnerRuntime* m_runtime = nullptr;

  cudaStream_t m_cuda_stream = nullptr;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


class CudaRunQueueEvent

: public Impl::IRunQueueEventImpl

{

 public:


  explicit CudaRunQueueEvent(bool has_timer)

  {

    if (has_timer)

      ARCCORE_CHECK_CUDA(cudaEventCreate(&m_cuda_event));

    else

      ARCCORE_CHECK_CUDA(cudaEventCreateWithFlags(&m_cuda_event, cudaEventDisableTiming));

  }

  ~CudaRunQueueEvent() override

  {

    ARCCORE_CHECK_CUDA_NOTHROW(cudaEventDestroy(m_cuda_event));

  }


 public:


  // Register the event within a RunQueue

  void recordQueue(Impl::IRunQueueStream* stream) final

  {

    auto* rq = static_cast<CudaRunQueueStream*>(stream);

    ARCCORE_CHECK_CUDA(cudaEventRecord(m_cuda_event, rq->trueStream()));

  }


  void wait() final

  {

    ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));

  }


  void waitForEvent(Impl::IRunQueueStream* stream) final

  {

    auto* rq = static_cast<CudaRunQueueStream*>(stream);

    ARCCORE_CHECK_CUDA(cudaStreamWaitEvent(rq->trueStream(), m_cuda_event, cudaEventWaitDefault));

  }


  Int64 elapsedTime(IRunQueueEventImpl* start_event) final

  {

    // NOTE: Events must have been created with the timer active

    ARCCORE_CHECK_POINTER(start_event);

    auto* true_start_event = static_cast<CudaRunQueueEvent*>(start_event);

    float time_in_ms = 0.0;


    // TODO: check if necessary

    // ARCCORE_CHECK_CUDA(cudaEventSynchronize(m_cuda_event));


    ARCCORE_CHECK_CUDA(cudaEventElapsedTime(&time_in_ms, true_start_event->m_cuda_event, m_cuda_event));

    double x = time_in_ms * 1.0e6;

    Int64 nano_time = static_cast<Int64>(x);

    return nano_time;

  }


  bool hasPendingWork() final

  {

    cudaError_t v = cudaEventQuery(m_cuda_event);

    if (v == cudaErrorNotReady)

      return true;

    ARCCORE_CHECK_CUDA(v);

    return false;

  }


 private:


  cudaEvent_t m_cuda_event;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// Implementation of Impl::IRunnerRuntime for CUDA.
//
// Creates streams and events, forwards memory advice to cudaMemAdvise(),
// drives CUPTI profiling and NVTX ranges, and adjusts kernel launch
// arguments (cooperative launch limit, optional computed occupancy).
class CudaRunnerRuntime
: public Impl::IRunnerRuntime
{
 public:

  ~CudaRunnerRuntime() override = default;

 public:

  // Counts the launch and prints a trace when verbose mode is on.
  void notifyBeginLaunchKernel() override
  {
    ++m_nb_kernel_launched;
    if (m_is_verbose)
      std::cout << "BEGIN CUDA KERNEL!\n";
  }

  // Checks the last CUDA error and prints a trace when verbose mode is on.
  void notifyEndLaunchKernel() override
  {
    ARCCORE_CHECK_CUDA(cudaGetLastError());
    if (m_is_verbose)
      std::cout << "END CUDA KERNEL!\n";
  }

  // Waits for the whole device.
  void barrier() override
  {
    ARCCORE_CHECK_CUDA(cudaDeviceSynchronize());
  }

  eExecutionPolicy executionPolicy() const override
  {
    return eExecutionPolicy::CUDA;
  }

  // Returns a newly allocated stream bound to this runtime.
  Impl::IRunQueueStream* createStream(const RunQueueBuildInfo& bi) override
  {
    return new CudaRunQueueStream(this, bi);
  }

  // Returns a newly allocated event without timing support.
  Impl::IRunQueueEventImpl* createEventImpl() override
  {
    return new CudaRunQueueEvent(false);
  }

  // Returns a newly allocated event with timing support.
  Impl::IRunQueueEventImpl* createEventImplWithTimer() override
  {
    return new CudaRunQueueEvent(true);
  }

  // Applies 'advice' to 'buffer' through cudaMemAdvise().
  //
  // The host variants of the advice replace 'device_id' by cudaCpuDeviceId.
  // Advice values not listed below are ignored.
  void setMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseSetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseSetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseSetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseSetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseSetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    //std::cout << "MEMADVISE p=" << ptr << " size=" << count << " advise = " << cuda_advise << " id = " << device << "\n";
    ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  // Removes 'advice' from 'buffer' through cudaMemAdvise(), with the same
  // mapping rules as setMemoryAdvice().
  void unsetMemoryAdvice(ConstMemoryView buffer, eMemoryAdvice advice, DeviceId device_id) override
  {
    auto v = buffer.bytes();
    const void* ptr = v.data();
    size_t count = v.size();
    int device = device_id.asInt32();
    cudaMemoryAdvise cuda_advise;

    if (advice == eMemoryAdvice::MostlyRead)
      cuda_advise = cudaMemAdviseUnsetReadMostly;
    else if (advice == eMemoryAdvice::PreferredLocationDevice)
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
    else if (advice == eMemoryAdvice::AccessedByDevice)
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
    else if (advice == eMemoryAdvice::PreferredLocationHost) {
      cuda_advise = cudaMemAdviseUnsetPreferredLocation;
      device = cudaCpuDeviceId;
    }
    else if (advice == eMemoryAdvice::AccessedByHost) {
      cuda_advise = cudaMemAdviseUnsetAccessedBy;
      device = cudaCpuDeviceId;
    }
    else
      return;
    ARCCORE_CHECK_CUDA(cudaMemAdvise(ptr, count, cuda_advise, _getMemoryLocation(device)));
  }

  // Makes 'device_id' the current CUDA device.
  // Raises a fatal error when 'device_id' is not an accelerator.
  void setCurrentDevice(DeviceId device_id) final
  {
    Int32 id = device_id.asInt32();
    if (!device_id.isAccelerator())
      ARCCORE_FATAL("Device {0} is not an accelerator device", id);
    ARCCORE_CHECK_CUDA(cudaSetDevice(id));
  }

  const IDeviceInfoList* deviceInfoList() final { return &m_device_info_list; }

  void startProfiling() override
  {
    global_cupti_info.start();
  }

  void stopProfiling() override
  {
    global_cupti_info.stop();
  }

  bool isProfilingActive() override
  {
    return global_cupti_info.isActive();
  }

  // Fills 'attribute' from the result of cudaPointerGetAttributes() on 'ptr'.
  void getPointerAttribute(PointerAttribute& attribute, const void* ptr) override
  {
    cudaPointerAttributes ca;
    ARCCORE_CHECK_CUDA(cudaPointerGetAttributes(&ca, ptr));
    // NOTE: the Arcane type 'ePointerMemoryType' normally has the same values
    // as the corresponding CUDA type, so a simple cast can be done.
    auto mem_type = static_cast<ePointerMemoryType>(ca.type);
    _fillPointerAttribute(attribute, mem_type, ca.device,
                          ptr, ca.devicePointer, ca.hostPointer);
  }

  // Returns the free and total memory of 'device_id'.
  //
  // cudaMemGetInfo() reports on the current device, so the current device is
  // switched temporarily when it differs from the requested one.
  DeviceMemoryInfo getDeviceMemoryInfo(DeviceId device_id) override
  {
    int d = 0;
    int wanted_d = device_id.asInt32();
    ARCCORE_CHECK_CUDA(cudaGetDevice(&d));
    const bool need_switch = (d != wanted_d);
    if (need_switch) {
      ARCCORE_CHECK_CUDA(cudaSetDevice(wanted_d));
    }
    size_t free_mem = 0;
    size_t total_mem = 0;
    // The status is checked only after the previous device has been restored,
    // so that a failure does not leave the calling thread on 'wanted_d'.
    cudaError_t info_status = cudaMemGetInfo(&free_mem, &total_mem);
    if (need_switch) {
      ARCCORE_CHECK_CUDA(cudaSetDevice(d));
    }
    ARCCORE_CHECK_CUDA(info_status);
    DeviceMemoryInfo dmi;
    dmi.setFreeMemory(free_mem);
    dmi.setTotalMemory(total_mem);
    return dmi;
  }

  // Opens an NVTX range named 'name'. A non-negative 'color_rgb' is used as
  // the range color. Does nothing when NVTX support is not compiled in.
  void pushProfilerRange([[maybe_unused]] const String& name, [[maybe_unused]] Int32 color_rgb) override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    if (color_rgb >= 0) {
      // NOTE: It would be necessary to do: nvtxEventAttributes_t eventAttrib = { 0 };
      // but this causes many 'missing initializer for member' warnings
      nvtxEventAttributes_t eventAttrib;
      std::memset(&eventAttrib, 0, sizeof(nvtxEventAttributes_t));
      eventAttrib.version = NVTX_VERSION;
      eventAttrib.size = NVTX_EVENT_ATTRIB_STRUCT_SIZE;
      eventAttrib.colorType = NVTX_COLOR_ARGB;
      eventAttrib.color = color_rgb;
      eventAttrib.messageType = NVTX_MESSAGE_TYPE_ASCII;
      eventAttrib.message.ascii = name.localstr();
      nvtxRangePushEx(&eventAttrib);
    }
    else
      nvtxRangePush(name.localstr());
#endif
  }

  // Closes the most recent NVTX range. Does nothing without NVTX support.
  void popProfilerRange() override
  {
#ifdef ARCCORE_HAS_CUDA_NVTOOLSEXT
    nvtxRangePop();
#endif
  }

  void finalize(ITraceMng* tm) override
  {
    finalizeCudaMemoryAllocators(tm);
  }

  // Returns the launch arguments to use for 'kernel_ptr'.
  //
  // - Cooperative launch: the number of blocks is capped to the number of
  //   blocks that can be resident at the same time, scaled by the
  //   cooperative ratio (at least one block).
  // - Otherwise, when computed occupancy is enabled and no shared memory is
  //   requested: the block size comes from the occupancy map and the grid
  //   size is recomputed from 'total_loop_size'.
  // - In every other case 'orig_args' is returned unchanged.
  KernelLaunchArgs computeKernalLaunchArgs(const KernelLaunchArgs& orig_args,
                                           const void* kernel_ptr,
                                           Int64 total_loop_size) override
  {
    Int32 shared_memory = orig_args.sharedMemorySize();
    // Clamp before any use: the occupancy API takes an unsigned size, so a
    // negative value would otherwise be seen as a huge request.
    if (shared_memory < 0)
      shared_memory = 0;

    if (orig_args.isCooperative()) {
      // In cooperative mode, ensure that we do not launch more blocks
      // than the maximum that can reside on the GPU.
      Int32 nb_thread = orig_args.nbThreadPerBlock();
      Int32 nb_block = orig_args.nbBlockPerGrid();
      int nb_block_per_sm = 0;
      ARCCORE_CHECK_CUDA(cudaOccupancyMaxActiveBlocksPerMultiprocessor(&nb_block_per_sm, kernel_ptr, nb_thread, shared_memory));

      int max_block = static_cast<int>((nb_block_per_sm * m_multi_processor_count) * m_cooperative_ratio);
      max_block = std::max(max_block, 1);
      if (nb_block > max_block) {
        KernelLaunchArgs modified_args(orig_args);
        modified_args.setNbBlockPerGrid(max_block);
        return modified_args;
      }
      return orig_args;
    }

    if (!m_use_computed_occupancy)
      return orig_args;
    // For now, we do not perform calculation if shared memory is non-zero.
    if (shared_memory != 0)
      return orig_args;
    Int32 computed_block_size = m_occupancy_map.getNbThreadPerBlock(kernel_ptr);
    if (computed_block_size == 0)
      return orig_args;

    // Here, we use the number of threads per block to achieve a
    // maximum occupancy.
    KernelLaunchArgs modified_args(orig_args);
    // Rounded-up division: enough blocks to cover 'total_loop_size'.
    Int64 big_b = (total_loop_size + computed_block_size - 1) / computed_block_size;
    int blocks_per_grid = CheckedConvert::toInt32(big_b);
    modified_args.setNbBlockPerGrid(blocks_per_grid);
    modified_args.setNbThreadPerBlock(computed_block_size);
    return modified_args;
  }

 public:

  void fillDevices(bool is_verbose);

  // Reads the runtime options from the environment:
  // - ARCANE_USE_COMPUTED_OCCUPANCY: non-zero enables the computed occupancy.
  // - ARCANE_ACCELERATOR_COOPERATIVE_RATIO: percentage, clamped to [10,100],
  //   of the resident blocks used for cooperative launches.
  void build()
  {
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_USE_COMPUTED_OCCUPANCY", true))
      m_use_computed_occupancy = v.value();
    if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_ACCELERATOR_COOPERATIVE_RATIO", true)) {
      Int32 x = v.value();
      x = std::clamp(x, 10, 100);
      m_cooperative_ratio = x / 100.0;
    }
  }

 private:

  // Number of kernel launches notified so far.
  Int64 m_nb_kernel_launched = 0;
  // When true, kernel launch notifications are printed on std::cout.
  bool m_is_verbose = false;
  // When true, block sizes come from the occupancy map.
  bool m_use_computed_occupancy = false;
  // Number of multiprocessors used to bound cooperative launches.
  Int32 m_multi_processor_count = 0;
  // Fraction (0.1 to 1.0) of the resident blocks used for cooperative launches.
  double m_cooperative_ratio = 1.0;
  Impl::DeviceInfoList m_device_info_list;
  OccupancyMap m_occupancy_map;
};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


void CudaRunnerRuntime::

fillDevices(bool is_verbose)

{

  int nb_device = 0;

  ARCCORE_CHECK_CUDA(cudaGetDeviceCount(&nb_device));

  std::ostream& omain = std::cout;

  if (is_verbose)

    omain << "ArcaneCUDA: Initialize Arcane CUDA runtime nb_available_device=" << nb_device << "\n";

  for (int i = 0; i < nb_device; ++i) {

    cudaDeviceProp dp;

    cudaGetDeviceProperties(&dp, i);

    int runtime_version = 0;

    cudaRuntimeGetVersion(&runtime_version);

    int driver_version = 0;

    cudaDriverGetVersion(&driver_version);

    std::ostringstream ostr;

    std::ostream& o = ostr;

    o << "Device " << i << " name=" << dp.name << "\n";

    o << " Driver version = " << (driver_version / 1000) << "." << (driver_version % 1000) << "\n";

    o << " Runtime version = " << (runtime_version / 1000) << "." << (runtime_version % 1000) << "\n";

    o << " computeCapability = " << dp.major << "." << dp.minor << "\n";

    o << " totalGlobalMem = " << dp.totalGlobalMem << "\n";

    o << " sharedMemPerBlock = " << dp.sharedMemPerBlock << "\n";

    o << " sharedMemPerMultiprocessor = " << dp.sharedMemPerMultiprocessor << "\n";

    o << " sharedMemPerBlockOptin = " << dp.sharedMemPerBlockOptin << "\n";

    o << " regsPerBlock = " << dp.regsPerBlock << "\n";

    o << " warpSize = " << dp.warpSize << "\n";

    o << " memPitch = " << dp.memPitch << "\n";

    o << " maxThreadsPerBlock = " << dp.maxThreadsPerBlock << "\n";

    o << " maxBlocksPerMultiProcessor = " << dp.maxBlocksPerMultiProcessor << "\n";

    o << " maxThreadsPerMultiProcessor = " << dp.maxThreadsPerMultiProcessor << "\n";

    o << " totalConstMem = " << dp.totalConstMem << "\n";

    o << " cooperativeLaunch = " << dp.cooperativeLaunch << "\n";

    o << " multiProcessorCount = " << dp.multiProcessorCount << "\n";

    o << " integrated = " << dp.integrated << "\n";

    o << " canMapHostMemory = " << dp.canMapHostMemory << "\n";

    o << " directManagedMemAccessFromHost = " << dp.directManagedMemAccessFromHost << "\n";

    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";

    o << " pageableMemoryAccess = " << dp.pageableMemoryAccess << "\n";

    o << " concurrentManagedAccess = " << dp.concurrentManagedAccess << "\n";

    o << " pageableMemoryAccessUsesHostPageTables = " << dp.pageableMemoryAccessUsesHostPageTables << "\n";

    o << " hostNativeAtomicSupported = " << dp.hostNativeAtomicSupported << "\n";

    o << " maxThreadsDim = " << dp.maxThreadsDim[0] << " " << dp.maxThreadsDim[1]

      << " " << dp.maxThreadsDim[2] << "\n";

    o << " maxGridSize = " << dp.maxGridSize[0] << " " << dp.maxGridSize[1]

      << " " << dp.maxGridSize[2] << "\n";

    o << " pciInfo = " << dp.pciDomainID << " " << dp.pciBusID << " " << dp.pciDeviceID << "\n";

    o << " memoryBusWitdh = " << dp.memoryBusWidth << " bits\n";


    int clock_rate = 0;

    ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&clock_rate, cudaDevAttrClockRate, i));

    o << " clockRate = " << (clock_rate / 1000) << " MHz\n";


    int memory_clock_rate = 0;

    ARCCORE_CHECK_CUDA(cudaDeviceGetAttribute(&memory_clock_rate, cudaDevAttrMemoryClockRate, i));

    o << " memoryClockRate = " << (memory_clock_rate / 1000) << " MHz\n";


    Real memory_bandwith = ((dp.memoryBusWidth * memory_clock_rate * 2.0) / 8.0) / 1.0e6;

    o << " MemoryBandwith = " << memory_bandwith << " GB/s\n";


#if !defined(ARCCORE_USING_CUDA13_OR_GREATER)

    o << " deviceOverlap = " << dp.deviceOverlap << "\n";

    o << " computeMode = " << dp.computeMode << "\n";

    o << " kernelExecTimeoutEnabled = " << dp.kernelExecTimeoutEnabled << "\n";

#endif


    // TODO: We assume that all GPUs are the same and therefore

    // that the number of SM per GPU is the same. This is used to

    // calculate the number of blocks in cooperative mode.

    m_multi_processor_count = dp.multiProcessorCount;


    {

      int least_val = 0;

      int greatest_val = 0;

      ARCCORE_CHECK_CUDA(cudaDeviceGetStreamPriorityRange(&least_val, &greatest_val));

      o << " leastPriority = " << least_val << " greatestPriority = " << greatest_val << "\n";

    }

    std::ostringstream device_uuid_ostr;

    {

      CUdevice device;

      ARCCORE_CHECK_CUDA(cuDeviceGet(&device, i));

      CUuuid device_uuid;

      ARCCORE_CHECK_CUDA(cuDeviceGetUuid(&device_uuid, device));

      o << " deviceUuid=";

      Impl::printUUID(device_uuid_ostr, device_uuid.bytes);

      o << device_uuid_ostr.str();

      o << "\n";

    }

    String description(ostr.str());

    if (is_verbose)

      omain << description;


    DeviceInfo device_info;

    device_info.setDescription(description);

    device_info.setDeviceId(DeviceId(i));

    device_info.setName(dp.name);

    device_info.setWarpSize(dp.warpSize);

    device_info.setUUIDAsString(device_uuid_ostr.str());

    device_info.setSharedMemoryPerBlock(static_cast<Int32>(dp.sharedMemPerBlock));

    device_info.setSharedMemoryPerMultiprocessor(static_cast<Int32>(dp.sharedMemPerMultiprocessor));

    device_info.setSharedMemoryPerBlockOptin(static_cast<Int32>(dp.sharedMemPerBlockOptin));

    device_info.setTotalConstMemory(static_cast<Int32>(dp.totalConstMem));

    device_info.setPCIDomainID(dp.pciDomainID);

    device_info.setPCIBusID(dp.pciBusID);

    device_info.setPCIDeviceID(dp.pciDeviceID);

    m_device_info_list.addDevice(device_info);

  }


  Int32 global_cupti_level = 0;


  // Check if Cupti is active

  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_LEVEL", true))

    global_cupti_level = v.value();

  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_FLUSH", true))

    global_cupti_flush = v.value();

  bool do_print_cupti = true;

  if (auto v = Convert::Type<Int32>::tryParseFromEnvironment("ARCANE_CUPTI_PRINT", true))

    do_print_cupti = (v.value() != 0);


  if (global_cupti_level > 0) {

#ifndef ARCCORE_HAS_CUDA_CUPTI

    ARCCORE_FATAL("Trying to enable CUPTI but Arcane is not compiled with cupti support");

#endif

    global_cupti_info.init(global_cupti_level, do_print_cupti);

    global_cupti_info.start();

  }

}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


class CudaMemoryCopier

: public IMemoryCopier

{


  void copy(ConstMemoryView from, [[maybe_unused]] eMemoryResource from_mem,

            MutableMemoryView to, [[maybe_unused]] eMemoryResource to_mem,

            const RunQueue* queue) override

  {

    if (queue) {

      queue->copyMemory(MemoryCopyArgs(to.bytes(), from.bytes()).addAsync(queue->isAsync()));

      return;

    }

    // 'cudaMemcpyDefault' automatically knows what to do by only considering

    // the pointer values. We should see if using \a from_mem and \a to_mem

    // can improve performance.

    ARCCORE_CHECK_CUDA(cudaMemcpy(to.data(), from.data(), from.bytes().size(), cudaMemcpyDefault));

  }


};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


} // End namespace Arcane::Accelerator::Cuda


using namespace Arcane;


namespace
{
// Unique instances used by the registration entry point of this library.
Accelerator::Cuda::CudaRunnerRuntime global_cuda_runtime;
Accelerator::Cuda::CudaMemoryCopier global_cuda_memory_copier;

/*!
 * \brief Registers \a allocator and its memory pool in the data memory
 * resource manager, for the memory resource reported by the allocator.
 */
void _setAllocator(Accelerator::AcceleratorMemoryAllocatorBase* allocator)
{
  auto* resource_mng = MemoryUtils::getDataMemoryResourceMng()->_internal();
  const eMemoryResource resource = allocator->memoryResource();
  resource_mng->setAllocator(resource, allocator);
  resource_mng->setMemoryPool(resource, allocator->memoryPool());
}

} // namespace


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


// This function is the entry point used when dynamically loading

// this library.
//
// It builds the global CUDA runtime, declares it as the CUDA run-queue
// runtime, initializes the CUDA memory allocators, registers them (with
// their memory pools) and the CUDA memory copier in the data memory
// resource manager, and finally fills the device list.
//
// \param init_info registration information; only isVerbose() is read here
// and forwarded to fillDevices().
//
// NOTE(review): the calls below act on global state and their relative
// order is presumably significant; confirm before reordering.

extern "C" ARCCORE_EXPORT void

arcaneRegisterAcceleratorRuntimecuda(Arcane::Accelerator::RegisterRuntimeInfo& init_info)

{

  using namespace Arcane::Accelerator::Cuda;

  // Build the runtime and make it the one used for CUDA run queues.
  global_cuda_runtime.build();

  Accelerator::Impl::setUsingCUDARuntime(true);

  Accelerator::Impl::setCUDARunQueueRuntime(&global_cuda_runtime);

  initializeCudaMemoryAllocators();

  // Unified memory becomes the default resource for data, and its allocator
  // is the one set as accelerator host memory allocator.
  MemoryUtils::setDefaultDataMemoryResource(eMemoryResource::UnifiedMemory);

  MemoryUtils::setAcceleratorHostMemoryAllocator(&unified_memory_cuda_memory_allocator);

  IMemoryResourceMngInternal* mrm = MemoryUtils::getDataMemoryResourceMng()->_internal();

  mrm->setIsAccelerator(true);

  // Register each allocator (and its memory pool) for its own memory resource.
  _setAllocator(&unified_memory_cuda_memory_allocator);

  _setAllocator(&host_pinned_cuda_memory_allocator);

  _setAllocator(&device_cuda_memory_allocator);

  mrm->setCopier(&global_cuda_memory_copier);

  // Fill the device list; verbosity is taken from the registration information.
  global_cuda_runtime.fillDevices(init_info.isVerbose());

}


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

ARCCORE_FATAL
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
Definition ArccoreGlobal.h:552

ARCCORE_CHECK_POINTER
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
Definition ArccoreGlobal.h:800

Arcane::Accelerator::AcceleratorMemoryAllocatorBase::IUnderlyingAllocator
Definition AcceleratorMemoryAllocatorBase.h:153

Arcane::Accelerator::AcceleratorMemoryAllocatorBase
Base class of a specific allocator for accelerator.
Definition AcceleratorMemoryAllocatorBase.h:133

Arcane::Accelerator::AcceleratorMemoryAllocatorBase::memoryResource
eMemoryResource memoryResource() const final
Memory resource provided by the allocator.
Definition AcceleratorMemoryAllocatorBase.h:201

Arcane::Accelerator::AcceleratorMemoryAllocatorBase::_doInitializeDevice
void _doInitializeDevice(bool default_use_memory_pool=false)
Initialization for Device memory.
Definition AcceleratorMemoryAllocatorBase.cc:147

Arcane::Accelerator::AcceleratorMemoryAllocatorBase::_doInitializeHostPinned
void _doInitializeHostPinned(bool default_use_memory_pool=false)
Initialization for HostPinned memory.
Definition AcceleratorMemoryAllocatorBase.cc:136

Arcane::Accelerator::AcceleratorMemoryAllocatorBase::_doInitializeUVM
void _doInitializeUVM(bool default_use_memory_pool=false)
Initialization for UVM memory.
Definition AcceleratorMemoryAllocatorBase.cc:120

Arcane::Accelerator::Cuda::ConcreteAllocator
Definition CudaAcceleratorRuntime.cc:95

Arcane::Accelerator::Cuda::CudaMemoryCopier
Definition CudaAcceleratorRuntime.cc:1065

Arcane::Accelerator::Cuda::CudaMemoryCopier::copy
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copies the data from `from` to `to` using the queue `queue`.
Definition CudaAcceleratorRuntime.cc:1066

Arcane::Accelerator::Cuda::CudaRunQueueEvent
Definition CudaAcceleratorRuntime.cc:611

Arcane::Accelerator::Cuda::CudaRunQueueStream
Definition CudaAcceleratorRuntime.cc:510

Arcane::Accelerator::Cuda::CudaRunQueueStream::barrier
void barrier() override
Blocks until all actions associated with this queue are finished.
Definition CudaAcceleratorRuntime.cc:548

Arcane::Accelerator::Cuda::CudaRunQueueStream::notifyBeginLaunchKernel
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification before command launch.
Definition CudaAcceleratorRuntime.cc:530

Arcane::Accelerator::Cuda::CudaRunQueueStream::_barrierNoException
bool _barrierNoException() override
Barrier without exception. Returns true in case of error.
Definition CudaAcceleratorRuntime.cc:554

Arcane::Accelerator::Cuda::CudaRunQueueStream::nativeStream
Impl::NativeStream nativeStream() override
Pointer to the internal structure dependent on the implementation.
Definition CudaAcceleratorRuntime.cc:588

Arcane::Accelerator::Cuda::CudaRunQueueStream::prefetchMemory
void prefetchMemory(const MemoryPrefetchArgs &args) override
Performs a prefetch of a memory region.
Definition CudaAcceleratorRuntime.cc:567

Arcane::Accelerator::Cuda::CudaRunQueueStream::notifyEndLaunchKernel
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification of command launch completion.
Definition CudaAcceleratorRuntime.cc:541

Arcane::Accelerator::Cuda::CudaRunQueueStream::copyMemory
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory regions.
Definition CudaAcceleratorRuntime.cc:558

Arcane::Accelerator::Cuda::CudaRunnerRuntime
Definition CudaAcceleratorRuntime.cc:681

Arcane::Accelerator::Cuda::CuptiInfo
Singleton class to manage CUPTI.
Definition Cupti.h:39

Arcane::Accelerator::Cuda::DeviceCudaMemoryAllocator
Definition CudaAcceleratorRuntime.cc:386

Arcane::Accelerator::Cuda::HostPinnedConcreteAllocator
Definition CudaAcceleratorRuntime.cc:301

Arcane::Accelerator::Cuda::HostPinnedCudaMemoryAllocator
Definition CudaAcceleratorRuntime.cc:320

Arcane::Accelerator::Cuda::OccupancyMap
Map containing the ideal occupancy for a given kernel.
Definition CudaAcceleratorRuntime.cc:463

Arcane::Accelerator::Cuda::UnderlyingAllocator
Definition CudaAcceleratorRuntime.cc:112

Arcane::Accelerator::Cuda::UnderlyingAllocator::allocateMemory
void * allocateMemory(Int64 size) final
Allocates a block for size bytes.
Definition CudaAcceleratorRuntime.cc:119

Arcane::Accelerator::Cuda::UnderlyingAllocator::freeMemory
void freeMemory(void *ptr, Int64 size) final
Frees the block located at address `ptr` containing `size` bytes.
Definition CudaAcceleratorRuntime.cc:125

Arcane::Accelerator::Cuda::UnifiedMemoryConcreteAllocator::m_use_hint_as_mainly_device
bool m_use_hint_as_mainly_device
Definition CudaAcceleratorRuntime.cc:212

Arcane::Accelerator::Cuda::UnifiedMemoryCudaMemoryAllocator
Allocator for unified memory.
Definition CudaAcceleratorRuntime.cc:227

Arcane::Accelerator::Cuda::UnifiedMemoryCudaMemoryAllocator::notifyMemoryArgsChanged
void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
Notifies of a change in instance-specific arguments.
Definition CudaAcceleratorRuntime.cc:245

Arcane::Accelerator::DeviceId
Identifier of a system component.
Definition arccore/src/common/arccore/common/accelerator/DeviceId.h:34

Arcane::Accelerator::DeviceId::isHost
bool isHost() const
Indicates if the instance is associated with the host.
Definition arccore/src/common/arccore/common/accelerator/DeviceId.h:61

Arcane::Accelerator::DeviceId::asInt32
Int32 asInt32() const
Numerical value of the device.
Definition arccore/src/common/arccore/common/accelerator/DeviceId.h:70

Arcane::Accelerator::DeviceId::isAccelerator
bool isAccelerator() const
Indicates if the instance is associated with an accelerator.
Definition arccore/src/common/arccore/common/accelerator/DeviceId.h:67

Arcane::Accelerator::DeviceInfo
Information about an accelerator.
Definition arccore/src/common/arccore/common/accelerator/DeviceInfo.h:33

Arcane::Accelerator::DeviceMemoryInfo
Accelerator memory information.
Definition arccore/src/common/arccore/common/accelerator/DeviceMemoryInfo.h:32

Arcane::Accelerator::IDeviceInfoList
Interface of a list of devices.
Definition arccore/src/common/arccore/common/accelerator/IDeviceInfoList.h:32

Arcane::Accelerator::Impl::DeviceInfoList
Interface for a list of devices.
Definition arccore/src/common/arccore/common/accelerator/DeviceInfoList.h:36

Arcane::Accelerator::Impl::IRunQueueEventImpl
Interface for event implementation.
Definition IRunQueueEventImpl.h:33

Arcane::Accelerator::Impl::IRunQueueStream
Interface of an execution stream for a RunQueue.
Definition IRunQueueStream.h:33

Arcane::Accelerator::Impl::IRunnerRuntime
Interface of the runtime associated with an accelerator.
Definition IRunnerRuntime.h:36

Arcane::Accelerator::Impl::KernelLaunchArgs
Arguments for launching a kernel.
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:33

Arcane::Accelerator::Impl::KernelLaunchArgs::isCooperative
bool isCooperative() const
Indicates if running in cooperative mode (i.e. cudaLaunchCooperativeKernel).
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:63

Arcane::Accelerator::Impl::KernelLaunchArgs::nbBlockPerGrid
Int32 nbBlockPerGrid() const
Number of grid blocks.
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:48

Arcane::Accelerator::Impl::KernelLaunchArgs::setNbThreadPerBlock
void setNbThreadPerBlock(Int32 v)
Number of threads per block.
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:55

Arcane::Accelerator::Impl::KernelLaunchArgs::setNbBlockPerGrid
void setNbBlockPerGrid(Int32 v)
Number of grid blocks.
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:50

Arcane::Accelerator::Impl::KernelLaunchArgs::nbThreadPerBlock
Int32 nbThreadPerBlock() const
Number of threads per block.
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:53

Arcane::Accelerator::Impl::KernelLaunchArgs::sharedMemorySize
Int32 sharedMemorySize() const
Shared memory to allocate for the kernel.
Definition arccore/src/common/arccore/common/accelerator/KernelLaunchArgs.h:58

Arcane::Accelerator::Impl::NativeStream
Opaque type to encapsulate a native 'stream'.
Definition arccore/src/common/arccore/common/accelerator/NativeStream.h:55

Arcane::Accelerator::Impl::RunCommandImpl
Implementation of a command for accelerator.
Definition arccore/src/common/arccore/common/accelerator/internal/RunCommandImpl.h:41

Arcane::Accelerator::MemoryCopyArgs
Memory copy arguments.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:63

Arcane::Accelerator::MemoryPrefetchArgs
Memory prefetching arguments.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:126

Arcane::Accelerator::PointerAttribute
Information about a memory address.
Definition arccore/src/common/arccore/common/accelerator/PointerAttribute.h:37

Arcane::Accelerator::RunQueueBuildInfo
Information to create a RunQueue.
Definition arccore/src/common/arccore/common/accelerator/RunQueueBuildInfo.h:32

Arcane::Accelerator::RunQueueBuildInfo::isDefault
bool isDefault() const
Indicates if the instance only has default values.
Definition arccore/src/common/arccore/common/accelerator/RunQueueBuildInfo.h:53

Arcane::Accelerator::RunQueue
Execution queue for an accelerator.
Definition arccore/src/common/arccore/common/accelerator/RunQueue.h:53

Arcane::Accelerator::RunQueue::isAsync
bool isAsync() const
Indicates if the execution queue is asynchronous.
Definition RunQueue.cc:320

Arcane::Accelerator::RunQueue::copyMemory
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory regions.
Definition RunQueue.cc:237

Arcane::AllocatedMemoryInfo
Information about an allocated memory region.
Definition AllocatedMemoryInfo.h:32

Arcane::ConstMemoryView
Constant view on a contiguous memory region containing fixed-size elements.
Definition arccore/src/base/arccore/base/MemoryView.h:39

Arcane::ConstMemoryView::bytes
constexpr SpanType bytes() const
View in byte form.
Definition arccore/src/base/arccore/base/MemoryView.h:108

Arcane::ConstMemoryView::data
constexpr const std::byte * data() const
Pointer to the memory region.
Definition arccore/src/base/arccore/base/MemoryView.h:111

Arcane::Convert::Type
Template class for converting a type.
Definition arccore/src/base/arccore/base/Convert.h:151

Arcane::IMemoryCopier
Interface for memory copies with accelerator support.
Definition IMemoryCopier.h:33

Arcane::IMemoryResourceMngInternal
Internal part of Arcane's 'IMemoryResourceMng'.
Definition IMemoryResourceMngInternal.h:32

Arcane::IMemoryResourceMngInternal::setAllocator
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for resource r.

Arcane::IMemoryResourceMngInternal::setMemoryPool
virtual void setMemoryPool(eMemoryResource r, IMemoryPool *pool)=0
Sets the memory pool for resource r.

Arcane::IMemoryResourceMngInternal::setIsAccelerator
virtual void setIsAccelerator(bool v)=0
Indicates if an accelerator is available.

Arcane::IMemoryResourceMngInternal::setCopier
virtual void setCopier(IMemoryCopier *copier)=0
Sets the copying instance.

Arcane::IMemoryResourceMng::_internal
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.

Arcane::ITraceMng
Trace manager interface.
Definition arccore/src/trace/arccore/trace/ITraceMng.h:175

Arcane::MemoryAllocationArgs
Class containing information to specialize allocations.
Definition common/arccore/common/MemoryAllocationArgs.h:32

Arcane::MutableMemoryView
Mutable view on a contiguous memory region containing fixed-size elements.
Definition arccore/src/base/arccore/base/MemoryView.h:158

Arcane::MutableMemoryView::data
constexpr std::byte * data() const
Pointer to the memory region.
Definition arccore/src/base/arccore/base/MemoryView.h:220

Arcane::MutableMemoryView::bytes
constexpr SpanType bytes() const
View in byte form.
Definition arccore/src/base/arccore/base/MemoryView.h:217

Arcane::SpanImpl::data
constexpr __host__ __device__ pointer data() const noexcept
Pointer to the start of the view.
Definition Span.h:537

Arcane::SpanImpl::size
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
Definition Span.h:325

Arcane::String
Unicode character string.
Definition arccore/src/base/arccore/base/String.h:70

Arcane::String::localstr
const char * localstr() const
Returns the conversion of the instance into UTF-8 encoding.
Definition String.cc:229

Arcane::TraceInfo
Trace information.
Definition arccore/src/base/arccore/base/TraceInfo.h:34

Arcane::Accelerator::eMemoryAdvice
eMemoryAdvice
Memory management advice.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:35

Arcane::Accelerator::eMemoryAdvice::AccessedByHost
@ AccessedByHost
Indicates that the memory region is accessed by the host.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:47

Arcane::Accelerator::eMemoryAdvice::PreferredLocationDevice
@ PreferredLocationDevice
Prefers memory placement on the accelerator.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:41

Arcane::Accelerator::eMemoryAdvice::MostlyRead
@ MostlyRead
Indicates that the memory region is primarily read-only.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:39

Arcane::Accelerator::eMemoryAdvice::PreferredLocationHost
@ PreferredLocationHost
Prefers memory placement on the host.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:43

Arcane::Accelerator::eMemoryAdvice::AccessedByDevice
@ AccessedByDevice
Indicates that the memory region is accessed by the device.
Definition arccore/src/common/arccore/common/accelerator/Memory.h:45

Arcane::Accelerator::ePointerMemoryType
ePointerMemoryType
Memory type for a pointer.
Definition CommonAcceleratorGlobal.h:153

Arcane::Accelerator::eExecutionPolicy
eExecutionPolicy
Execution policy for a Runner.
Definition CommonAcceleratorGlobal.h:90

Arcane::Accelerator::eExecutionPolicy::CUDA
@ CUDA
Execution policy using the CUDA environment.
Definition CommonAcceleratorGlobal.h:98

Arcane::MemoryUtils::getDataMemoryResourceMng
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
Definition arccore/src/common/arccore/common/MemoryUtils.cc:131

Arcane::MemoryUtils::setAcceleratorHostMemoryAllocator
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the specific allocator for accelerators.
Definition arccore/src/common/arccore/common/MemoryUtils.cc:152

Arcane::MemoryUtils::setDefaultDataMemoryResource
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
Definition arccore/src/common/arccore/common/MemoryUtils.cc:110

Arcane
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
Definition arcane/src/arcane/accelerator/AcceleratorGlobal.h:37

Arcane::Int64
std::int64_t Int64
Signed integer type of 64 bits.
Definition ArccoreGlobal.h:235

Arcane::eMemoryLocationHint
eMemoryLocationHint
Indices for expected memory location.
Definition CommonGlobal.h:131

Arcane::eMemoryLocationHint::MainlyHost
@ MainlyHost
Indicates that the data will primarily be used on the CPU.
Definition CommonGlobal.h:137

Arcane::eMemoryLocationHint::None
@ None
No hint.
Definition CommonGlobal.h:133

Arcane::eMemoryLocationHint::HostAndDeviceMostlyRead
@ HostAndDeviceMostlyRead
Indicates that the data will be used both on the accelerator and on the CPU and will not be frequently modified.
Definition CommonGlobal.h:142

Arcane::eMemoryLocationHint::MainlyDevice
@ MainlyDevice
Indicates that the data will primarily be used on the accelerator.
Definition CommonGlobal.h:135

Arcane::Real
double Real
Type representing a real number.
Definition ArccoreGlobal.h:275

Arcane::eMemoryResource
eMemoryResource
List of available memory resources.
Definition CommonGlobal.h:179

Arcane::eMemoryResource::HostPinned
@ HostPinned
Allocates on the host.
Definition CommonGlobal.h:185

Arcane::eMemoryResource::UnifiedMemory
@ UnifiedMemory
Allocates using unified memory.
Definition CommonGlobal.h:189

Arcane::eMemoryResource::Device
@ Device
Allocates on the device.
Definition CommonGlobal.h:187

Arcane::Int32
std::int32_t Int32
Signed integer type of 32 bits.
Definition ArccoreGlobal.h:233