14#include "arccore/accelerator_native/SyclAccelerator.h"
16#include "arccore/base/FatalErrorException.h"
17#include "arccore/base/NotImplementedException.h"
18#include "arccore/base/NotSupportedException.h"
20#include "arccore/common/AlignedMemoryAllocator.h"
21#include "arccore/common/AllocatedMemoryInfo.h"
22#include "arccore/common/internal/MemoryUtilsInternal.h"
23#include "arccore/common/internal/IMemoryResourceMngInternal.h"
25#include "arccore/common/accelerator/RunQueueBuildInfo.h"
26#include "arccore/common/accelerator/Memory.h"
27#include "arccore/common/accelerator/DeviceInfoList.h"
28#include "arccore/common/accelerator/KernelLaunchArgs.h"
29#include "arccore/common/accelerator/RunQueue.h"
30#include "arccore/common/accelerator/DeviceMemoryInfo.h"
31#include "arccore/common/accelerator/NativeStream.h"
32#include "arccore/common/accelerator/internal/IRunnerRuntime.h"
33#include "arccore/common/accelerator/internal/RegisterRuntimeInfo.h"
34#include "arccore/common/accelerator/internal/RunCommandImpl.h"
35#include "arccore/common/accelerator/internal/IRunQueueStream.h"
36#include "arccore/common/accelerator/internal/IRunQueueEventImpl.h"
38namespace Arcane::Accelerator::Sycl
40using Arcane::Accelerator::Impl::KernelLaunchArgs;
42#define ARCCORE_SYCL_FUNC_NOT_HANDLED \
43 std::cout << "WARNING: SYCL: function not handled " << A_FUNCINFO << "\n"
53sycl::queue global_default_queue;
56 sycl::queue& _defaultQueue()
58 return global_default_queue;
68class SyclMemoryAllocatorBase
69:
public AlignedMemoryAllocator
73 SyclMemoryAllocatorBase()
74 : AlignedMemoryAllocator(128)
80 sycl::queue& q = _defaultQueue();
82 _allocate(&out, new_size, args, q);
87 ARCCORE_FATAL(
"Bad alignment for SYCL allocator: offset={0}", (a % 128));
88 return { out, new_size };
92 sycl::queue& q = _defaultQueue();
94 q.submit([&](sycl::handler& cgh) {
95 cgh.memcpy(a.baseAddress(), current_ptr.
baseAddress(), current_ptr.
size());
104 sycl::queue& q = _defaultQueue();
118:
public SyclMemoryAllocatorBase
124 *ptr = sycl::malloc_shared(new_size, q);
137:
public SyclMemoryAllocatorBase
144 *ptr = sycl::malloc_host(new_size, q);
157:
public SyclMemoryAllocatorBase
163 *ptr = sycl::malloc_device(new_size, q);
177 UnifiedMemorySyclMemoryAllocator unified_memory_sycl_memory_allocator;
178 HostPinnedSyclMemoryAllocator host_pinned_sycl_memory_allocator;
179 DeviceSyclMemoryAllocator device_sycl_memory_allocator;
185class SyclRunQueueStream
191 ~SyclRunQueueStream()
override
199 return m_runtime->notifyBeginLaunchKernel();
203 return m_runtime->notifyEndLaunchKernel();
207 m_sycl_stream->wait_and_throw();
211 m_sycl_stream->wait();
216 auto source_bytes = args.source().
bytes();
217 m_sycl_stream->memcpy(args.destination().
data(), source_bytes.data(),
218 source_bytes.size());
224 auto source_bytes = args.source().bytes();
225 Int64 nb_byte = source_bytes.size();
228 m_sycl_stream->prefetch(source_bytes.data(), nb_byte);
239 sycl::event last_event;
241 last_event = *(
reinterpret_cast<sycl::event*
>(sycl_event_ptr));
242 m_last_command_event = last_event;
247 static sycl::async_handler _getAsyncHandler()
249 auto err_handler = [](
const sycl::exception_list& exceptions) {
250 std::ostringstream ostr;
251 ostr <<
"Error in SYCL runtime\n";
252 for (
const std::exception_ptr& e : exceptions) {
254 std::rethrow_exception(e);
256 catch (
const sycl::exception& e) {
257 ostr <<
"SYCL exception: " << e.what() <<
"\n";
270 sycl::queue& trueStream()
const
272 return *m_sycl_stream;
278 std::unique_ptr<sycl::queue> m_sycl_stream;
279 sycl::event m_last_command_event;
285class SyclRunQueueEvent
290 explicit SyclRunQueueEvent([[maybe_unused]]
bool has_timer)
293 ~SyclRunQueueEvent()
override
305#if defined(__ADAPTIVECPP__)
306 m_recorded_stream = stream;
308#elif defined(__INTEL_LLVM_COMPILER)
324#if defined(__ADAPTIVECPP__)
326 m_sycl_event.wait(rq->trueStream().get_wait_list());
327#elif defined(__INTEL_LLVM_COMPILER)
328 std::vector<sycl::event> events;
329 events.push_back(m_sycl_event);
331 rq->trueStream().ext_oneapi_submit_barrier(events);
337 Int64 elapsedTime([[maybe_unused]] IRunQueueEventImpl* start_event)
final
342 sycl::event
event = (
static_cast<SyclRunQueueEvent*
>(start_event))->m_sycl_event;
344 if (event == sycl::event())
347 bool is_submitted =
event.get_info<sycl::info::event::command_execution_status>() == sycl::info::event_command_status::complete;
350 Int64 start =
event.get_profiling_info<sycl::info::event_profiling::command_start>();
351 Int64 end =
event.get_profiling_info<sycl::info::event_profiling::command_end>();
352 return (end - start);
355 bool hasPendingWork()
final
362 sycl::event m_sycl_event;
372 friend class SyclRunQueueStream;
376 void notifyBeginLaunchKernel()
override
379 void notifyEndLaunchKernel()
override
382 void barrier()
override
386 m_default_queue->wait();
394 return new SyclRunQueueStream(
this, bi);
405 [[maybe_unused]]
DeviceId device_id)
override
413 void setCurrentDevice([[maybe_unused]]
DeviceId device_id)
final
415 ARCCORE_SYCL_FUNC_NOT_HANDLED;
417 const IDeviceInfoList* deviceInfoList()
override {
return &m_device_info_list; }
419 void getPointerAttribute(
PointerAttribute& attribute,
const void* ptr)
override
421 sycl::usm::alloc sycl_mem_type = sycl::get_pointer_type(ptr, *m_default_context);
423 const void* host_ptr =
nullptr;
424 const void* device_ptr =
nullptr;
425 if (sycl_mem_type == sycl::usm::alloc::host) {
428 mem_type = ePointerMemoryType::Host;
433 else if (sycl_mem_type == sycl::usm::alloc::device) {
434 mem_type = ePointerMemoryType::Device;
437 else if (sycl_mem_type == sycl::usm::alloc::shared) {
438 mem_type = ePointerMemoryType::Managed;
446 _fillPointerAttribute(attribute, mem_type, device_id, ptr, device_ptr, host_ptr);
455 const void* kernel_ptr,
456 Int64 total_loop_size)
override
467 int nb_block_per_sm = 4;
468 int max_block = nb_block_per_sm * m_multi_processor_count;
469 if (nb_block > max_block) {
472 return modified_args;
478 void fillDevicesAndSetDefaultQueue(
bool is_verbose);
479 sycl::queue& defaultQueue()
const {
return *m_default_queue; }
480 sycl::device& defaultDevice()
const {
return *m_default_device; }
485 global_default_queue = sycl::queue{};
491 std::unique_ptr<sycl::device> m_default_device;
492 std::unique_ptr<sycl::context> m_default_context;
493 std::unique_ptr<sycl::queue> m_default_queue;
494 int m_multi_processor_count = 0;
498 void _init(sycl::device& device)
500 m_default_device = std::make_unique<sycl::device>(device);
501 m_default_queue = std::make_unique<sycl::queue>(device);
502 m_default_context = std::make_unique<sycl::context>(device);
513 sycl::device& d = runtime->defaultDevice();
516 auto queue_property = sycl::property::queue::in_order();
518 auto profiling_property = sycl::property::queue::enable_profiling();
519 sycl::property_list queue_properties(queue_property, profiling_property);
522 sycl::async_handler err_handler;
523 err_handler = _getAsyncHandler();
525 m_sycl_stream = std::make_unique<sycl::queue>(d, err_handler, queue_properties);
527 ARCCORE_SYCL_FUNC_NOT_HANDLED;
528 m_sycl_stream = std::make_unique<sycl::queue>(d, err_handler, queue_properties);
535void SyclRunnerRuntime::
536fillDevicesAndSetDefaultQueue(
bool is_verbose)
539 for (
auto platform : sycl::platform::get_platforms()) {
540 std::cout <<
"Platform: "
541 <<
platform.get_info<sycl::info::platform::name>()
546 sycl::device device{ sycl::gpu_selector_v };
548 std::cout <<
"\nDevice: " << device.get_info<sycl::info::device::name>()
549 <<
"\nVersion=" << device.get_info<sycl::info::device::version>()
550 <<
"\nDriverVersion=" << device.get_info<sycl::info::device::driver_version>()
551 <<
"\nMaxComputeUnits=" << device.get_info<sycl::info::device::max_compute_units>()
552 <<
"\nMaxWorkGroupSize=" << device.get_info<sycl::info::device::max_work_group_size>()
553 <<
"\nLocalMemSize=" << device.get_info<sycl::info::device::local_mem_size>()
554 <<
"\nGlobalMemSize=" << device.get_info<sycl::info::device::global_mem_size>()
555 <<
"\nMaxMemAllocSize=" << device.get_info<sycl::info::device::max_mem_alloc_size>()
557 m_multi_processor_count = device.get_info<sycl::info::device::max_compute_units>();
562 DeviceInfo device_info;
563 device_info.setDescription(
"No description info");
564 device_info.setDeviceId(DeviceId(0));
565 device_info.setName(device.get_info<sycl::info::device::name>());
566 m_device_info_list.addDevice(device_info);
594namespace Arcane::Accelerator::Sycl
600void SyclMemoryCopier::
609 sycl::queue& q = global_sycl_runtime.defaultQueue();
620extern "C" ARCCORE_EXPORT
void
624 using namespace Arcane::Accelerator::Sycl;
625 Arcane::Accelerator::Impl::setUsingSYCLRuntime(
true);
626 Arcane::Accelerator::Impl::setSYCLRunQueueRuntime(&global_sycl_runtime);
634 mrm->
setCopier(&global_sycl_memory_copier);
635 global_sycl_runtime.fillDevicesAndSetDefaultQueue(init_info.isVerbose());
636 global_default_queue = global_sycl_runtime.defaultQueue();
#define ARCCORE_FATAL(...)
Macro throwing a FatalErrorException.
#define ARCCORE_THROW(exception_class,...)
Macro to throw an exception with formatting.
#define ARCCORE_CHECK_POINTER(ptr)
Macro that returns the pointer ptr if it is not null or throws an exception if it is null.
Identifier of a system component.
Accelerator memory information.
Interface of a list of devices.
Interface for a list of devices.
Interface for event implementation.
Interface of an execution stream for a RunQueue.
Interface of the runtime associated with an accelerator.
Arguments for launching a kernel.
bool isCooperative() const
Indicates if running in cooperative mode (i.e. cudaLaunchCooperativeKernel).
Int32 nbBlockPerGrid() const
Number of grid blocks.
void setNbBlockPerGrid(Int32 v)
Number of grid blocks.
Int32 sharedMemorySize() const
Shared memory to allocate for the kernel.
Opaque type to encapsulate a native 'stream'.
Implementation of a command for accelerator.
Memory prefetching arguments.
Information about a memory address.
Information to initialize the accelerator runtime.
Information to create a RunQueue.
bool isDefault() const
Indicates if the instance only has default values.
Execution queue for an accelerator.
bool isAsync() const
Indicates if the execution queue is asynchronous.
void copyMemory(const MemoryCopyArgs &args) const
Copies information between two memory regions.
eMemoryResource memoryResource() const override
Memory resource provided by the allocator.
eMemoryResource memoryResource() const override
Memory resource provided by the allocator.
bool hasRealloc(MemoryAllocationArgs) const override
Indicates whether the allocator supports realloc semantics.
AllocatedMemoryInfo reallocate(MemoryAllocationArgs args, AllocatedMemoryInfo current_ptr, Int64 new_size) override
Reallocates memory for new_size bytes and returns the pointer.
AllocatedMemoryInfo allocate(MemoryAllocationArgs args, Int64 new_size) override
void deallocate(MemoryAllocationArgs args, AllocatedMemoryInfo ptr) override
Frees the memory whose base address is ptr.
void copy(ConstMemoryView from, eMemoryResource from_mem, MutableMemoryView to, eMemoryResource to_mem, const RunQueue *queue) override
Copies the data from from to to with the queue queue.
void notifyBeginLaunchKernel(Impl::RunCommandImpl &c) override
Notification before command launch.
void barrier() override
Blocks until all actions associated with this queue are finished.
sycl::event lastCommandEvent()
Event corresponding to the last command.
void prefetchMemory(const MemoryPrefetchArgs &args) override
Performs a prefetch of a memory region.
Impl::NativeStream nativeStream() override
Pointer to the internal structure dependent on the implementation.
void copyMemory(const MemoryCopyArgs &args) override
Performs a copy between two memory regions.
bool _barrierNoException() override
Barrier without exception. Returns true in case of error.
void _setSyclLastCommandEvent(void *sycl_event_ptr) override
For SYCL, positions the event associated with the last executed command.
void notifyEndLaunchKernel(Impl::RunCommandImpl &) override
Notification of command launch completion.
eMemoryResource memoryResource() const override
Memory resource provided by the allocator.
Information about an allocated memory region.
void * baseAddress() const
Address of the start of the allocated region.
Int64 size() const
Size in bytes of the used memory region. (-1) if unknown.
Constant view on a contiguous memory region containing fixed-size elements.
constexpr SpanType bytes() const
View in byte form.
constexpr const std::byte * data() const
Pointer to the memory region.
Interface for memory copies with accelerator support.
Internal part of Arcane's 'IMemoryResourceMng'.
virtual void setAllocator(eMemoryResource r, IMemoryAllocator *allocator)=0
Sets the allocator for resource r.
virtual void setIsAccelerator(bool v)=0
Indicates if an accelerator is available.
virtual void setCopier(IMemoryCopier *copier)=0
Sets the copying instance.
virtual IMemoryResourceMngInternal * _internal()=0
Internal interface.
Class containing information to specialize allocations.
Mutable view on a contiguous memory region containing fixed-size elements.
constexpr std::byte * data() const
Pointer to the memory region.
constexpr SpanType bytes() const
View in byte form.
Exception when a function is not implemented.
Exception when an operation is not supported.
constexpr __host__ __device__ SizeType size() const noexcept
Returns the size of the array.
eMemoryAdvice
Memory management advice.
ePointerMemoryType
Memory type for a pointer.
eExecutionPolicy
Execution policy for a Runner.
@ SYCL
Execution policy using the SYCL environment.
IMemoryRessourceMng * getDataMemoryResourceMng()
Memory resource manager for data.
IMemoryAllocator * setAcceleratorHostMemoryAllocator(IMemoryAllocator *a)
Sets the specific allocator for accelerators.
void setDefaultDataMemoryResource(eMemoryResource mem_resource)
Sets the memory resource used for the data memory allocator.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
eMemoryResource
List of available memory resources.
@ HostPinned
Allocates on the host.
@ UnifiedMemory
Allocates using unified memory.
@ Device
Allocates on the device.
std::int32_t Int32
Signed integer type of 32 bits.