  void initialize(Int64 block_size, bool do_block_alloc)
  {
    m_block_size = block_size;
    m_do_block_allocate = do_block_alloc;
  }
  void dumpStats(std::ostream& ostr, const String& name)
  {
    ostr << "Allocator '" << name << "' : nb_allocate=" << m_nb_allocate << "\n";
  }
  Int64 adjustedCapacity(Int64 wanted_capacity, Int64 element_size) const
  {
    if (!m_do_block_allocate)
      return wanted_capacity;
    // Round the allocation up to a multiple of the block size.
    Int64 orig_capacity = wanted_capacity;
    Int64 new_size = orig_capacity * element_size;
    Int64 block_size = m_block_size;
    // For small allocations, shrink the effective block size (assumed halving
    // step) so that an allocation spans at least four blocks.
    Int64 nb_iter = 4 + (4096 / block_size);
    for (Int64 i = 0; i < nb_iter; ++i) {
      if (new_size >= (4 * block_size))
        break;
      block_size /= 2;
    }
    new_size = _computeNextMultiple(new_size, block_size);
    wanted_capacity = new_size / element_size;
    if (wanted_capacity < orig_capacity)
      wanted_capacity = orig_capacity;
    return wanted_capacity;
  }
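  // Worked example (hypothetical values, assuming the halving step above and
  // a 4096-byte block size): wanted_capacity=100 with element_size=8 gives
  // new_size=800; the loop halves block_size down to 128, _computeNextMultiple
  // rounds 800 up to 896, and the adjusted capacity is 896/8 = 112 elements.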
  void doAllocate(void* ptr, [[maybe_unused]] size_t new_size)
  {
    ++m_nb_allocate;
    uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
    // Assumed bookkeeping: count allocations not aligned to the block size.
    if ((addr % m_block_size) != 0)
      ++m_nb_unaligned_allocate;
  }
  static Int64 _computeNextMultiple(Int64 n, Int64 multiple)
  {
    Int64 new_n = n / multiple;
    if ((n % multiple) != 0)
      ++new_n;
    return (new_n * multiple);
  }
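  // For instance, _computeNextMultiple(1000, 128) returns 1024: 1000/128 = 7
  // with a non-zero remainder, so the result rounds up to 8 * 128.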
class CudaMemoryAllocatorBase
: public Arccore::AlignedMemoryAllocator3
{
 public:

  using BaseClass = Arccore::AlignedMemoryAllocator3;
  //! Interface for the concrete CUDA allocation calls.
  class ConcreteAllocator
  {
   public:

    virtual ~ConcreteAllocator() = default;
    virtual cudaError_t _allocate(void** ptr, size_t new_size) = 0;
    virtual cudaError_t _deallocate(void* ptr) = 0;
  };
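  // Concrete subclasses supply the actual CUDA calls: the unified-memory
  // allocator below implements _allocate() with cudaMallocManaged() and
  // _deallocate() with cudaFree(); host-pinned or device-only variants would
  // use cudaMallocHost()/cudaFreeHost() or cudaMalloc()/cudaFree() instead.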
  //! Allocator view passed to the memory pool; forwards to the enclosing class.
  class UnderlyingAllocator
  : public IMemoryPoolAllocator
  {
   public:
    explicit UnderlyingAllocator(CudaMemoryAllocatorBase* v)
    : m_base(v) {}
    void* allocateMemory(size_t size) override
    {
      void* out = nullptr;
      ARCANE_CHECK_CUDA(m_base->m_concrete_allocator->_allocate(&out, size));
      m_base->m_block_wrapper.doAllocate(out, size);
      return out;
    }
    void freeMemory(void* ptr, [[maybe_unused]] size_t size) override
    {
      ARCANE_CHECK_CUDA_NOTHROW(m_base->m_concrete_allocator->_deallocate(ptr));
    }
    CudaMemoryAllocatorBase* m_base = nullptr;
  };
  CudaMemoryAllocatorBase(const String& allocator_name, ConcreteAllocator* concrete_allocator)
  : AlignedMemoryAllocator3(128)
  , m_concrete_allocator(concrete_allocator)
  , m_direct_sub_allocator(this)
  , m_memory_pool(&m_direct_sub_allocator, allocator_name)
  , m_sub_allocator(&m_direct_sub_allocator)
  , m_allocator_name(allocator_name)
  {
    // 'v' is an optional verbosity level parsed from the environment.
    m_print_level = v.value();
  }
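  // Two allocation paths are wired up here: m_direct_sub_allocator forwards
  // every request straight to the concrete CUDA allocator, while
  // m_memory_pool caches freed blocks in front of it. m_sub_allocator points
  // at whichever path is currently active (see _setUseMemoryPool() below).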
  ~CudaMemoryAllocatorBase()
  {
    if (m_print_level >= 1) {
      OStringStream ostr;
      if (m_use_memory_pool) {
        m_memory_pool.dumpStats(ostr());
        m_memory_pool.dumpFreeMap(ostr());
      }
      ostr() << "Allocator '" << m_allocator_name << "' nb_realloc=" << m_nb_reallocate
             << " realloc_copy=" << m_reallocate_size << "\n";
      m_block_wrapper.dumpStats(ostr(), m_allocator_name);
      // Report through the trace manager 'tm' when available, else stdout.
      if (tm)
        tm->info() << ostr.str();
      else
        std::cout << ostr.str();
    }
    m_memory_pool.freeCachedMemory();
  }
  bool hasRealloc(MemoryAllocationArgs) const final { return true; }
  AllocatedMemoryInfo allocate(MemoryAllocationArgs args, Int64 new_size) final
  {
    void* out = m_sub_allocator->allocateMemory(new_size);
    Int64 a = reinterpret_cast<Int64>(out);
    if ((a % 128) != 0)
      ARCANE_FATAL("Bad alignment for CUDA allocator: offset={0}", (a % 128));
    m_tracer.traceAllocate(out, new_size, args);
    _applyHint(out, new_size, args);
    return { out, new_size };
  }
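  // The 128-byte check mirrors the AlignedMemoryAllocator3(128) base setting.
  // cudaMalloc()/cudaMallocManaged() typically return memory aligned to at
  // least 256 bytes, so this should only fire for a misbehaving underlying
  // allocator (e.g. a bad aligned_alloc() path).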
  AllocatedMemoryInfo reallocate(MemoryAllocationArgs args, AllocatedMemoryInfo current_info, Int64 new_size) final
  {
    ++m_nb_reallocate;
    Int64 current_size = current_info.size();
    m_reallocate_size += current_size;
    String array_name = args.arrayName();
    const bool do_print = (m_print_level >= 2);
    if (do_print) {
      std::cout << "Reallocate allocator=" << m_allocator_name
                << " current_size=" << current_size
                << " current_capacity=" << current_info.capacity()
                << " new_capacity=" << new_size
                << " ptr=" << current_info.baseAddress();
      if (array_name.null() && m_print_level >= 3) {
        // Assumed: dump a stack trace to identify the unnamed allocation.
        std::cout << " stack=" << platform::getStackTrace();
      }
      else
        std::cout << " name=" << array_name;
      if (m_print_level >= 4)
        std::cout << " stack=" << platform::getStackTrace();
      std::cout << "\n";
    }
    if (m_use_memory_pool)
      _removeHint(current_info.baseAddress(), current_size, args);
    AllocatedMemoryInfo a = allocate(args, new_size);
    ARCANE_CHECK_CUDA(cudaMemcpy(a.baseAddress(), current_info.baseAddress(), current_size, cudaMemcpyDefault));
    deallocate(args, current_info);
    return a;
  }
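  // Reallocation is allocate + copy + deallocate. The copy uses
  // cudaMemcpyDefault, which infers the transfer direction from the pointer
  // values themselves; this requires unified virtual addressing.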
  void deallocate(MemoryAllocationArgs args, AllocatedMemoryInfo mem_info) final
  {
    void* ptr = mem_info.baseAddress();
    size_t mem_size = mem_info.capacity();
    if (m_use_memory_pool)
      _removeHint(ptr, mem_size, args);
    m_tracer.traceDeallocate(mem_info, args);
    m_sub_allocator->freeMemory(ptr, mem_size);
  }
  Int64 adjustedCapacity(MemoryAllocationArgs args, Int64 wanted_capacity, Int64 element_size) const final
  {
    // Assumed: apply the base-class adjustment before the block rounding.
    wanted_capacity = BaseClass::adjustedCapacity(args, wanted_capacity, element_size);
    return m_block_wrapper.adjustedCapacity(wanted_capacity, element_size);
  }
  virtual void _applyHint([[maybe_unused]] void* ptr, [[maybe_unused]] size_t new_size,
                          [[maybe_unused]] MemoryAllocationArgs args) {}
  virtual void _removeHint([[maybe_unused]] void* ptr, [[maybe_unused]] size_t new_size,
                           [[maybe_unused]] MemoryAllocationArgs args) {}
  impl::MemoryTracerWrapper m_tracer;
  std::unique_ptr<ConcreteAllocator> m_concrete_allocator;
  UnderlyingAllocator m_direct_sub_allocator;
  MemoryPool m_memory_pool;
  //! Active allocator: either the memory pool or the direct allocator.
  IMemoryPoolAllocator* m_sub_allocator = nullptr;
  bool m_use_memory_pool = false;
  String m_allocator_name;
  std::atomic<Int32> m_nb_reallocate = 0;
  //! Total number of bytes copied by reallocate() calls.
  std::atomic<Int64> m_reallocate_size = 0;
  Int32 m_print_level = 0;
  BlockAllocatorWrapper m_block_wrapper;
  void _setTraceLevel(Int32 v) { m_tracer.setTraceLevel(v); }
  void _setUseMemoryPool(bool is_used)
  {
    IMemoryPoolAllocator* mem_pool = &m_memory_pool;
    IMemoryPoolAllocator* direct = &m_direct_sub_allocator;
    m_sub_allocator = (is_used) ? mem_pool : direct;
    m_use_memory_pool = is_used;
    // Optional cap on the size of blocks the pool may cache; 'v' is parsed
    // from the environment. Reject non-positive values (assumed check).
    if (v.value() <= 0)
      ARCANE_FATAL("Invalid value '{0}' for memory pool max block size", v.value());
    size_t block_size = static_cast<size_t>(v.value());
    m_memory_pool.setMaxCachedBlockSize(block_size);
  }
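  // The pool/direct switch is a plain pointer swap: memory allocated through
  // one path must be freed through the same path, so this is only called at
  // allocator initialization time (see UnifiedMemoryCudaMemoryAllocator).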
class UnifiedMemoryCudaMemoryAllocator
: public CudaMemoryAllocatorBase
{
 public:

  //! Concrete allocator: cudaMallocManaged(), or plain host allocation with ATS.
  class Allocator
  : public ConcreteAllocator
  {
   public:
    Allocator()
    {
      // 'v' optionally enables ATS (Address Translation Services).
      m_use_ats = v.value();
    }
    cudaError_t _deallocate(void* ptr) final
    {
      if (m_use_ats) {
        // ATS memory comes from ::aligned_alloc() (see _allocate() below).
        ::free(ptr);
        return cudaSuccess;
      }
      return ::cudaFree(ptr);
    }
    cudaError_t _allocate(void** ptr, size_t new_size) final
    {
      if (m_use_ats) {
        // With ATS the device can address host memory directly.
        *ptr = ::aligned_alloc(128, new_size);
        return (*ptr) ? cudaSuccess : cudaErrorMemoryAllocation;
      }
      auto r = ::cudaMallocManaged(ptr, new_size, cudaMemAttachGlobal);
      if (r != cudaSuccess)
        return r;
      // Assumed default placement advice: prefer the device, keep CPU access.
      void* p = *ptr;
      int device_id = 0;
      cudaGetDevice(&device_id);
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_id));
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
      return cudaSuccess;
    }
   private:

    bool m_use_ats = false;
  };

 public:
  UnifiedMemoryCudaMemoryAllocator()
  : CudaMemoryAllocatorBase("UnifiedMemoryCudaMemory", new Allocator())
  {
    // 'v' is an optional trace level parsed from the environment.
    _setTraceLevel(v.value());
  }
    bool do_page_allocate = true;
    // 'v' optionally disables page-based block allocation.
    do_page_allocate = (v.value() != 0);
    Int64 page_size = platform::getPageSize();
    m_block_wrapper.initialize(page_size, do_page_allocate);

    bool use_memory_pool = false;
    // The pool is enabled when the UVM bit is set in the flags value 'v'.
    use_memory_pool = (v.value() & static_cast<int>(MemoryPoolFlags::UVM)) != 0;
    _setUseMemoryPool(use_memory_pool);
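    // MemoryPoolFlags is used as a bit mask, so a single configuration value
    // can enable the pool for several allocator kinds at once; the UVM bit
    // targets this unified-memory allocator.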
  // Re-apply placement hints when the allocation arguments change
  // (assumed override: notifyMemoryArgsChanged(old_args, new_args, ptr)).
  void notifyMemoryArgsChanged(MemoryAllocationArgs old_args, MemoryAllocationArgs new_args, AllocatedMemoryInfo ptr) final
  {
    void* p = ptr.baseAddress();
    Int64 s = ptr.capacity();
    if (p && s > 0)
      _removeHint(p, s, old_args);
    _applyHint(ptr.baseAddress(), ptr.size(), new_args);
  }
  void _applyHint(void* p, size_t new_size, MemoryAllocationArgs args) override
  {
    // Assumed mapping from eMemoryLocationHint values to cudaMemAdvise calls.
    eMemoryLocationHint hint = args.memoryLocationHint();
    int device_id = 0;
    cudaGetDevice(&device_id);
    if (hint == eMemoryLocationHint::MainlyDevice || hint == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, device_id));
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetAccessedBy, cudaCpuDeviceId));
    }
    if (hint == eMemoryLocationHint::MainlyHost)
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetPreferredLocation, cudaCpuDeviceId));
    if (hint == eMemoryLocationHint::HostAndDeviceMostlyRead)
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, new_size, cudaMemAdviseSetReadMostly, device_id));
  }
  void _removeHint(void* p, size_t size, MemoryAllocationArgs args) override
  {
    // Assumed guard: only the read-mostly advice needs to be undone here.
    if (args.memoryLocationHint() == eMemoryLocationHint::HostAndDeviceMostlyRead) {
      int device_id = 0;
      cudaGetDevice(&device_id);
      ARCANE_CHECK_CUDA(cudaMemAdvise(p, size, cudaMemAdviseUnsetReadMostly, device_id));
    }
  }
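  // Hints are removed before a block returns to the memory pool (see
  // deallocate() and reallocate() above); otherwise a cached block could be
  // recycled into an array with a different access pattern while still
  // carrying the previous cudaMemAdvise settings.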
 private:

  bool m_use_ats = false;
};