14#include "arccore/base/Profiling.h"
16#include "arccore/base/ForLoopTraceInfo.h"
17#include "arccore/base/PlatformUtils.h"
18#include "arccore/base/internal/ProfilingInternal.h"
// Writes a three-line "LoopStat:" summary of the cumulative parallel-loop
// statistics to the stream `o`.
//
// o               : output stream receiving the text.
// cumulative_stat : source of the loop count, chunk count and total time.
//
// NOTE(review): the body braces, the statement guarded by the `== 0` test and
// the declarations of x1/x2 are not visible in this excerpt.
32void _printGlobalLoopInfos(std::ostream& o,
const Impl::ForLoopCumulativeStat& cumulative_stat)
34 Int64 nb_loop_parallel_for = cumulative_stat.nbLoopParallelFor();
// Special case when no parallel loop has been recorded (presumably an early
// return so that nothing is printed -- TODO confirm).
35 if (nb_loop_parallel_for == 0)
37 Int64 nb_chunk_parallel_for = cumulative_stat.nbChunkParallelFor();
38 Int64 total_time = cumulative_stat.totalTime();
// Work in floating point so that the averages below keep their fractional part.
39 double x =
static_cast<double>(total_time);
// x1: total time divided by the number of loops (divisor checked to be positive).
41 if (nb_loop_parallel_for > 0)
42 x1 = x /
static_cast<double>(nb_loop_parallel_for);
// x2: total time divided by the number of chunks (divisor checked to be positive).
44 if (nb_chunk_parallel_for > 0)
45 x2 = x /
static_cast<double>(nb_chunk_parallel_for);
// The total is divided by 1.0e6 for the line labelled "(ms)" (so total_time is
// presumably expressed in nanoseconds -- TODO confirm); x1 and x2 are printed
// without any unit conversion.
46 o <<
"LoopStat: global_time (ms) = " << x / 1.0e6 <<
"\n";
47 o <<
"LoopStat: global_nb_loop = " << std::setw(10) << nb_loop_parallel_for <<
" time=" << x1 <<
"\n";
48 o <<
"LoopStat: global_nb_chunk = " << std::setw(10) << nb_chunk_parallel_for <<
" time=" << x2 <<
"\n";
58 bool operator<(
const SortedStatInfo& rhs)
const
60 return m_stat.execTime() > rhs.m_stat.execTime();
63 Impl::ForLoopProfilingStat m_stat;
67 Int64 cumulative_total = 1;
70 std::set<SortedStatInfo> sorted_set;
71 for (
const auto& x : stat_list._internalImpl()->m_stat_map) {
72 const auto& s = x.second;
73 sorted_set.insert({ x.first, s });
74 cumulative_total += s.execTime();
77 o <<
"ProfilingStat\n";
78 o << std::setw(10) <<
"Ncall" << std::setw(10) <<
"Nchunk"
79 << std::setw(11) <<
" T (ms)" << std::setw(10) <<
"Tck (ns)"
82 char old_filler = o.fill();
83 for (
const auto& x : sorted_set) {
84 const Impl::ForLoopProfilingStat& s = x.m_stat;
85 Int64 nb_loop = s.nbCall();
86 Int64 nb_chunk = s.nbChunk();
87 Int64 total_time_ns = s.execTime();
88 Int64 total_time_us = total_time_ns / 1000;
89 Int64 total_time_ms = total_time_us / 1000;
90 Int64 total_time_remaining_us = total_time_us % 1000;
91 Int64 time_per_chunk = (nb_chunk == 0) ? 0 : (total_time_ns / nb_chunk);
92 Int64 per_mil = (total_time_ns * 1000) / cumulative_total;
93 Int64 percent = per_mil / 10;
94 Int64 percent_digit = per_mil % 10;
96 o << std::setw(10) << nb_loop << std::setw(10) << nb_chunk
97 << std::setw(7) << total_time_ms <<
".";
98 o << std::setfill(
'0') << std::setw(3) << total_time_remaining_us << std::setfill(old_filler);
99 o << std::setw(10) << time_per_chunk
100 << std::setw(4) << percent <<
"." << percent_digit <<
" " << x.m_name <<
"\n";
102 o <<
"TOTAL=" << cumulative_total / 1000000 <<
"\n";
115Impl::ForLoopStatInfoList::
117: m_p(new ForLoopStatInfoListImpl())
124Impl::ForLoopStatInfoList::
125~ForLoopStatInfoList()
135 Impl::ForLoopCumulativeStat global_stat;
141Impl::ScopedStatLoop::
142ScopedStatLoop(ForLoopOneExecStat* s)
153Impl::ScopedStatLoop::
158 m_stat_info->setBeginTime(m_begin_time);
159 m_stat_info->setEndTime(end_time);
172 std::lock_guard<std::mutex> lk(m_mutex);
175 m_for_loop_stat_info_list_vector.push_back(std::move(x));
// Creates a new Impl::AcceleratorStatInfoList and registers it in
// m_accelerator_stat_info_list_vector, which takes ownership of it through the
// moved std::unique_ptr.
// Judging by the return type, a non-owning pointer to the new list is handed
// back to the caller; the return statement is not visible in this excerpt.
178 Impl::AcceleratorStatInfoList* createAcceleratorStatInfoList()
// m_mutex is held until the end of the enclosing scope, so concurrent
// registrations are serialized.
180 std::lock_guard<std::mutex> lk(m_mutex);
181 std::unique_ptr<Impl::AcceleratorStatInfoList> x(
new Impl::AcceleratorStatInfoList());
// After the move `x` no longer owns the list.
183 m_accelerator_stat_info_list_vector.push_back(std::move(x));
189 for (
const auto& x : m_for_loop_stat_info_list_vector)
// Iterates over every entry of m_accelerator_stat_info_list_vector on behalf of
// the visitor `f`, which accepts a const Impl::AcceleratorStatInfoList&.
// The loop body is not visible in this excerpt; presumably it calls `f` on
// each stored list -- TODO confirm.
// NOTE(review): no lock on m_mutex is visible here although the vector is
// modified under m_mutex in createAcceleratorStatInfoList(); confirm that
// visits never overlap with list creation.
193 void visitAccelerator(
const std::function<
void(
const Impl::AcceleratorStatInfoList&)>& f)
195 for (
const auto& x : m_accelerator_stat_info_list_vector)
202 std::vector<std::unique_ptr<Impl::ForLoopStatInfoList>> m_for_loop_stat_info_list_vector;
203 std::vector<std::unique_ptr<Impl::AcceleratorStatInfoList>> m_accelerator_stat_info_list_vector;
221 return _createOrGetForLoopStatInfoList();
223 Impl::AcceleratorStatInfoList* acceleratorStatInfoList()
225 return _createOrGetAcceleratorStatInfoList();
230 stat_list->merge(stat_info, trace_info);
237 if (!m_for_loop_stat_info_list)
238 m_for_loop_stat_info_list = global_all_stat_info_list.createForLoopStatInfoList();
239 return m_for_loop_stat_info_list;
241 Impl::AcceleratorStatInfoList* _createOrGetAcceleratorStatInfoList()
243 if (!m_accelerator_stat_info_list)
244 m_accelerator_stat_info_list = global_all_stat_info_list.createAcceleratorStatInfoList();
245 return m_accelerator_stat_info_list;
251 Impl::AcceleratorStatInfoList* m_accelerator_stat_info_list =
nullptr;
258Int32 ProfilingRegistry::m_profiling_level = 0;
266 return thread_local_stat_info.forLoopStatInfoList();
273_threadLocalForLoopInstance()
275 return thread_local_stat_info.forLoopStatInfoList();
281Impl::AcceleratorStatInfoList* ProfilingRegistry::
282_threadLocalAcceleratorInstance()
284 return thread_local_stat_info.acceleratorStatInfoList();
293 global_all_stat_info_list.visitForLoop(f);
302 global_all_stat_info_list.visitAccelerator(f);
311 m_profiling_level = level;
317const Impl::ForLoopCumulativeStat& ProfilingRegistry::
329void Impl::ForLoopProfilingStat::
343void Impl::ForLoopStatInfoList::
344merge(
const ForLoopOneExecStat& loop_stat_info,
const ForLoopTraceInfo& loop_trace_info)
346 global_stat.merge(loop_stat_info);
347 String loop_name =
"Unknown";
348 if (loop_trace_info.isValid()) {
349 loop_name = loop_trace_info.loopName();
350 if (loop_name.
empty())
351 loop_name = loop_trace_info.traceInfo().name();
353 m_p->m_stat_map[loop_name].add(loop_stat_info);
359void Impl::AcceleratorStatInfoList::
360print(std::ostream& o)
const
362 const auto& htod = memoryTransfer(eMemoryTransferType::HostToDevice);
363 const auto& dtoh = memoryTransfer(eMemoryTransferType::DeviceToHost);
364 o <<
"MemoryTransferSTATS: HTOD = " << htod.m_nb_byte <<
" (" << htod.m_nb_call <<
")"
365 <<
" DTOH = " << dtoh.m_nb_byte <<
" (" << dtoh.m_nb_call <<
")";
366 const auto& cpu_fault = memoryPageFault(eMemoryPageFaultType::Cpu);
367 const auto& gpu_fault = memoryPageFault(eMemoryPageFaultType::Gpu);
368 o <<
" PageFaultCPU = " << cpu_fault.m_nb_fault <<
" (" << cpu_fault.m_nb_call <<
")"
369 <<
" PageFaultGPU = " << gpu_fault.m_nb_fault <<
" (" << gpu_fault.m_nb_call <<
")";
376dumpProfilingStatistics(std::ostream& o)
379 _printGlobalLoopInfos(o, ProfilingRegistry::globalLoopStat());
381 auto f = [&](
const Impl::ForLoopStatInfoList& stat_list) {
382 _dumpOneLoopListStat(o, stat_list);
389 auto f = [&](
const Impl::AcceleratorStatInfoList& stat_list) {
Class to manage the profiling of a single loop execution.
Int64 nbChunk() const
Number of chunks.
Int64 execTime() const
Execution time (in nanoseconds).
Trace information for a 'for' loop.
Loop execution statistics.
static void setProfilingLevel(Int32 level)
Sets the profiling level.
static void visitAcceleratorStat(const std::function< void(const Impl::AcceleratorStatInfoList &)> &f)
Visits the accelerator statistics list.
static Impl::ForLoopStatInfoList * threadLocalInstance()
static void visitLoopStat(const std::function< void(const Impl::ForLoopStatInfoList &)> &f)
Visits the loop statistics list.
bool empty() const
True if the string is empty (null or "").
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
std::int64_t Int64
64-bit signed integer type.
bool operator<(const Item &item1, const Item &item2)
Compares two entities.
std::int32_t Int32
32-bit signed integer type.