14#include "arccore/base/Profiling.h"
16#include "arccore/base/ForLoopTraceInfo.h"
17#include "arccore/base/PlatformUtils.h"
18#include "arccore/base/internal/ProfilingInternal.h"
34 Int64 nb_loop_parallel_for = cumulative_stat.nbLoopParallelFor();
35 if (nb_loop_parallel_for == 0)
37 Int64 nb_chunk_parallel_for = cumulative_stat.nbChunkParallelFor();
38 Int64 total_time = cumulative_stat.totalTime();
39 double x =
static_cast<double>(total_time);
41 if (nb_loop_parallel_for > 0)
42 x1 = x /
static_cast<double>(nb_loop_parallel_for);
44 if (nb_chunk_parallel_for > 0)
45 x2 = x /
static_cast<double>(nb_chunk_parallel_for);
46 o <<
"LoopStat: global_time (ms) = " << x / 1.0e6 <<
"\n";
47 o <<
"LoopStat: global_nb_loop = " << std::setw(10) << nb_loop_parallel_for <<
" time=" << x1 <<
"\n";
48 o <<
"LoopStat: global_nb_chunk = " << std::setw(10) << nb_chunk_parallel_for <<
" time=" << x2 <<
"\n";
58 bool operator<(
const SortedStatInfo& rhs)
const
60 return m_stat.execTime() > rhs.m_stat.execTime();
63 Impl::ForLoopProfilingStat m_stat;
67 Int64 cumulative_total = 1;
70 std::set<SortedStatInfo> sorted_set;
72 const auto& s = x.second;
73 sorted_set.insert({ x.first, s });
74 cumulative_total += s.execTime();
77 o <<
"ProfilingStat\n";
78 o << std::setw(10) <<
"Ncall" << std::setw(10) <<
"Nchunk"
79 << std::setw(11) <<
" T (ms)" << std::setw(10) <<
"Tck (ns)"
82 char old_filler = o.fill();
83 for (
const auto& x : sorted_set) {
85 Int64 nb_loop = s.nbCall();
86 Int64 nb_chunk = s.nbChunk();
87 Int64 total_time_ns = s.execTime();
88 Int64 total_time_us = total_time_ns / 1000;
89 Int64 total_time_ms = total_time_us / 1000;
90 Int64 total_time_remaining_us = total_time_us % 1000;
91 Int64 time_per_chunk = (nb_chunk == 0) ? 0 : (total_time_ns / nb_chunk);
92 Int64 per_mil = (total_time_ns * 1000) / cumulative_total;
93 Int64 percent = per_mil / 10;
94 Int64 percent_digit = per_mil % 10;
96 o << std::setw(10) << nb_loop << std::setw(10) << nb_chunk
97 << std::setw(7) << total_time_ms <<
".";
98 o << std::setfill(
'0') << std::setw(3) << total_time_remaining_us << std::setfill(old_filler);
99 o << std::setw(10) << time_per_chunk
100 << std::setw(4) << percent <<
"." << percent_digit <<
" " << x.m_name <<
"\n";
102 o <<
"TOTAL=" << cumulative_total / 1000000 <<
"\n";
115Impl::ForLoopStatInfoList::
117: m_p(new ForLoopStatInfoListImpl())
124Impl::ForLoopStatInfoList::
125~ForLoopStatInfoList()
135 Impl::ForLoopCumulativeStat global_stat;
141Impl::ScopedStatLoop::
142ScopedStatLoop(ForLoopOneExecStat* s)
153Impl::ScopedStatLoop::
158 m_stat_info->setBeginTime(m_begin_time);
159 m_stat_info->setEndTime(end_time);
172 std::lock_guard<std::mutex> lk(m_mutex);
175 m_for_loop_stat_info_list_vector.push_back(std::move(x));
180 std::lock_guard<std::mutex> lk(m_mutex);
183 m_accelerator_stat_info_list_vector.push_back(std::move(x));
189 for (
const auto& x : m_for_loop_stat_info_list_vector)
195 for (
const auto& x : m_accelerator_stat_info_list_vector)
202 std::vector<std::unique_ptr<Impl::ForLoopStatInfoList>> m_for_loop_stat_info_list_vector;
203 std::vector<std::unique_ptr<Impl::AcceleratorStatInfoList>> m_accelerator_stat_info_list_vector;
221 return _createOrGetForLoopStatInfoList();
225 return _createOrGetAcceleratorStatInfoList();
230 stat_list->merge(stat_info, trace_info);
237 if (!m_for_loop_stat_info_list)
238 m_for_loop_stat_info_list = global_all_stat_info_list.createForLoopStatInfoList();
239 return m_for_loop_stat_info_list;
243 if (!m_accelerator_stat_info_list)
244 m_accelerator_stat_info_list = global_all_stat_info_list.createAcceleratorStatInfoList();
245 return m_accelerator_stat_info_list;
258Int32 ProfilingRegistry::m_profiling_level = 0;
266 return thread_local_stat_info.forLoopStatInfoList();
275 return thread_local_stat_info.forLoopStatInfoList();
284 return thread_local_stat_info.acceleratorStatInfoList();
293 global_all_stat_info_list.visitForLoop(f);
302 global_all_stat_info_list.visitAccelerator(f);
311 m_profiling_level = level;
343void Impl::ForLoopStatInfoList::
346 global_stat.merge(loop_stat_info);
347 String loop_name =
"Unknown";
348 if (loop_trace_info.isValid()) {
349 loop_name = loop_trace_info.loopName();
350 if (loop_name.
empty())
351 loop_name = loop_trace_info.traceInfo().name();
353 m_p->m_stat_map[loop_name].add(loop_stat_info);
359void Impl::AcceleratorStatInfoList::
360print(std::ostream& o)
const
362 const auto& htod = memoryTransfer(eMemoryTransferType::HostToDevice);
363 const auto& dtoh = memoryTransfer(eMemoryTransferType::DeviceToHost);
364 o <<
"MemoryTransferSTATS: HTOD = " << htod.m_nb_byte <<
" (" << htod.m_nb_call <<
")"
365 <<
" DTOH = " << dtoh.m_nb_byte <<
" (" << dtoh.m_nb_call <<
")";
366 const auto& cpu_fault = memoryPageFault(eMemoryPageFaultType::Cpu);
367 const auto& gpu_fault = memoryPageFault(eMemoryPageFaultType::Gpu);
368 o <<
" PageFaultCPU = " << cpu_fault.m_nb_fault <<
" (" << cpu_fault.m_nb_call <<
")"
369 <<
" PageFaultGPU = " << gpu_fault.m_nb_fault <<
" (" << gpu_fault.m_nb_call <<
")";
376dumpProfilingStatistics(std::ostream& o)
379 _printGlobalLoopInfos(o, ProfilingRegistry::globalLoopStat());
381 auto f = [&](
const Impl::ForLoopStatInfoList& stat_list) {
382 _dumpOneLoopListStat(o, stat_list);
389 auto f = [&](
const Impl::AcceleratorStatInfoList& stat_list) {
Classe pour gérer le profiling d'une seule exécution d'une boucle.
Int64 nbChunk() const
Nombre de chunks.
Int64 execTime() const
Temps d'exécution (en nanosecondes).
Informations de trace pour une boucle 'for'.
Statistiques pour les accélérateurs.
Statistiques cumulées sur le nombre de boucles exécutées.
Statistiques d'exécution des boucles.
ForLoopStatInfoListImpl * _internalImpl() const
Type opaque pour l'implémentation interne.
static void setProfilingLevel(Int32 level)
Positionne le niveau de profilage.
static Impl::AcceleratorStatInfoList * _threadLocalAcceleratorInstance()
static void visitAcceleratorStat(const std::function< void(const Impl::AcceleratorStatInfoList &)> &f)
Visite la liste des statistiques sur accélérateur.
static Impl::ForLoopStatInfoList * threadLocalInstance()
static Impl::ForLoopStatInfoList * _threadLocalForLoopInstance()
static void visitLoopStat(const std::function< void(const Impl::ForLoopStatInfoList &)> &f)
Visite la liste des statistiques des boucles.
Chaîne de caractères unicode.
bool empty() const
Vrai si la chaîne est vide (nulle ou "").
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
std::int64_t Int64
Type entier signé sur 64 bits.
bool operator<(const Item &item1, const Item &item2)
Compare deux entités.
std::int32_t Int32
Type entier signé sur 32 bits.
Statistiques d'exécution d'une boucle.
void add(const ForLoopOneExecStat &s)
Ajoute les infos de l'exécution s.