Arcane  4.1.12.0
User documentation
Loading...
Searching...
No Matches
Profiling.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* Profiling.cc (C) 2000-2026 */
9/* */
10/* Classes to manage profiling. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arccore/base/Profiling.h"
15
16#include "arccore/base/ForLoopTraceInfo.h"
17#include "arccore/base/PlatformUtils.h"
18#include "arccore/base/internal/ProfilingInternal.h"
19
20#include <iostream>
21#include <iomanip>
22#include <vector>
23#include <mutex>
24#include <map>
25#include <memory>
26#include <set>
27
28namespace
29{
30using namespace Arcane;
31
32void _printGlobalLoopInfos(std::ostream& o, const Impl::ForLoopCumulativeStat& cumulative_stat)
33{
34 Int64 nb_loop_parallel_for = cumulative_stat.nbLoopParallelFor();
35 if (nb_loop_parallel_for == 0)
36 return;
37 Int64 nb_chunk_parallel_for = cumulative_stat.nbChunkParallelFor();
38 Int64 total_time = cumulative_stat.totalTime();
39 double x = static_cast<double>(total_time);
40 double x1 = 0.0;
41 if (nb_loop_parallel_for > 0)
42 x1 = x / static_cast<double>(nb_loop_parallel_for);
43 double x2 = 0.0;
44 if (nb_chunk_parallel_for > 0)
45 x2 = x / static_cast<double>(nb_chunk_parallel_for);
46 o << "LoopStat: global_time (ms) = " << x / 1.0e6 << "\n";
47 o << "LoopStat: global_nb_loop = " << std::setw(10) << nb_loop_parallel_for << " time=" << x1 << "\n";
48 o << "LoopStat: global_nb_chunk = " << std::setw(10) << nb_chunk_parallel_for << " time=" << x2 << "\n";
49}
50
51/*---------------------------------------------------------------------------*/
52/*---------------------------------------------------------------------------*/
53
54void _dumpOneLoopListStat(std::ostream& o, const Impl::ForLoopStatInfoList& stat_list)
55{
56 struct SortedStatInfo
57 {
58 bool operator<(const SortedStatInfo& rhs) const
59 {
60 return m_stat.execTime() > rhs.m_stat.execTime();
61 }
62 String m_name;
63 Impl::ForLoopProfilingStat m_stat;
64 };
65
66 // Set to 1 to avoid division by zero.
67 Int64 cumulative_total = 1;
68
69 // Sort functions by decreasing execution time
70 std::set<SortedStatInfo> sorted_set;
71 for (const auto& x : stat_list._internalImpl()->m_stat_map) {
72 const auto& s = x.second;
73 sorted_set.insert({ x.first, s });
74 cumulative_total += s.execTime();
75 }
76
77 o << "ProfilingStat\n";
78 o << std::setw(10) << "Ncall" << std::setw(10) << "Nchunk"
79 << std::setw(11) << " T (ms)" << std::setw(10) << "Tck (ns)"
80 << " % name\n";
81
82 char old_filler = o.fill();
83 for (const auto& x : sorted_set) {
84 const Impl::ForLoopProfilingStat& s = x.m_stat;
85 Int64 nb_loop = s.nbCall();
86 Int64 nb_chunk = s.nbChunk();
87 Int64 total_time_ns = s.execTime();
88 Int64 total_time_us = total_time_ns / 1000;
89 Int64 total_time_ms = total_time_us / 1000;
90 Int64 total_time_remaining_us = total_time_us % 1000;
91 Int64 time_per_chunk = (nb_chunk == 0) ? 0 : (total_time_ns / nb_chunk);
92 Int64 per_mil = (total_time_ns * 1000) / cumulative_total;
93 Int64 percent = per_mil / 10;
94 Int64 percent_digit = per_mil % 10;
95
96 o << std::setw(10) << nb_loop << std::setw(10) << nb_chunk
97 << std::setw(7) << total_time_ms << ".";
98 o << std::setfill('0') << std::setw(3) << total_time_remaining_us << std::setfill(old_filler);
99 o << std::setw(10) << time_per_chunk
100 << std::setw(4) << percent << "." << percent_digit << " " << x.m_name << "\n";
101 }
102 o << "TOTAL=" << cumulative_total / 1000000 << "\n";
103}
104} // namespace
105
106/*---------------------------------------------------------------------------*/
107/*---------------------------------------------------------------------------*/
108
109namespace Arcane
110{
111
112/*---------------------------------------------------------------------------*/
113/*---------------------------------------------------------------------------*/
114
115Impl::ForLoopStatInfoList::
116ForLoopStatInfoList()
117: m_p(new ForLoopStatInfoListImpl())
118{
119}
120
121/*---------------------------------------------------------------------------*/
122/*---------------------------------------------------------------------------*/
123
124Impl::ForLoopStatInfoList::
125~ForLoopStatInfoList()
126{
127 delete m_p;
128}
129
130/*---------------------------------------------------------------------------*/
131/*---------------------------------------------------------------------------*/
132
133namespace
134{
135 Impl::ForLoopCumulativeStat global_stat;
136}
137
138/*---------------------------------------------------------------------------*/
139/*---------------------------------------------------------------------------*/
140
141Impl::ScopedStatLoop::
142ScopedStatLoop(ForLoopOneExecStat* s)
143: m_stat_info(s)
144{
145 if (m_stat_info) {
146 m_begin_time = Platform::getRealTimeNS();
147 }
148}
149
150/*---------------------------------------------------------------------------*/
151/*---------------------------------------------------------------------------*/
152
153Impl::ScopedStatLoop::
154~ScopedStatLoop()
155{
156 if (m_stat_info) {
157 Int64 end_time = Platform::getRealTimeNS();
158 m_stat_info->setBeginTime(m_begin_time);
159 m_stat_info->setEndTime(end_time);
160 }
161}
162
163/*---------------------------------------------------------------------------*/
164/*---------------------------------------------------------------------------*/
165
167{
168 public:
169
170 Impl::ForLoopStatInfoList* createForLoopStatInfoList()
171 {
172 std::lock_guard<std::mutex> lk(m_mutex);
173 std::unique_ptr<Impl::ForLoopStatInfoList> x(new Impl::ForLoopStatInfoList());
174 auto* ptr = x.get();
175 m_for_loop_stat_info_list_vector.push_back(std::move(x));
176 return ptr;
177 }
178 Impl::AcceleratorStatInfoList* createAcceleratorStatInfoList()
179 {
180 std::lock_guard<std::mutex> lk(m_mutex);
181 std::unique_ptr<Impl::AcceleratorStatInfoList> x(new Impl::AcceleratorStatInfoList());
182 auto* ptr = x.get();
183 m_accelerator_stat_info_list_vector.push_back(std::move(x));
184 return ptr;
185 }
186
187 void visitForLoop(const std::function<void(const Impl::ForLoopStatInfoList&)>& f)
188 {
189 for (const auto& x : m_for_loop_stat_info_list_vector)
190 f(*x);
191 }
192
193 void visitAccelerator(const std::function<void(const Impl::AcceleratorStatInfoList&)>& f)
194 {
195 for (const auto& x : m_accelerator_stat_info_list_vector)
196 f(*x);
197 }
198
199 public:
200
201 std::mutex m_mutex;
202 std::vector<std::unique_ptr<Impl::ForLoopStatInfoList>> m_for_loop_stat_info_list_vector;
203 std::vector<std::unique_ptr<Impl::AcceleratorStatInfoList>> m_accelerator_stat_info_list_vector;
204};
205
206/*---------------------------------------------------------------------------*/
207/*---------------------------------------------------------------------------*/
208
209AllStatInfoList global_all_stat_info_list;
210
211/*---------------------------------------------------------------------------*/
212/*---------------------------------------------------------------------------*/
213
214// Allows managing a ForLoopStatInfoList instance per thread to avoid locks
216{
217 public:
218
219 Impl::ForLoopStatInfoList* forLoopStatInfoList()
220 {
221 return _createOrGetForLoopStatInfoList();
222 }
223 Impl::AcceleratorStatInfoList* acceleratorStatInfoList()
224 {
225 return _createOrGetAcceleratorStatInfoList();
226 }
227 void merge(const ForLoopOneExecStat& stat_info, const ForLoopTraceInfo& trace_info)
228 {
229 Impl::ForLoopStatInfoList* stat_list = _createOrGetForLoopStatInfoList();
230 stat_list->merge(stat_info, trace_info);
231 }
232
233 private:
234
235 Impl::ForLoopStatInfoList* _createOrGetForLoopStatInfoList()
236 {
237 if (!m_for_loop_stat_info_list)
238 m_for_loop_stat_info_list = global_all_stat_info_list.createForLoopStatInfoList();
239 return m_for_loop_stat_info_list;
240 }
241 Impl::AcceleratorStatInfoList* _createOrGetAcceleratorStatInfoList()
242 {
243 if (!m_accelerator_stat_info_list)
244 m_accelerator_stat_info_list = global_all_stat_info_list.createAcceleratorStatInfoList();
245 return m_accelerator_stat_info_list;
246 }
247
248 private:
249
250 Impl::ForLoopStatInfoList* m_for_loop_stat_info_list = nullptr;
251 Impl::AcceleratorStatInfoList* m_accelerator_stat_info_list = nullptr;
252};
253thread_local ThreadLocalStatInfo thread_local_stat_info;
254
255/*---------------------------------------------------------------------------*/
256/*---------------------------------------------------------------------------*/
257
258Int32 ProfilingRegistry::m_profiling_level = 0;
259
260/*---------------------------------------------------------------------------*/
261/*---------------------------------------------------------------------------*/
262
265{
266 return thread_local_stat_info.forLoopStatInfoList();
267}
268
269/*---------------------------------------------------------------------------*/
270/*---------------------------------------------------------------------------*/
271
272Impl::ForLoopStatInfoList* ProfilingRegistry::
273_threadLocalForLoopInstance()
274{
275 return thread_local_stat_info.forLoopStatInfoList();
276}
277
278/*---------------------------------------------------------------------------*/
279/*---------------------------------------------------------------------------*/
280
281Impl::AcceleratorStatInfoList* ProfilingRegistry::
282_threadLocalAcceleratorInstance()
283{
284 return thread_local_stat_info.acceleratorStatInfoList();
285}
286
287/*---------------------------------------------------------------------------*/
288/*---------------------------------------------------------------------------*/
289
291visitLoopStat(const std::function<void(const Impl::ForLoopStatInfoList&)>& f)
292{
293 global_all_stat_info_list.visitForLoop(f);
294}
295
296/*---------------------------------------------------------------------------*/
297/*---------------------------------------------------------------------------*/
298
300visitAcceleratorStat(const std::function<void(const Impl::AcceleratorStatInfoList&)>& f)
301{
302 global_all_stat_info_list.visitAccelerator(f);
303}
304
305/*---------------------------------------------------------------------------*/
306/*---------------------------------------------------------------------------*/
307
310{
311 m_profiling_level = level;
312}
313
314/*---------------------------------------------------------------------------*/
315/*---------------------------------------------------------------------------*/
316
317const Impl::ForLoopCumulativeStat& ProfilingRegistry::
318globalLoopStat()
319{
320 return global_stat;
321}
322
323/*---------------------------------------------------------------------------*/
324/*---------------------------------------------------------------------------*/
325
326/*---------------------------------------------------------------------------*/
327/*---------------------------------------------------------------------------*/
328
329void Impl::ForLoopProfilingStat::
330add(const ForLoopOneExecStat& s)
331{
332 ++m_nb_call;
333 m_nb_chunk += s.nbChunk();
334 m_exec_time += s.execTime();
335}
336
337/*---------------------------------------------------------------------------*/
338/*---------------------------------------------------------------------------*/
339
340/*---------------------------------------------------------------------------*/
341/*---------------------------------------------------------------------------*/
342
343void Impl::ForLoopStatInfoList::
344merge(const ForLoopOneExecStat& loop_stat_info, const ForLoopTraceInfo& loop_trace_info)
345{
346 global_stat.merge(loop_stat_info);
347 String loop_name = "Unknown";
348 if (loop_trace_info.isValid()) {
349 loop_name = loop_trace_info.loopName();
350 if (loop_name.empty())
351 loop_name = loop_trace_info.traceInfo().name();
352 }
353 m_p->m_stat_map[loop_name].add(loop_stat_info);
354}
355
356/*---------------------------------------------------------------------------*/
357/*---------------------------------------------------------------------------*/
358
359void Impl::AcceleratorStatInfoList::
360print(std::ostream& o) const
361{
362 const auto& htod = memoryTransfer(eMemoryTransferType::HostToDevice);
363 const auto& dtoh = memoryTransfer(eMemoryTransferType::DeviceToHost);
364 o << "MemoryTransferSTATS: HTOD = " << htod.m_nb_byte << " (" << htod.m_nb_call << ")"
365 << " DTOH = " << dtoh.m_nb_byte << " (" << dtoh.m_nb_call << ")";
366 const auto& cpu_fault = memoryPageFault(eMemoryPageFaultType::Cpu);
367 const auto& gpu_fault = memoryPageFault(eMemoryPageFaultType::Gpu);
368 o << " PageFaultCPU = " << cpu_fault.m_nb_fault << " (" << cpu_fault.m_nb_call << ")"
369 << " PageFaultGPU = " << gpu_fault.m_nb_fault << " (" << gpu_fault.m_nb_call << ")";
370}
371
372/*---------------------------------------------------------------------------*/
373/*---------------------------------------------------------------------------*/
374
375void Impl::
376dumpProfilingStatistics(std::ostream& o)
377{
378 // Display profiling information on o
379 _printGlobalLoopInfos(o, ProfilingRegistry::globalLoopStat());
380 {
381 auto f = [&](const Impl::ForLoopStatInfoList& stat_list) {
382 _dumpOneLoopListStat(o, stat_list);
383 };
385 }
386 // Before displaying accelerator profiling, one should be sure
387 // that it is disabled. Normally, this is the case if ArcaneMainBatch is used.
388 {
389 auto f = [&](const Impl::AcceleratorStatInfoList& stat_list) {
390 stat_list.print(o);
391 };
393 }
394}
395
396/*---------------------------------------------------------------------------*/
397/*---------------------------------------------------------------------------*/
398
399} // End namespace Arcane
400
401/*---------------------------------------------------------------------------*/
402/*---------------------------------------------------------------------------*/
Class to manage the profiling of a single loop execution.
Int64 execTime() const
Execution time (in nanoseconds).
static void setProfilingLevel(Int32 level)
Sets the profiling level.
Definition Profiling.cc:309
static void visitAcceleratorStat(const std::function< void(const Impl::AcceleratorStatInfoList &)> &f)
Visits the accelerator statistics list.
Definition Profiling.cc:300
static Impl::ForLoopStatInfoList * threadLocalInstance()
Definition Profiling.cc:264
static void visitLoopStat(const std::function< void(const Impl::ForLoopStatInfoList &)> &f)
Visits the loop statistics list.
Definition Profiling.cc:291
bool empty() const
True if the string is empty (null or "").
Definition String.cc:317
Int64 getRealTimeNS()
Clock time in nanoseconds.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
std::int64_t Int64
Signed integer type of 64 bits.
bool operator<(const Item &item1, const Item &item2)
Compare two entities.
Definition Item.h:566
std::int32_t Int32
Signed integer type of 32 bits.