Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
ArcaneCheckpointModule.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* ArcaneCheckpointModule.cc (C) 2000-2020 */
9/* */
10/* Module managing protections/restorations. */
11/*---------------------------------------------------------------------------*/
12/*---------------------------------------------------------------------------*/
13
14#include "arcane/utils/ArcanePrecomp.h"
15
16#include "arcane/utils/ScopedPtr.h"
17#include "arcane/utils/StringBuilder.h"
18
19#include "arcane/core/ISubDomain.h"
20#include "arcane/core/EntryPoint.h"
21#include "arcane/core/Timer.h"
22#include "arcane/core/ITimeHistoryMng.h"
23#include "arcane/core/ModuleFactory.h"
24#include "arcane/core/ServiceUtils.h"
25#include "arcane/core/ICheckpointWriter.h"
26#include "arcane/core/ICheckpointMng.h"
27#include "arcane/core/Directory.h"
28#include "arcane/core/IParallelMng.h"
29
30#include "arcane/core/OutputChecker.h"
31#include "arcane/std/ArcaneCheckpoint_axl.h"
32
33/*---------------------------------------------------------------------------*/
34/*---------------------------------------------------------------------------*/
35
36namespace Arcane
37{
38
39/*---------------------------------------------------------------------------*/
40/*---------------------------------------------------------------------------*/
41
45class ArcaneCheckpointModule
46: public ArcaneArcaneCheckpointObject
47{
48 public:
49
50 ArcaneCheckpointModule(const ModuleBuilder& cb);
51 ~ArcaneCheckpointModule();
52
53 public:
54
55 virtual VersionInfo versionInfo() const { return VersionInfo(0, 9, 1); }
56
57 public:
58
59 virtual void checkpointCheckAndWriteData();
60 virtual void checkpointStartInit();
61 virtual void checkpointInit();
62 virtual void checkpointExit();
63
64 private:
65
66 OutputChecker m_output_checker;
67 Timer* m_checkpoint_timer;
68 ICheckpointWriter* m_checkpoint_writer;
69 String m_checkpoint_dirname;
70
71 private:
72
73 void _doCheckpoint(bool save_history);
74 void _dumpStats();
75 void _getCheckpointService();
76 void _setDirectoryName();
77 bool _checkHasOutput();
78};
79
80/*---------------------------------------------------------------------------*/
81/*---------------------------------------------------------------------------*/
82
83ARCANE_REGISTER_MODULE_ARCANECHECKPOINT(ArcaneCheckpointModule);
84
85/*---------------------------------------------------------------------------*/
86/*---------------------------------------------------------------------------*/
87
88ArcaneCheckpointModule::
89ArcaneCheckpointModule(const ModuleBuildInfo& mb)
91, m_output_checker(mb.subDomain(), "CheckpointRestart")
92, m_checkpoint_timer(0)
93, m_checkpoint_writer(0)
94, m_checkpoint_dirname(".")
95{
96 m_checkpoint_timer = new Timer(mb.subDomain(), "Checkpoint", Timer::TimerReal);
97 m_output_checker.assignIteration(&m_next_iteration, &options()->period);
98 m_output_checker.assignGlobalTime(&m_next_global_time, &options()->frequency);
99 m_output_checker.assignCPUTime(&m_next_cpu_time, &options()->frequencyCpu);
100}
101
102/*---------------------------------------------------------------------------*/
103/*---------------------------------------------------------------------------*/
104
105ArcaneCheckpointModule::
106~ArcaneCheckpointModule()
107{
108 delete m_checkpoint_timer;
109}
110
111/*---------------------------------------------------------------------------*/
112/*---------------------------------------------------------------------------*/
113
114void ArcaneCheckpointModule::
115checkpointInit()
116{
117 // Unlike other output types, the CPU time for the next output must be reset
118 // to zero because the CPU time used belongs to the current execution.
119 m_next_cpu_time = options()->frequencyCpu();
120 info() << " -------------------------------------------";
121 info() << "| PROTECTION-REPRISE |";
122 info() << " -------------------------------------------";
123 info() << " ";
124 //info() << " Uses the service '" << options()->checkpointServiceName()
125 // << "' for protections";
126 info() << " ";
127 m_output_checker.initialize();
128 info() << " ";
129 // If protections are active, check that the specified service
130 // exists
131 if (options()->doDumpAtEnd()) {
132 info() << "Protection required at the end of computations";
133 _getCheckpointService();
134 }
135}
136
137/*---------------------------------------------------------------------------*/
138/*---------------------------------------------------------------------------*/
139
140bool ArcaneCheckpointModule::
141_checkHasOutput()
142{
143 Real old_time = m_global_old_time();
144 Real current_time = m_global_time();
145 Integer iteration = m_global_iteration();
147
148 bool do_output = m_output_checker.check(old_time, current_time, iteration, cpu_used);
149 return do_output;
150}
151
152/*---------------------------------------------------------------------------*/
153/*---------------------------------------------------------------------------*/
154
155void ArcaneCheckpointModule::
156checkpointStartInit()
157{
158 m_next_global_time = 0.;
160 m_next_cpu_time = 0;
161
162 // Initializes the output checker. This must be done here if we
163 // want to save things at iteration 1.
164 _checkHasOutput();
165}
166
167/*---------------------------------------------------------------------------*/
168/*---------------------------------------------------------------------------*/
176{
177 if (!options()->doDumpAtEnd())
178 return;
179
180 _doCheckpoint(false);
181 _dumpStats();
182
183 if (m_checkpoint_writer)
184 m_checkpoint_writer->close();
185 m_checkpoint_writer = 0;
186}
187
188/*---------------------------------------------------------------------------*/
189/*---------------------------------------------------------------------------*/
190
191void ArcaneCheckpointModule::
192_dumpStats()
193{
194 // Displays execution statistics
195 Real total_time = m_checkpoint_timer->totalTime();
196 info() << "Total time spent in protection output (second): " << total_time;
197 Integer nb_time = m_checkpoint_timer->nbActivated();
198 if (nb_time != 0)
199 info() << "Average time per output (second): " << total_time / nb_time
200 << " (for " << nb_time << " outputs";
201}
202
203/*---------------------------------------------------------------------------*/
204/*---------------------------------------------------------------------------*/
205
206void ArcaneCheckpointModule::
207_setDirectoryName()
208{
209 Directory export_dir = subDomain()->storageDirectory();
210 if (export_dir.path().null())
211 export_dir = subDomain()->exportDirectory();
212
213 Directory output_directory = Directory(export_dir, "protection");
214 IParallelMng* pm = parallelMng();
215 if (pm->isMasterIO())
216 output_directory.createDirectory();
217 pm->barrier();
218
219 m_checkpoint_dirname = output_directory.path();
220}
221
222/*---------------------------------------------------------------------------*/
223/*---------------------------------------------------------------------------*/
224
231_doCheckpoint(bool save_history)
232{
233 {
234 Timer::Sentry sentry(m_checkpoint_timer);
235 Timer::Phase tp(subDomain(), TP_InputOutput);
236 if (!m_checkpoint_writer)
237 _getCheckpointService();
238 if (m_checkpoint_writer) {
239 Integer nb_checkpoint = m_checkpoints_time.size();
240 m_checkpoints_time.resize(nb_checkpoint + 1);
241 Real checkpoint_time = m_global_time();
242 m_checkpoints_time[nb_checkpoint] = checkpoint_time;
243 m_checkpoint_writer->setCheckpointTimes(m_checkpoints_time);
244 m_checkpoint_writer->setBaseDirectoryName(m_checkpoint_dirname);
245 info() << "**** Protection active at time " << checkpoint_time
246 << " directory=" << m_checkpoint_dirname
247 << " number " << nb_checkpoint << " ******";
248 subDomain()->checkpointMng()->writeDefaultCheckpoint(m_checkpoint_writer);
249 }
250 }
251 info() << "Checkpoint write time (second): "
252 << m_checkpoint_timer->lastActivationTime();
253
254 // Saves histories
255 if (save_history)
257}
258
259/*---------------------------------------------------------------------------*/
260/*---------------------------------------------------------------------------*/
261
262void ArcaneCheckpointModule::
263_getCheckpointService()
264{
265 ICheckpointWriter* checkpoint = options()->checkpointService();
266 if (!checkpoint) {
267 StringUniqueArray valid_values;
268 options()->checkpointService.getAvailableNames(valid_values);
269 pfatal() << "Protections required but protection/restore service selected ("
270 << options()->checkpointService.serviceName() << ") not available "
271 << "(valid values: " << String::join(", ", valid_values) << ")";
272 }
273 m_checkpoint_writer = checkpoint;
274 _setDirectoryName();
275}
276
277/*---------------------------------------------------------------------------*/
278/*---------------------------------------------------------------------------*/
279
286{
287 bool do_output = _checkHasOutput();
288 if (!do_output)
289 return;
290
291 info() << "Protection required.";
292 // TEMPORARY:
293 // Unlike end-of-execution checkpointing, this occurs at the end of the
294 // time loop, but before the iteration number is incremented. If we resume
295 // at this time, the iteration number will be incorrect. To correct this
296 // problem, we increment this number before the checkpoint and reset it
297 // to the correct value afterward.
298 // SDC: this problem has not been observed. It seems to me that it is the
299 // current iteration number that must be saved. Changed for a restart issue
300 // in an IFPEN application (internally Bugzilla 778.
301 // m_global_iteration = m_global_iteration() + 1;
302 _doCheckpoint(true);
303 // m_global_iteration = m_global_iteration() - 1;
304}
305
306/*---------------------------------------------------------------------------*/
307/*---------------------------------------------------------------------------*/
308
309} // End namespace Arcane
310
311/*---------------------------------------------------------------------------*/
312/*---------------------------------------------------------------------------*/
Generation of the module's base class.
CaseOptionsArcaneCheckpoint * options() const
Options of the module's input data set.
Arcane::VariableScalarInteger m_next_iteration
Module variables.
ISubDomain * subDomain() const override
Sub-domain associated with the module.
IParallelMng * parallelMng() const override
Message passing parallelism manager.
virtual void checkpointCheckAndWriteData()
Checks if a checkpoint should be performed at this moment and performs it if necessary.
virtual VersionInfo versionInfo() const
Module version.
void _doCheckpoint(bool save_history)
Performs a checkpoint.
virtual void checkpointExit()
End-of-calculation operations.
VariableScalarInt32 m_global_iteration
Current iteration.
VariableScalarReal m_global_cpu_time
CPU time used (in seconds).
VariableScalarReal m_global_time
Current time.
VariableScalarReal m_global_old_time
Time previous to the current time.
virtual void writeDefaultCheckpoint(ICheckpointWriter *writer)=0
Writes a checkpoint using the writer.
Interface of the checkpoint/recovery write service.
virtual const IDirectory & storageDirectory() const =0
Base directory for exports requiring archiving.
virtual ICheckpointMng * checkpointMng() const =0
Protection manager.
virtual ITimeHistoryMng * timeHistoryMng()=0
Returns the history manager.
virtual const IDirectory & exportDirectory() const =0
Base directory for exports.
virtual void dumpHistory(bool is_verbose)=0
Saves the history.
Manages outputs based on physical time, CPU time, or a number of iterations.
Positions the phase of the currently executing action.
Definition Timer.h:142
Sentinel for the timer. The sentinel associated with a timer allows it to be triggered upon its const...
Definition Timer.h:90
Management of a timer.
Definition Timer.h:63
Integer nbActivated() const
Returns the number of times the timer has been activated.
Definition Timer.h:246
@ TimerReal
Timer using real time.
Definition Timer.h:77
Real totalTime() const
Returns the total time (in seconds) spent in the timer.
Definition Timer.h:240
TraceMessage pfatal() const
Flow for a parallel fatal error message.
TraceMessage info() const
Flow for an information message.
Information about a version.
Definition VersionInfo.h:47
Integer toInteger(Real r)
Converts a Real to Integer.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
Int32 Integer
Type representing an integer.
double Real
Type representing a real number.
UniqueArray< String > StringUniqueArray
Dynamic 1D array of strings.
Definition UtilsTypes.h:359