14#include "arcane/utils/ArcanePrecomp.h"
15#include "arcane/utils/TraceAccessor.h"
16#include "arcane/utils/String.h"
17#include "arcane/utils/ScopedPtr.h"
18#include "arcane/utils/IOException.h"
19#include "arcane/utils/TraceInfo.h"
20#include "arcane/utils/Array.h"
21#include "arcane/utils/NotImplementedException.h"
22#include "arcane/utils/ITraceMng.h"
24#include "arcane/core/Directory.h"
25#include "arcane/core/ICheckpointMng.h"
26#include "arcane/core/ISubDomain.h"
27#include "arcane/core/IParallelMng.h"
28#include "arcane/core/IParallelReplication.h"
29#include "arcane/core/IRessourceMng.h"
30#include "arcane/core/IVariableMng.h"
31#include "arcane/core/IIOMng.h"
32#include "arcane/core/IXmlDocumentHolder.h"
33#include "arcane/core/XmlNode.h"
34#include "arcane/core/ICheckpointReader.h"
35#include "arcane/core/ICheckpointWriter.h"
36#include "arcane/core/ServiceBuilder.h"
37#include "arcane/core/IObservable.h"
38#include "arcane/core/CheckpointInfo.h"
39#include "arcane/core/SubDomainBuildInfo.h"
40#include "arcane/core/MeshPartInfo.h"
42#include "arcane/core/VariableCollection.h"
43#include "arcane/core/IVariable.h"
44#include "arcane/core/IMeshModifier.h"
45#include "arcane/core/ItemGroup.h"
46#include "arcane/core/IItemFamily.h"
47#include "arcane/core/IMainFactory.h"
48#include "arcane/core/IPrimaryMesh.h"
76 ~CheckpointMng()
override;
130CheckpointMng(ISubDomain* sd)
131: TraceAccessor(sd->traceMng())
133, m_write_observable(IObservable::createDefault())
134, m_read_observable(IObservable::createDefault())
177 m_sub_domain->variableMng()->readCheckpoint(reader);
199 String info_file_name(m_sub_domain->exportDirectory().file(
"checkpoint_info.xml"));
201 IIOMng* io_mng = m_sub_domain->ioMng();
203 CheckpointInfo checkpoint_info = _readCheckpointInfo(bytes, info_file_name);
204 return checkpoint_info;
214 _readCheckpoint(checkpoint_info);
223 CheckpointInfo checkpoint_info = _readCheckpointInfo(bytes_infos,
"unknown");
224 _readCheckpoint(checkpoint_info);
233 String buf_name2 = buf_name;
234 if (buf_name2.
null())
235 buf_name2 =
"unknown";
236 CheckpointInfo checkpoint_info = _readCheckpointInfo(bytes_infos, buf_name2);
237 return checkpoint_info;
251 checkpoint_info.setSubDomainRank(rank);
253 checkpoint_info.setReplicationRank(replication_rank);
258 XmlNode doc_node = xml_doc->documentNode();
260 ARCANE_FATAL(
"Can not read file '{0}' containing checkpoint/restart informations",
266 checkpoint_info.setNbSubDomain(nb_checkpoint_sub_domain);
269 checkpoint_info.setNbReplication(nb_checkpoint_replication);
273 if (service_name.
null()) {
275 "The file '{0}}' doesn't have "
276 "the name of the protection/restore service used "
277 "(attribute /checkpoint-info/service/@name)",
280 checkpoint_info.setServiceName(service_name);
281 String service_directory = service_elem.
attrValue(
"directory");
282 checkpoint_info.setDirectory(service_directory);
286 XmlNode last_index_attr = times_node.
attr(
"last-index");
287 if (last_index_attr.null())
288 ARCANE_THROW(IOException,
"missing attribute 'last-index'");
290 XmlNode last_time_attr = times_node.attr(
"last-time");
291 if (last_time_attr.null())
292 ARCANE_THROW(IOException,
"missing attribute 'last-time'");
294 Real last_time = last_time_attr.valueAsReal();
295 checkpoint_info.setCheckpointTime(last_time);
297 Integer last_index = last_index_attr.valueAsInteger();
298 checkpoint_info.setCheckpointIndex(last_index);
300 XmlNode meta_data_node = service_elem.
child(
"meta-data");
301 if (meta_data_node.null())
303 checkpoint_info.setReaderMetaData(meta_data_node.value());
305 return checkpoint_info;
314 _readCheckpoint(checkpoint_info);
323 String service_name = checkpoint_info.serviceName();
324 if (service_name.
null())
327 String service_directory = checkpoint_info.directory();
343 info() <<
"Using the checkpoint/restart service"
344 <<
" <" << service_name <<
"> (implement ICheckpointReader2)";
345 if (has_changing_sub_domain)
346 _applyNbSubDomainChange(checkpoint_info, s.get());
350 cri.setParallelMng(pm);
351 cri.setReader(s.get());
352 _readCheckpoint(cri);
359 if (has_changing_sub_domain)
360 ARCANE_FATAL(
"The number of sub-domains/replica in this run is different "
361 "from the number in checkpoint but the service specified "
362 "for checkpoint {0} does not handle this case",
365 ServiceFinder2T<ICheckpointReader, ISubDomain> sf2(app, m_sub_domain);
366 Ref<ICheckpointReader> checkpoint_reader(sf2.createReference(service_name));
368 if (!checkpoint_reader.get()) {
369 ARCANE_FATAL(
"The service specified for checkpoint/restart ({0}) is not available",
373 info() <<
"Using the checkpoint/restart service <" << service_name <<
">";
374 Real last_time = checkpoint_info.checkpointTime();
375 Int32 last_index = checkpoint_info.checkpointIndex();
377 checkpoint_reader->setCurrentTimeAndIndex(last_time, last_index);
379 String meta_data = checkpoint_info.readerMetaData();
380 checkpoint_reader->setReaderMetaData(meta_data);
381 checkpoint_reader->setBaseDirectoryName(service_directory);
404 if (m_sub_domain->allReplicaParallelMng()->isMasterIO()) {
405 Directory export_directory(m_sub_domain->exportDirectory());
406 String info_file(export_directory.
file(
"checkpoint_info.xml"));
407 std::ofstream ofile(info_file.
localstr());
421 m_sub_domain->variableMng()->writeCheckpoint(writer);
444 XmlNode doc = info_document->documentNode();
450 root.
setAttrValue(
"nb-sub-domain", String::fromNumber(nb_rank));
451 root.
setAttrValue(
"nb-replication", String::fromNumber(nb_replica));
459 XmlElement meta_data_elem(service_info,
"meta-data", reader_meta_data);
460 XmlElement checkpoints_time_elem(root,
"times");
462 XmlNode info_root = info_document->documentNode().documentElement();
467 if (nb_checkpoint > 0) {
468 checkpoints_time_elem.
setAttrValue(
"last-time", String::fromNumber(checkpoints_time[nb_checkpoint - 1]));
469 checkpoints_time_elem.
setAttrValue(
"last-index", String::fromNumber(nb_checkpoint - 1));
472 checkpoints_time_elem.
clear();
473 for (
Integer i = 0, is = checkpoints_time.
size(); i < is; ++i) {
474 XmlElement elem(checkpoints_time_elem,
"time");
475 elem.
setAttrValue(
"value", String::fromNumber(checkpoints_time[i]));
478 info_document->save(infos);
493 Int32 nb_checkpoint_sub_domain = ci.nbSubDomain();
494 Int32 nb_checkpoint_replication = ci.nbReplication();
499 if (nb_checkpoint_sub_domain < 1 || nb_checkpoint_replication < 1) {
500 info() <<
"Invalid or missing partitionning info in checkpoint.";
503 info() <<
"Reading checkpoint nb_sub_domain=" << nb_checkpoint_sub_domain
504 <<
" nb_replication=" << nb_checkpoint_replication;
505 MeshPartInfo current_part_info(makeMeshPartInfoFromParallelMng(m_sub_domain->parallelMng()));
507 Int32 nb_rank = current_part_info.nbPart();
508 Int32 nb_replication = current_part_info.nbReplication();
509 bool has_different_sub_domain =
false;
510 if (nb_rank != nb_checkpoint_sub_domain) {
511 has_different_sub_domain =
true;
514 if (nb_replication != nb_checkpoint_replication) {
516 "Bad number of replication ({0} in checkpoint, {1} in this run)",
517 nb_checkpoint_replication, nb_replication);
519 return has_different_sub_domain;
528 Int32 mesh_rank =
mesh->meshPartInfo().partRank();
531 const ItemGroup& all_items = family->allItems();
538 family->notifyItemsOwnerChanged();
548 ISubDomain* sd1 = m_sub_domain;
549 IApplication* app = sd1->application();
550 IParallelMng* pm = sd1->parallelMng();
552 Int32 my_rank = pm->commRank();
553 Int32 nb_rank = pm->commSize();
554 if (nb_rank > nb_old_rank)
555 ARCANE_THROW(NotImplementedException,
"Increasing number of sub-domains (old={0} new={1})",
556 nb_old_rank, nb_rank);
557 UniqueArray<Int32> old_ranks_to_new_ranks(nb_old_rank);
558 UniqueArray<Int32> ranks_to_read;
559 for (
Integer i = 0; i < nb_old_rank; ++i) {
560 Int32 new_rank = i % nb_rank;
561 old_ranks_to_new_ranks[i] = new_rank;
562 if (new_rank == my_rank)
563 ranks_to_read.add(i);
565 info() <<
"OLD_RANKS_TO_NEW_RANKS=" << old_ranks_to_new_ranks;
566 info() <<
"RANKS_TO_READ=" << ranks_to_read;
567 info() <<
"Apply Changing nb sub domain my_rank=" << my_rank;
568 String service_name = ci.serviceName();
571 IParallelMng* pm2 = pm->sequentialParallelMng();
572 UniqueArray<ISubDomain*> sd_to_merge_list2;
573 UniqueArray<Byte> case_bytes;
574 sd1->fillCaseBytes(case_bytes);
576 String message_passing_service =
"SequentialParallelMngContainerFactory";
577 ServiceBuilder<IParallelMngContainerFactory> sf(app);
578 auto pbf = sf.createReference(message_passing_service,
SB_AllowNull);
580 ARCANE_FATAL(
"Can not find service '{0}' implementing IParallelMngContainerFactory", message_passing_service);
581 Ref<IParallelMngContainer> parallel_builder(pbf->_createParallelMngBuilder(1, pm2->communicator(), pm2->machineCommunicator()));
583 for (
Int32 i : ranks_to_read) {
584 info() <<
"Reading Part sub_domain index=" << i;
585 info() <<
"Using the checkpoint/restart service"
586 <<
" <" << service_name <<
"> (implement ICheckpointReader2)";
587 CheckpointInfo checkpoint_info2(ci);
588 checkpoint_info2.setSubDomainRank(i);
589 CheckpointReadInfo cri(checkpoint_info2);
590 cri.setReader(reader);
591 cri.setParallelMng(pm2);
592 bool is_first = (i == my_rank);
593 ISubDomain* sd2 =
nullptr;
598 String file_suffix = String::format(
"s_{0}_{1}", my_rank, i);
599 ITraceMng* tm = app->createAndInitializeTraceMng(sd1->traceMng(), file_suffix);
600 Ref<IParallelMng> sub_pm = parallel_builder->_createParallelMng(0, tm);
601 SubDomainBuildInfo sdbi(sub_pm, i);
602 sdbi.setCaseFileName(sd1->caseFullFileName());
603 sdbi.setCaseBytes(case_bytes);
611 sd2 = arcaneCreateSubDomain(sd1->session(), sdbi);
613 sd2->readCaseMeshes();
615 sd_to_merge_list2.add(sd2);
616 sd2->setIsContinue();
617 sd2->allocateMeshes();
619 sd2->variableMng()->readCheckpoint(cri);
620 sd2->checkpointMng()->readObservable()->notifyAllObservers();
629 VariableCollection vars = sd2->variableMng()->variables();
630 for (VariableCollection::Enumerator ivar(vars); ++ivar;) {
631 IVariable* var = *ivar;
637 if (var->itemFamilyName().null())
640 info() <<
"LIST_VAR name=" << var->fullName();
643 UniqueArray<IMesh*> meshes_to_merge;
644 for (ISubDomain* sd_to_merge : sd_to_merge_list2) {
645 meshes_to_merge.add(sd_to_merge->defaultMesh());
650 _changeItemsOwner(sd1->defaultMesh(), old_ranks_to_new_ranks);
651 for (IMesh* mesh : meshes_to_merge)
652 _changeItemsOwner(mesh, old_ranks_to_new_ranks);
655 IMesh* mesh = sd1->defaultMesh();
660 MeshPartInfo p(makeMeshPartInfoFromParallelMng(mesh->parallelMng()));
661 mesh->toPrimaryMesh()->setMeshPartInfo(p);
#define ARCANE_THROW(exception_class,...)
Macro for throwing an exception with formatting.
#define ARCANE_FATAL(...)
Macro throwing a FatalErrorException.
Integer size() const
Number of elements in the vector.
const T * unguardedBasePointer() const
Information about a checkpoint.
IObservable * m_write_observable
Observable for writing.
void readDefaultCheckpoint() override
Reads a default checkpoint.
IObservable * readObservable() override
Read observable.
void writeCheckpoint(ICheckpointWriter *writer) override
Writes a default checkpoint using the writer.
void _writeCheckpointInfoFile(ICheckpointWriter *checkpoint_writer, ByteArray &infos)
CheckpointInfo readCheckpointInfo(Span< const Byte > infos, const String &buf_name) override
Reads checkpoint information.
IObservable * writeObservable() override
Write observable.
void writeDefaultCheckpoint(ICheckpointWriter *writer) override
Writes a checkpoint using the writer.
bool _checkChangingNbSubDomain(const CheckpointInfo &ci)
Checks if the number of sub-domains has changed between the checkpoint and the current allocation.
void readCheckpoint() override
Reads a checkpoint.
CheckpointInfo readDefaultCheckpointInfo() override
Reads default checkpoint information.
IObservable * m_read_observable
Observable for reading.
Checkpoint reading information.
constexpr Integer size() const noexcept
Number of elements in the array.
Class managing a directory.
String file(const String &file_name) const override
Returns the full path of the file file_name in the directory.
virtual ITraceMng * traceMng() const =0
Trace manager.
virtual IRessourceMng * ressourceMng() const =0
Resource manager.
Interface of the checkpoint information manager.
Interface for the protection/recovery reading service (V2).
Interface for the protection/recovery reading service.
Interface of the checkpoint/recovery write service.
virtual ConstArrayView< Real > checkpointTimes() const =0
Checkpoint times.
virtual String readerServiceName() const =0
Name of the reader service associated with this writer.
virtual String readerMetaData() const =0
Metadata for the reader associated with this writer.
virtual String baseDirectoryName() const =0
Name of the checkpoint base directory.
Interface of the input/output manager.
virtual bool collectiveRead(const String &filename, ByteArray &bytes)=0
Collective reading of a file.
Interface of an entity family.
virtual void mergeMeshes(ConstArrayView< IMesh * > meshes)=0
Merges the meshes of meshes with the current mesh.
virtual IMeshModifier * modifier()=0
Associated modifier interface.
Exception when an input/output error is detected.
Interface of an observable.
virtual void notifyAllObservers()=0
Notifies all observers.
Interface of the parallelism manager for a subdomain.
virtual Int32 commRank() const =0
Rank of this instance in the communicator.
virtual IParallelReplication * replication() const =0
Replication information.
virtual Int32 commSize() const =0
Number of instances in the communicator.
virtual ARCANE_DEPRECATED Integer nbSubDomain() const final
Total number of subdomains.
Brief information on parallel subdomain replication.
virtual Int32 replicationRank() const =0
Rank in the replication (from 0 to nbReplication()-1).
virtual Int32 nbReplication() const =0
Number of replications.
Interface of a resource manager.
virtual IXmlDocumentHolder * createXmlDocument()=0
Creates an XML document node.
Interface for a case execution session.
Interface of the subdomain manager.
virtual IApplication * application()=0
Application.
virtual IParallelMng * parallelMng()=0
Returns the parallelism manager.
virtual IVariableMng * variableMng()=0
Returns the variable manager.
virtual void readCheckpoint(ICheckpointReader *reader)=0
Reads all variables from a checkpoint.
@ PNoDump
Indicates that the variable should not be saved.
static IXmlDocumentHolder * loadFromBuffer(Span< const Byte > buffer, const String &name, ITraceMng *tm)
Loads an XML document.
Base class for a mesh element.
impl::MutableItemBase mutableItemBase() const
Mutable internal part of the entity.
Int32 owner() const
Owner subdomain number of the entity.
Information about a partitioned mesh.
void setOwner(Integer suid, Int32 current_sub_domain)
Sets the sub-domain number of the entity owner.
Exception when a 'parallel' fatal error is generated.
Reference to an instance.
Encapsulation of an automatically destructing pointer.
Utility class for instantiating a service of a given interface.
View of an array of elements of type T.
Unicode character string.
bool null() const
Returns true if the string is null.
const char * localstr() const
Returns the conversion of the instance into UTF-8 encoding.
Parameters necessary for building a subdomain.
TraceAccessor(ITraceMng *m)
Constructs an accessor via the trace manager m.
TraceMessage info() const
Flow for an information message.
XmlNode attr(const String &name, bool throw_exception=false) const
Returns the attribute of name name.
XmlNode documentElement() const
Returns the document element.
String attrValue(const String &name, bool throw_exception=false) const
Value of attribute name.
XmlNode child(const String &name) const
Child node of this node with name name.
void clear()
Deletes all child nodes.
bool null() const
True if the node is null.
Integer valueAsInteger(bool throw_exception=false) const
Node value converted to integer.
void setAttrValue(const String &name, const String &value)
Sets the attribute name to the value value.
-*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
@ SB_AllowNull
Allows the service to be absent.
Int32 Integer
Type representing an integer.
Array< Byte > ByteArray
Dynamic one-dimensional array of characters.
ConstArrayView< Int32 > Int32ConstArrayView
C equivalent of a 1D array of 32-bit integers.
UniqueArray< Byte > ByteUniqueArray
Dynamic 1D array of characters.
double Real
Type representing a real number.
ConstArrayView< Byte > ByteConstArrayView
C equivalent of a 1D array of characters.
std::int32_t Int32
Signed integer type of 32 bits.
ConstArrayView< Real > RealConstArrayView
C equivalent of a 1D array of reals.