Arcane v4.1.10.0
Developer documentation
DistributedMatrix.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* DistributedMatrix.h (C) 2000-2026 */
9/* */
10/* Distributed Matrix using message passing. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ALINA_MPI_DISTRIBUTED_MATRIX_H
13#define ARCCORE_ALINA_MPI_DISTRIBUTED_MATRIX_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16/*
17 * This file is based on the AMGCL library (March 2026 version),
18 * which can be found at https://github.com/ddemidov/amgcl.
19 *
20 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>
21 * SPDX-License-Identifier: MIT
22 */
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
26#include "arccore/alina/AlinaUtils.h"
27#include "arccore/alina/MessagePassingUtils.h"
28
29#include "arccore/accelerator/Atomic.h"
30
31#include <vector>
32#include <algorithm>
#include <atomic> // std::atomic (used in spectral_radius)
#include <numeric> // std::partial_sum (used in transpose)
33
34#include <memory>
35#include <unordered_map>
36#include <random>
37
38#include <mpi.h>
39
40/*---------------------------------------------------------------------------*/
41/*---------------------------------------------------------------------------*/
42
43namespace Arcane::Alina
44{
45
46/*---------------------------------------------------------------------------*/
47/*---------------------------------------------------------------------------*/
48/*!
49 * \brief Class to handle the communication pattern.
50 */
51template <class Backend>
52class CommunicationPattern
53{
54 public:
55
56 typedef typename Backend::value_type value_type;
57 typedef typename math::rhs_of<value_type>::type rhs_type;
58 typedef typename math::scalar_of<value_type>::type scalar_type;
59 typedef typename Backend::matrix matrix;
60 typedef typename Backend::vector vector;
61 typedef typename Backend::params backend_params;
62 typedef typename Backend::col_type col_type;
63 typedef typename Backend::ptr_type ptr_type;
64
65 struct
66 {
67 std::vector<ptrdiff_t> nbr;
68 std::vector<ptr_type> ptr;
69 std::vector<col_type> col;
70
71 size_t count() const
72 {
73 return col.size();
74 }
75
76 mutable std::vector<rhs_type> val;
77 mutable UniqueArray<MessagePassing::Request> req;
78 } send;
79
80 struct
81 {
82 std::vector<ptrdiff_t> nbr;
83 std::vector<ptr_type> ptr;
84
85 size_t count() const
86 {
87 return val.size();
88 }
89
90 mutable std::vector<rhs_type> val;
91 mutable UniqueArray<MessagePassing::Request> req;
92 } recv;
93
94 std::shared_ptr<vector> x_rem;
95
96 CommunicationPattern(mpi_communicator comm,
97 ptrdiff_t n_loc_cols,
98 size_t n_rem_cols, const col_type* p_rem_cols)
99 : comm(comm)
100 , loc_cols(n_loc_cols)
101 {
102 ARCCORE_ALINA_TIC("communication pattern");
103 // Get domain boundaries
104 std::vector<ptrdiff_t> domain = comm.exclusive_sum(n_loc_cols);
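 // For example (hypothetical sizes), with three ranks owning 4, 3 and 5
 // columns, domain = {0, 4, 7, 12}: rank r owns the global columns
 // [domain[r], domain[r+1]) and loc_beg is the first column it owns.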
105 loc_beg = domain[comm.rank];
106
107 // Renumber remote columns,
108 // find out how many remote values we need from each process.
109 std::vector<col_type> rem_cols(p_rem_cols, p_rem_cols + n_rem_cols);
110
111 std::sort(rem_cols.begin(), rem_cols.end());
112 rem_cols.erase(std::unique(rem_cols.begin(), rem_cols.end()), rem_cols.end());
113
114 ptrdiff_t ncols = rem_cols.size();
115 ptrdiff_t rnbr = 0, snbr = 0, send_size = 0;
116
117 {
118 std::vector<int> rcounts(comm.size, 0);
119 std::vector<int> scounts(comm.size);
120
121 // Build index for column renumbering;
122 // count how many domains send us data and how much.
123 idx.reserve(2 * ncols);
124 for (int i = 0, d = 0, last = -1; i < ncols; ++i) {
125 while (rem_cols[i] >= domain[d + 1])
126 ++d;
127
128 ++rcounts[d];
129
130 if (last < d) {
131 last = d;
132 ++rnbr;
133 }
134
135 idx.insert(idx.end(), std::make_pair(rem_cols[i], std::make_tuple(rnbr - 1, i)));
136 }
137
138 recv.val.resize(ncols);
139 recv.req.resize(rnbr);
140
141 recv.nbr.reserve(rnbr);
142 recv.ptr.reserve(rnbr + 1);
143 recv.ptr.push_back(0);
144
145 for (int d = 0; d < comm.size; ++d) {
146 if (rcounts[d]) {
147 recv.nbr.push_back(d);
148 recv.ptr.push_back(recv.ptr.back() + rcounts[d]);
149 }
150 }
151
152 MPI_Alltoall(rcounts.data(), 1, MPI_INT, scounts.data(), 1, MPI_INT, comm);
153
154 for (ptrdiff_t d = 0; d < comm.size; ++d) {
155 if (scounts[d]) {
156 ++snbr;
157 send_size += scounts[d];
158 }
159 }
160
161 send.col.resize(send_size);
162 send.val.resize(send_size);
163 send.req.resize(snbr);
164
165 send.nbr.reserve(snbr);
166 send.ptr.reserve(snbr + 1);
167 send.ptr.push_back(0);
168
169 for (ptrdiff_t d = 0; d < comm.size; ++d) {
170 if (scounts[d]) {
171 send.nbr.push_back(d);
172 send.ptr.push_back(send.ptr.back() + scounts[d]);
173 }
174 }
175 }
176
177 // What columns do you need from me?
178 for (size_t i = 0; i < send.nbr.size(); ++i)
179 send.req[i] = comm.doIReceive(&send.col[send.ptr[i]], send.ptr[i + 1] - send.ptr[i],
180 send.nbr[i], tag_exc_cols);
181
182 // Here is what I need from you:
183 for (size_t i = 0; i < recv.nbr.size(); ++i)
184 recv.req[i] = comm.doISend(&rem_cols[recv.ptr[i]], recv.ptr[i + 1] - recv.ptr[i],
185 recv.nbr[i], tag_exc_cols);
186
187 ARCCORE_ALINA_TIC("MPI Wait");
188 comm.waitAll(recv.req);
189 comm.waitAll(send.req);
190 ARCCORE_ALINA_TOC("MPI Wait");
191
192 // Shift columns to send to local numbering:
193 for (col_type& c : send.col)
194 c -= loc_beg;
195
196 ARCCORE_ALINA_TOC("communication pattern");
197 }
198
199 template <class OtherBackend>
200 CommunicationPattern(const CommunicationPattern<OtherBackend>& C)
201 : comm(C.comm)
202 , idx(C.idx)
203 , loc_beg(C.loc_beg)
204 , loc_cols(C.loc_cols)
205 {
206 send.nbr = C.send.nbr;
207 send.ptr = C.send.ptr;
208 send.col = C.send.col;
209 send.val.resize(C.send.val.size());
210 send.req.resize(C.send.req.size());
211
212 recv.nbr = C.recv.nbr;
213 recv.ptr = C.recv.ptr;
214 recv.val.resize(C.recv.val.size());
215 recv.req.resize(C.recv.req.size());
216 }
217
218 void move_to_backend(const backend_params& bprm = backend_params())
219 {
220 if (!x_rem) {
221 x_rem = Backend::create_vector(recv.count(), bprm);
222 }
223
224 if (!gather) {
225 gather = std::make_shared<Gather>(loc_cols, send.col, bprm);
226 }
227 }
228
229 int domain(ptrdiff_t col) const
230 {
231 return std::get<0>(idx.at(col));
232 }
233
234 int local_index(ptrdiff_t col) const
235 {
236 return std::get<1>(idx.at(col));
237 }
238
239 std::tuple<int, int> remote_info(ptrdiff_t col) const
240 {
241 return idx.at(col);
242 }
243
244 std::unordered_map<ptrdiff_t, std::tuple<int, int>>::const_iterator
245 remote_begin() const
246 {
247 return idx.cbegin();
248 }
249
250 std::unordered_map<ptrdiff_t, std::tuple<int, int>>::const_iterator
251 remote_end() const
252 {
253 return idx.cend();
254 }
255
256 size_t renumber(size_t n, col_type* col) const
257 {
258 for (size_t i = 0; i < n; ++i)
259 col[i] = std::get<1>(idx.at(col[i]));
260 return recv.count();
261 }
262
263 bool needs_remote() const
264 {
265 return !recv.val.empty();
266 }
267
268 template <class Vector>
269 void start_exchange(const Vector& x) const
270 {
271 // Start receiving ghost values from our neighbours.
272 for (size_t i = 0; i < recv.nbr.size(); ++i)
273 recv.req[i] = comm.doIReceive(&recv.val[recv.ptr[i]], recv.ptr[i + 1] - recv.ptr[i],
274 recv.nbr[i], tag_exc_vals);
275
276 // Start sending our data to neighbours.
277 if (!send.val.empty()) {
278 (*gather)(x, send.val);
279
280 for (size_t i = 0; i < send.nbr.size(); ++i)
281 send.req[i] = comm.doISend(&send.val[send.ptr[i]], send.ptr[i + 1] - send.ptr[i],
282 send.nbr[i], tag_exc_vals);
283 }
284 }
285
286 void finish_exchange() const
287 {
288 ARCCORE_ALINA_TIC("MPI Wait");
289 comm.waitAll(recv.req);
290 comm.waitAll(send.req);
291 ARCCORE_ALINA_TOC("MPI Wait");
292
293 if (!recv.val.empty())
294 backend::copy(recv.val, *x_rem);
295 }
296
297 template <typename T>
298 void exchange(const T* send_val, T* recv_val) const
299 {
300 for (size_t i = 0; i < recv.nbr.size(); ++i)
301 recv.req[i] = comm.doIReceive(&recv_val[recv.ptr[i]], recv.ptr[i + 1] - recv.ptr[i],
302 recv.nbr[i], tag_exc_vals);
303
304 for (size_t i = 0; i < send.nbr.size(); ++i)
305 send.req[i] = comm.doISend(const_cast<T*>(&send_val[send.ptr[i]]), send.ptr[i + 1] - send.ptr[i],
306 send.nbr[i], tag_exc_vals);
307
308 ARCCORE_ALINA_TIC("MPI Wait");
309 comm.waitAll(recv.req);
310 comm.waitAll(send.req);
311 ARCCORE_ALINA_TOC("MPI Wait");
312 }
313
314 mpi_communicator mpi_comm() const
315 {
316 return comm;
317 }
318
319 ptrdiff_t loc_col_shift() const
320 {
321 return loc_beg;
322 }
323
324 private:
325
326 using Gather = Backend::gather;
327
328 static const int tag_set_comm = 1001;
329 static const int tag_exc_cols = 1002;
330 static const int tag_exc_vals = 1003;
331
332 mpi_communicator comm;
333
334 std::unordered_map<ptrdiff_t, std::tuple<int, int>> idx;
335 std::shared_ptr<Gather> gather;
336 ptrdiff_t loc_beg;
337 ptrdiff_t loc_cols;
338
339 template <class B>
340 friend class CommunicationPattern;
341};
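/*!
 * A minimal usage sketch of the exchange sequence this class supports; it
 * mirrors what DistributedMatrix::mul() below does. The names pattern, A_loc,
 * A_rem, x, y, alpha, beta and one are hypothetical; the member functions are
 * the ones defined above.
 *
 * \code
 * pattern.start_exchange(x);                  // post ghost sends/receives
 * backend::spmv(alpha, A_loc, x, beta, y);    // local work overlaps communication
 * pattern.finish_exchange();                  // wait, then copy ghosts into pattern.x_rem
 * if (pattern.needs_remote())
 *   backend::spmv(alpha, A_rem, *pattern.x_rem, one, y);
 * \endcode
 */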
342
343/*---------------------------------------------------------------------------*/
344/*---------------------------------------------------------------------------*/
345/*!
346 * \brief Distributed matrix using message passing.
347 */
348template <class Backend>
349class DistributedMatrix
350{
351 public:
352
353 typedef typename Backend::value_type value_type;
354 typedef typename math::rhs_of<value_type>::type rhs_type;
355 typedef typename math::scalar_of<value_type>::type scalar_type;
356 typedef typename Backend::params backend_params;
357 typedef typename Backend::matrix matrix;
358 typedef CommunicationPattern<Backend> CommPattern;
359 typedef typename Backend::matrix build_matrix;
360
361 DistributedMatrix(mpi_communicator comm,
362 std::shared_ptr<build_matrix> a_loc,
363 std::shared_ptr<build_matrix> a_rem,
364 std::shared_ptr<CommPattern> c = std::shared_ptr<CommPattern>())
365 : a_loc(a_loc)
366 , a_rem(a_rem)
367 {
368 if (c) {
369 C = c;
370 }
371 else {
372 C = std::make_shared<CommPattern>(comm, a_loc->ncols, a_rem->nbNonZero(), a_rem->col);
373 }
374
375 a_rem->ncols = C->recv.count();
376
377 n_loc_rows = a_loc->nbRow();
378 n_loc_cols = a_loc->ncols;
379 n_loc_nonzeros = a_loc->nbNonZero() + a_rem->nbNonZero();
380
381 n_glob_rows = comm.reduceSum(n_loc_rows);
382 n_glob_cols = comm.reduceSum(n_loc_cols);
383 n_glob_nonzeros = comm.reduceSum(n_loc_nonzeros);
384 }
385
386 // Copy the distributed_matrix from another backend
387 template <class OtherBackend>
388 DistributedMatrix(const DistributedMatrix<OtherBackend>& A)
389 : a_loc(std::make_shared<build_matrix>(*A.local()))
390 , a_rem(std::make_shared<build_matrix>(*A.remote()))
391 {
392 C = std::make_shared<CommPattern>(A.cpat());
393
394 this->a_rem->ncols = C->recv.count();
395
396 n_loc_rows = A.loc_rows();
397 n_loc_cols = A.loc_cols();
398 n_loc_nonzeros = A.loc_nonzeros();
399 n_glob_rows = A.glob_rows();
400 n_glob_cols = A.glob_cols();
401 n_glob_nonzeros = A.glob_nonzeros();
402 }
403
404 template <class Matrix>
405 DistributedMatrix(mpi_communicator comm,
406 const Matrix& A,
407 ptrdiff_t _n_loc_cols = -1)
408 : n_loc_rows(backend::nbRow(A))
409 , n_loc_cols(_n_loc_cols < 0 ? n_loc_rows : _n_loc_cols)
410 , n_loc_nonzeros(backend::nonzeros(A))
411 {
412 // Get sizes of each domain in comm.
413 std::vector<ptrdiff_t> domain = comm.exclusive_sum(n_loc_cols);
414 ptrdiff_t loc_beg = domain[comm.rank];
415 ptrdiff_t loc_end = domain[comm.rank + 1];
416
417 n_glob_cols = domain.back();
418 n_glob_rows = comm.reduceSum(n_loc_rows);
419 n_glob_nonzeros = comm.reduceSum(n_loc_nonzeros);
420
421 // Split the matrix into local and remote parts.
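 // Two passes over the rows: the first parallel loop only counts, for each
 // row, how many entries fall inside [loc_beg, loc_end) (local block) or
 // outside it (remote block); scan_row_sizes() then turns those counts into
 // CSR row offsets, and the second loop copies columns and values into place.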
422 a_loc = std::make_shared<build_matrix>();
423 a_rem = std::make_shared<build_matrix>();
424
425 build_matrix& A_loc = *a_loc;
426 build_matrix& A_rem = *a_rem;
427
428 A_loc.set_size(n_loc_rows, n_loc_cols, true);
429 A_rem.set_size(n_loc_rows, 0, true);
430
431 arccoreParallelFor(0, n_loc_rows, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
432 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
433 for (auto a = backend::row_begin(A, i); a; ++a) {
434 ptrdiff_t c = a.col();
435
436 if (loc_beg <= c && c < loc_end)
437 ++A_loc.ptr[i + 1];
438 else
439 ++A_rem.ptr[i + 1];
440 }
441 }
442 });
443
444 A_loc.set_nonzeros(A_loc.scan_row_sizes());
445 A_rem.set_nonzeros(A_rem.scan_row_sizes());
446
447 arccoreParallelFor(0, n_loc_rows, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
448 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
449 ptrdiff_t loc_head = A_loc.ptr[i];
450 ptrdiff_t rem_head = A_rem.ptr[i];
451
452 for (auto a = backend::row_begin(A, i); a; ++a) {
453 ptrdiff_t c = a.col();
454 value_type v = a.value();
455
456 if (loc_beg <= c && c < loc_end) {
457 A_loc.col[loc_head] = c - loc_beg;
458 A_loc.val[loc_head] = v;
459 ++loc_head;
460 }
461 else {
462 A_rem.col[rem_head] = c;
463 A_rem.val[rem_head] = v;
464 ++rem_head;
465 }
466 }
467 }
468 });
469
470 C = std::make_shared<CommPattern>(comm, n_loc_cols, a_rem->nbNonZero(), a_rem->col);
471 a_rem->ncols = C->recv.count();
472 }
473
474 mpi_communicator comm() const
475 {
476 return C->mpi_comm();
477 }
478
479 std::shared_ptr<build_matrix> local() const
480 {
481 return a_loc;
482 }
483
484 std::shared_ptr<build_matrix> remote() const
485 {
486 return a_rem;
487 }
488
489 std::shared_ptr<matrix> local_backend() const
490 {
491 return A_loc;
492 }
493
494 std::shared_ptr<matrix> remote_backend() const
495 {
496 return A_rem;
497 }
498
499 ptrdiff_t loc_rows() const
500 {
501 return n_loc_rows;
502 }
503
504 ptrdiff_t loc_cols() const
505 {
506 return n_loc_cols;
507 }
508
509 ptrdiff_t loc_col_shift() const
510 {
511 return C->loc_col_shift();
512 }
513
514 ptrdiff_t loc_nonzeros() const
515 {
516 return n_loc_nonzeros;
517 }
518
519 ptrdiff_t glob_rows() const
520 {
521 return n_glob_rows;
522 }
523
524 ptrdiff_t glob_cols() const
525 {
526 return n_glob_cols;
527 }
528
529 ptrdiff_t glob_nonzeros() const
530 {
531 return n_glob_nonzeros;
532 }
533
534 const CommunicationPattern<Backend>& cpat() const
535 {
536 return *C;
537 }
538
539 void set_local(std::shared_ptr<matrix> a)
540 {
541 A_loc = a;
542 }
543
544 void move_to_backend(const backend_params& bprm = backend_params(), bool keep_src = false)
545 {
546 ARCCORE_ALINA_TIC("move to backend");
547 if (!A_loc) {
548 A_loc = Backend::copy_matrix(a_loc, bprm);
549 }
550
551 if (!A_rem && a_rem && a_rem->nbNonZero() > 0) {
552 if (keep_src) {
553 auto rem_copy = std::make_shared<build_matrix>(*a_rem);
554 C->renumber(rem_copy->nbNonZero(), rem_copy->col);
555 A_rem = Backend::copy_matrix(rem_copy, bprm);
556 }
557 else {
558 C->renumber(a_rem->nbNonZero(), a_rem->col);
559 A_rem = Backend::copy_matrix(a_rem, bprm);
560 }
561 }
562
563 C->move_to_backend(bprm);
564
565 if (!keep_src) {
566 a_loc.reset();
567 a_rem.reset();
568 }
569 ARCCORE_ALINA_TOC("move to backend");
570 }
571
572 template <class A, class VecX, class B, class VecY>
573 void mul(A alpha, const VecX& x, B beta, VecY& y) const
574 {
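 // Computes y = alpha * A * x + beta * y as
 // alpha * (A_loc * x + A_rem * x_ghost) + beta * y; the ghost values of x
 // are exchanged while the local part of the product is being computed.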
575 const auto one = math::identity<scalar_type>();
576
577 C->start_exchange(x);
578
579 // Compute local part of the product.
580 backend::spmv(alpha, *A_loc, x, beta, y);
581
582 // Compute remote part of the product.
583 C->finish_exchange();
584
585 if (C->needs_remote())
586 backend::spmv(alpha, *A_rem, *C->x_rem, one, y);
587 }
588
589 template <class Vec1, class Vec2, class Vec3>
590 void residual(const Vec1& f, const Vec2& x, Vec3& r) const
591 {
592 const auto one = math::identity<scalar_type>();
593
594 C->start_exchange(x);
595 backend::residual(f, *A_loc, x, r);
596
597 C->finish_exchange();
598
599 if (C->needs_remote())
600 backend::spmv(-one, *A_rem, *C->x_rem, one, r);
601 }
602
603 private:
604
605 std::shared_ptr<CommPattern> C;
606 std::shared_ptr<matrix> A_loc, A_rem;
607 std::shared_ptr<build_matrix> a_loc, a_rem;
608
609 ptrdiff_t n_loc_rows, n_glob_rows;
610 ptrdiff_t n_loc_cols, n_glob_cols;
611 ptrdiff_t n_loc_nonzeros, n_glob_nonzeros;
612};
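/*!
 * A minimal usage sketch, assuming a backend type Bk whose matrix type follows
 * the CSR interface used above; comm, a, x and y are hypothetical variables
 * (an mpi_communicator, a locally assembled matrix with global column indices,
 * and two backend vectors).
 *
 * \code
 * DistributedMatrix<Bk> A(comm, a); // off-process columns become the remote part
 * A.move_to_backend();              // build backend matrices and the gather object
 * A.mul(2.0, x, 0.0, y);            // y = 2*A*x, ghost values exchanged internally
 * \endcode
 */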
613
614/*---------------------------------------------------------------------------*/
615/*---------------------------------------------------------------------------*/
616
617template <class Backend>
618std::shared_ptr<DistributedMatrix<Backend>>
619transpose(const DistributedMatrix<Backend>& A)
620{
621 ARCCORE_ALINA_TIC("MPI Transpose");
622 typedef typename Backend::value_type value_type;
623 typedef CommunicationPattern<Backend> CommPattern;
624 typedef typename Backend::matrix build_matrix;
625 typedef typename Backend::col_type col_type;
626
627 static const int tag_cnt = 2001;
628 static const int tag_col = 2002;
629 static const int tag_val = 2003;
630
631 mpi_communicator comm = A.comm();
632 const CommPattern& C = A.cpat();
633
634 build_matrix& A_loc = *A.local();
635 build_matrix& A_rem = *A.remote();
636
637 ptrdiff_t nrows = A_loc.ncols;
638 ptrdiff_t ncols = A_loc.nbRow();
639
640 UniqueArray<MessagePassing::Request> recv_cnt_req(C.send.req.size());
641 UniqueArray<MessagePassing::Request> recv_col_req(C.send.req.size());
642 UniqueArray<MessagePassing::Request> recv_val_req(C.send.req.size());
643
644 UniqueArray<MessagePassing::Request> send_cnt_req(C.recv.req.size());
645 UniqueArray<MessagePassing::Request> send_col_req(C.recv.req.size());
646 UniqueArray<MessagePassing::Request> send_val_req(C.recv.req.size());
647
648 // Our transposed remote part becomes remote part of someone else,
649 // and the other way around.
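 // Concretely: each rank transposes its remote block (with columns renumbered
 // to the compact ghost indexing), sends the resulting rows back to the owners
 // of those ghost columns, and receives from its neighbours the pieces that
 // form its own transposed remote part.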
650 std::shared_ptr<build_matrix> t_ptr;
651 {
652 std::vector<col_type> tmp_col(A_rem.col.data(), A_rem.col.data() + A_rem.nbNonZero());
653 C.renumber(tmp_col.size(), tmp_col.data());
654
655 col_type* a_rem_col = tmp_col.data();
656 col_type* a_rem_col_backup = A_rem.col.data();
657 A_rem.col.setPointerZeroCopy(a_rem_col);
658
659 //std::swap(a_rem_col, A_rem.col);
660
661 t_ptr = transpose(A_rem);
662
663 A_rem.col.setPointerZeroCopy(a_rem_col_backup);
664 //std::swap(a_rem_col, A_rem.col);
665 }
666 build_matrix& t_rem = *t_ptr;
667
668 // Shift to global numbering:
669 std::vector<ptrdiff_t> domain = comm.exclusive_sum(ncols);
670 ptrdiff_t loc_beg = domain[comm.rank];
671 for (size_t i = 0; i < t_rem.nbNonZero(); ++i)
672 t_rem.col[i] += loc_beg;
673
674 // Shift from row pointers to row sizes:
675 std::vector<ptrdiff_t> row_size(t_rem.nbRow());
676 for (size_t i = 0; i < t_rem.nbRow(); ++i)
677 row_size[i] = t_rem.ptr[i + 1] - t_rem.ptr[i];
678
679 // Sizes of transposed remote blocks:
680 // 1. Exchange rem_ptr
681 std::vector<ptrdiff_t> rem_ptr(C.send.count() + 1);
682 rem_ptr[0] = 0;
683
684 for (size_t i = 0; i < C.send.nbr.size(); ++i) {
685 ptrdiff_t beg = C.send.ptr[i];
686 ptrdiff_t end = C.send.ptr[i + 1];
687
688 recv_cnt_req[i] = comm.doIReceive(&rem_ptr[beg + 1], end - beg, C.send.nbr[i], tag_cnt);
689 }
690
691 for (size_t i = 0; i < C.recv.nbr.size(); ++i) {
692 ptrdiff_t beg = C.recv.ptr[i];
693 ptrdiff_t end = C.recv.ptr[i + 1];
694
695 send_cnt_req[i] = comm.doISend(&row_size[beg], end - beg, C.recv.nbr[i], tag_cnt);
696 }
697
698 ARCCORE_ALINA_TIC("MPI Wait");
699 comm.waitAll(recv_cnt_req);
700 ARCCORE_ALINA_TOC("MPI Wait");
701 std::partial_sum(rem_ptr.begin(), rem_ptr.end(), rem_ptr.begin());
702
703 // 2. Start exchange of rem_col, rem_val
704 std::vector<col_type> rem_col(rem_ptr.back());
705 std::vector<value_type> rem_val(rem_ptr.back());
706
707 for (size_t i = 0; i < C.send.nbr.size(); ++i) {
708 ptrdiff_t rbeg = C.send.ptr[i];
709 ptrdiff_t rend = C.send.ptr[i + 1];
710
711 ptrdiff_t cbeg = rem_ptr[rbeg];
712 ptrdiff_t cend = rem_ptr[rend];
713
714 recv_col_req[i] = comm.doIReceive(&rem_col[cbeg], cend - cbeg, C.send.nbr[i], tag_col);
715 recv_val_req[i] = comm.doIReceive(&rem_val[cbeg], cend - cbeg, C.send.nbr[i], tag_val);
716 }
717
718 for (size_t i = 0; i < C.recv.nbr.size(); ++i) {
719 ptrdiff_t rbeg = C.recv.ptr[i];
720 ptrdiff_t rend = C.recv.ptr[i + 1];
721
722 ptrdiff_t cbeg = t_rem.ptr[rbeg];
723 ptrdiff_t cend = t_rem.ptr[rend];
724
725 send_col_req[i] = comm.doISend(&t_rem.col[cbeg], cend - cbeg, C.recv.nbr[i], tag_col);
726 send_val_req[i] = comm.doISend(&t_rem.val[cbeg], cend - cbeg, C.recv.nbr[i], tag_val);
727 }
728
729 // 3. While rem_col and rem_val are in flight,
730 // start constructing our remote part:
731 auto T_ptr = std::make_shared<build_matrix>();
732 build_matrix& T_rem = *T_ptr;
733 T_rem.set_size(nrows, 0, true);
734
735 for (size_t i = 0; i < C.send.count(); ++i)
736 T_rem.ptr[1 + C.send.col[i]] += rem_ptr[i + 1] - rem_ptr[i];
737
738 T_rem.scan_row_sizes();
739 T_rem.set_nonzeros();
740
741 // 4. Finish rem_col and rem_val exchange, and
742 // finish construction of our remote part.
743 ARCCORE_ALINA_TIC("MPI Wait");
744 comm.waitAll(recv_col_req);
745 comm.waitAll(recv_val_req);
746 ARCCORE_ALINA_TOC("MPI Wait");
747
748 for (size_t i = 0; i < C.send.count(); ++i) {
749 ptrdiff_t row = C.send.col[i];
750 ptrdiff_t head = T_rem.ptr[row];
751
752 for (ptrdiff_t j = rem_ptr[i]; j < rem_ptr[i + 1]; ++j, ++head) {
753 T_rem.col[head] = rem_col[j];
754 T_rem.val[head] = rem_val[j];
755 }
756
757 T_rem.ptr[row] = head;
758 }
759
760 std::rotate(T_rem.ptr.data(), T_rem.ptr.data() + nrows, T_rem.ptr.data() + nrows + 1);
761 T_rem.ptr[0] = 0;
762
763 ARCCORE_ALINA_TIC("MPI Wait");
764 comm.waitAll(send_cnt_req);
765 comm.waitAll(send_col_req);
766 comm.waitAll(send_val_req);
767 ARCCORE_ALINA_TOC("MPI Wait");
768
769 ARCCORE_ALINA_TOC("MPI Transpose");
770
771 return std::make_shared<DistributedMatrix<Backend>>(comm, transpose(A_loc), T_ptr);
772}
773
774/*---------------------------------------------------------------------------*/
775/*---------------------------------------------------------------------------*/
776
777template <class Backend>
778std::shared_ptr<typename Backend::matrix>
779remote_rows(const CommunicationPattern<Backend>& C,
780 const DistributedMatrix<Backend>& B,
781 bool need_values = true)
782{
783 typedef typename Backend::matrix build_matrix;
784
785 static const int tag_ptr = 3001;
786 static const int tag_col = 3002;
787 static const int tag_val = 3003;
788
789 ARCCORE_ALINA_TIC("remote_rows");
790 mpi_communicator comm = C.mpi_comm();
791
792 build_matrix& B_loc = *B.local();
793 build_matrix& B_rem = *B.remote();
794 ptrdiff_t B_beg = B.loc_col_shift();
795
796 size_t nrecv = C.recv.nbr.size();
797 size_t nsend = C.send.nbr.size();
798
799 // Create blocked matrix to send to each domain
800 // that needs data from us:
801 UniqueArray<MessagePassing::Request> send_ptr_req(nsend);
802 UniqueArray<MessagePassing::Request> send_col_req(nsend);
803 UniqueArray<MessagePassing::Request> send_val_req(nsend);
804
805 std::vector<build_matrix> send_rows(nsend);
806
807 for (size_t k = 0; k < nsend; ++k) {
808 ptrdiff_t beg = C.send.ptr[k];
809 ptrdiff_t end = C.send.ptr[k + 1];
810
811 build_matrix& m = send_rows[k];
812 m.set_size(end - beg, 0, false);
813
814 size_t nnz = 0;
815 for (ptrdiff_t i = 0, ii = beg; ii < end; ++i, ++ii) {
816 ptrdiff_t r = C.send.col[ii];
817
818 ptrdiff_t w = (B_loc.ptr[r + 1] - B_loc.ptr[r]) + (B_rem.ptr[r + 1] - B_rem.ptr[r]);
819
820 m.ptr[i] = w;
821 nnz += w;
822 }
823 m.setNbNonZero(nnz);
824
825 send_ptr_req[k] = comm.doISend(m.ptr.data(), m.nbRow(), C.send.nbr[k], tag_ptr);
826
827 m.set_nonzeros(nnz, need_values);
828
829 for (ptrdiff_t i = 0, ii = beg, head = 0; ii < end; ++i, ++ii) {
830 ptrdiff_t r = C.send.col[ii];
831
832 // Contribution of the local part:
833 for (ptrdiff_t j = B_loc.ptr[r]; j < B_loc.ptr[r + 1]; ++j) {
834 m.col[head] = B_loc.col[j] + B_beg;
835
836 if (need_values)
837 m.val[head] = B_loc.val[j];
838
839 ++head;
840 }
841
842 // Contribution of the remote part:
843 for (ptrdiff_t j = B_rem.ptr[r]; j < B_rem.ptr[r + 1]; ++j) {
844 m.col[head] = B_rem.col[j];
845
846 if (need_values)
847 m.val[head] = B_rem.val[j];
848
849 ++head;
850 }
851 }
852
853 send_col_req[k] = comm.doISend(m.col.data(), m.nbNonZero(), C.send.nbr[k], tag_col);
854 if (need_values)
855 send_val_req[k] = comm.doISend(m.val.data(), m.nbNonZero(), C.send.nbr[k], tag_val);
856 }
857
858 // Receive rows of B in block format from our neighbors:
859 UniqueArray<MessagePassing::Request> recv_ptr_req(nrecv);
860 UniqueArray<MessagePassing::Request> recv_col_req(nrecv);
861 UniqueArray<MessagePassing::Request> recv_val_req(nrecv);
862
863 auto B_nbr = std::make_shared<build_matrix>();
864 B_nbr->set_size(C.recv.count(), 0, false);
865 B_nbr->ptr[0] = 0;
866
867 for (size_t k = 0; k < nrecv; ++k) {
868 ptrdiff_t beg = C.recv.ptr[k];
869 ptrdiff_t end = C.recv.ptr[k + 1];
870
871 recv_ptr_req[k] = comm.doIReceive(&B_nbr->ptr[beg + 1], end - beg, C.recv.nbr[k], tag_ptr);
872 }
873
874 ARCCORE_ALINA_TIC("MPI Wait");
875 comm.waitAll(recv_ptr_req);
876 ARCCORE_ALINA_TOC("MPI Wait");
877
878 B_nbr->set_nonzeros(B_nbr->scan_row_sizes(), need_values);
879
880 for (size_t k = 0; k < nrecv; ++k) {
881 ptrdiff_t rbeg = C.recv.ptr[k];
882 ptrdiff_t rend = C.recv.ptr[k + 1];
883
884 ptrdiff_t cbeg = B_nbr->ptr[rbeg];
885 ptrdiff_t cend = B_nbr->ptr[rend];
886
887 recv_col_req[k] = comm.doIReceive(&B_nbr->col[cbeg], cend - cbeg, C.recv.nbr[k], tag_col);
888
889 if (need_values)
890 recv_val_req[k] = comm.doIReceive(&B_nbr->val[cbeg], cend - cbeg, C.recv.nbr[k], tag_val);
891 }
892
893 ARCCORE_ALINA_TIC("MPI Wait");
894 comm.waitAll(send_ptr_req);
895 comm.waitAll(send_col_req);
896 comm.waitAll(recv_col_req);
897
898 if (need_values) {
899 comm.waitAll(send_val_req);
900 comm.waitAll(recv_val_req);
901 }
902 ARCCORE_ALINA_TOC("MPI Wait");
903
904 ARCCORE_ALINA_TOC("remote_rows");
905 return B_nbr;
906}
907
908/*---------------------------------------------------------------------------*/
909/*---------------------------------------------------------------------------*/
910
911template <class Backend>
912std::shared_ptr<DistributedMatrix<Backend>>
913product(const DistributedMatrix<Backend>& A, const DistributedMatrix<Backend>& B)
914{
915 typedef typename Backend::value_type value_type;
916 using build_matrix = Backend::matrix;
917 typedef typename Backend::col_type col_type;
918 ARCCORE_ALINA_TIC("product");
919
920 const CommunicationPattern<Backend>& Acp = A.cpat();
921
922 build_matrix& A_loc = *A.local();
923 build_matrix& A_rem = *A.remote();
924 build_matrix& B_loc = *B.local();
925 build_matrix& B_rem = *B.remote();
926
927 ptrdiff_t A_rows = A.loc_rows();
928 ptrdiff_t B_cols = B.loc_cols();
929
930 ptrdiff_t B_beg = B.loc_col_shift();
931 ptrdiff_t B_end = B_beg + B_cols;
932
933 auto b_nbr = remote_rows(Acp, B);
934 build_matrix& B_nbr = *b_nbr;
935
936 // Build mapping from global to local column numbers in the remote part of
937 // the product matrix.
938 std::vector<col_type> rem_cols(B_rem.nbNonZero() + B_nbr.nbNonZero());
939
940 std::copy(B_nbr.col.data(), B_nbr.col.data() + B_nbr.nbNonZero(),
941 std::copy(B_rem.col.data(), B_rem.col.data() + B_rem.nbNonZero(), rem_cols.begin()));
942
943 std::sort(rem_cols.begin(), rem_cols.end());
944 rem_cols.erase(std::unique(rem_cols.begin(), rem_cols.end()), rem_cols.end());
945
946 ptrdiff_t n_rem_cols = 0;
947 std::unordered_map<ptrdiff_t, int> rem_idx(2 * rem_cols.size());
948 for (ptrdiff_t c : rem_cols) {
949 if (c >= B_beg && c < B_end)
950 continue;
951 rem_idx[c] = n_rem_cols++;
952 }
953
954 // Build the product.
955 auto c_loc = std::make_shared<build_matrix>();
956 auto c_rem = std::make_shared<build_matrix>();
957
958 build_matrix& C_loc = *c_loc;
959 build_matrix& C_rem = *c_rem;
960
961 C_loc.set_size(A_rows, B_cols, false);
962 C_rem.set_size(A_rows, 0, false);
963
964 C_loc.ptr[0] = 0;
965 C_rem.ptr[0] = 0;
966
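 // Two-phase sparse matrix product: the "analyze" pass counts, per row of the
 // result, how many distinct local and remote columns appear (using marker
 // arrays), so that CSR row offsets can be allocated; the "compute" pass then
 // fills the column indices and values.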
967 ARCCORE_ALINA_TIC("analyze");
968 arccoreParallelFor(0, A_rows, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
969 std::vector<ptrdiff_t> loc_marker(B_end - B_beg, -1);
970 std::vector<ptrdiff_t> rem_marker(n_rem_cols, -1);
971
972 for (ptrdiff_t ia = begin; ia < (begin + size); ++ia) {
973 ptrdiff_t loc_cols = 0;
974 ptrdiff_t rem_cols = 0;
975
976 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
977 ptrdiff_t ca = A_loc.col[ja];
978
979 for (ptrdiff_t jb = B_loc.ptr[ca], eb = B_loc.ptr[ca + 1]; jb < eb; ++jb) {
980 ptrdiff_t cb = B_loc.col[jb];
981
982 if (loc_marker[cb] != ia) {
983 loc_marker[cb] = ia;
984 ++loc_cols;
985 }
986 }
987
988 for (ptrdiff_t jb = B_rem.ptr[ca], eb = B_rem.ptr[ca + 1]; jb < eb; ++jb) {
989 ptrdiff_t cb = rem_idx[B_rem.col[jb]];
990
991 if (rem_marker[cb] != ia) {
992 rem_marker[cb] = ia;
993 ++rem_cols;
994 }
995 }
996 }
997
998 for (ptrdiff_t ja = A_rem.ptr[ia], ea = A_rem.ptr[ia + 1]; ja < ea; ++ja) {
999 ptrdiff_t ca = Acp.local_index(A_rem.col[ja]);
1000
1001 for (ptrdiff_t jb = B_nbr.ptr[ca], eb = B_nbr.ptr[ca + 1]; jb < eb; ++jb) {
1002 ptrdiff_t cb = B_nbr.col[jb];
1003
1004 if (cb >= B_beg && cb < B_end) {
1005 cb -= B_beg;
1006
1007 if (loc_marker[cb] != ia) {
1008 loc_marker[cb] = ia;
1009 ++loc_cols;
1010 }
1011 }
1012 else {
1013 cb = rem_idx[cb];
1014
1015 if (rem_marker[cb] != ia) {
1016 rem_marker[cb] = ia;
1017 ++rem_cols;
1018 }
1019 }
1020 }
1021 }
1022
1023 C_loc.ptr[ia + 1] = loc_cols;
1024 C_rem.ptr[ia + 1] = rem_cols;
1025 }
1026 });
1027 ARCCORE_ALINA_TOC("analyze");
1028
1029 C_loc.set_nonzeros(C_loc.scan_row_sizes());
1030 C_rem.set_nonzeros(C_rem.scan_row_sizes());
1031
1032 ARCCORE_ALINA_TIC("compute");
1033 arccoreParallelFor(0, A_rows, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1034 std::vector<ptrdiff_t> loc_marker(B_end - B_beg, -1);
1035 std::vector<ptrdiff_t> rem_marker(n_rem_cols, -1);
1036
1037 for (ptrdiff_t ia = begin; ia < (begin + size); ++ia) {
1038 ptrdiff_t loc_beg = C_loc.ptr[ia];
1039 ptrdiff_t rem_beg = C_rem.ptr[ia];
1040 ptrdiff_t loc_end = loc_beg;
1041 ptrdiff_t rem_end = rem_beg;
1042
1043 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
1044 ptrdiff_t ca = A_loc.col[ja];
1045 value_type va = A_loc.val[ja];
1046
1047 for (ptrdiff_t jb = B_loc.ptr[ca], eb = B_loc.ptr[ca + 1]; jb < eb; ++jb) {
1048 ptrdiff_t cb = B_loc.col[jb];
1049 value_type vb = B_loc.val[jb];
1050
1051 if (loc_marker[cb] < loc_beg) {
1052 loc_marker[cb] = loc_end;
1053
1054 C_loc.col[loc_end] = cb;
1055 C_loc.val[loc_end] = va * vb;
1056
1057 ++loc_end;
1058 }
1059 else {
1060 C_loc.val[loc_marker[cb]] += va * vb;
1061 }
1062 }
1063
1064 for (ptrdiff_t jb = B_rem.ptr[ca], eb = B_rem.ptr[ca + 1]; jb < eb; ++jb) {
1065 ptrdiff_t gb = B_rem.col[jb];
1066 ptrdiff_t cb = rem_idx[gb];
1067 value_type vb = B_rem.val[jb];
1068
1069 if (rem_marker[cb] < rem_beg) {
1070 rem_marker[cb] = rem_end;
1071
1072 C_rem.col[rem_end] = gb;
1073 C_rem.val[rem_end] = va * vb;
1074
1075 ++rem_end;
1076 }
1077 else {
1078 C_rem.val[rem_marker[cb]] += va * vb;
1079 }
1080 }
1081 }
1082
1083 for (ptrdiff_t ja = A_rem.ptr[ia], ea = A_rem.ptr[ia + 1]; ja < ea; ++ja) {
1084 ptrdiff_t ca = Acp.local_index(A_rem.col[ja]);
1085 value_type va = A_rem.val[ja];
1086
1087 for (ptrdiff_t jb = B_nbr.ptr[ca], eb = B_nbr.ptr[ca + 1]; jb < eb; ++jb) {
1088 ptrdiff_t gb = B_nbr.col[jb];
1089 value_type vb = B_nbr.val[jb];
1090
1091 if (gb >= B_beg && gb < B_end) {
1092 ptrdiff_t cb = gb - B_beg;
1093
1094 if (loc_marker[cb] < loc_beg) {
1095 loc_marker[cb] = loc_end;
1096
1097 C_loc.col[loc_end] = cb;
1098 C_loc.val[loc_end] = va * vb;
1099
1100 ++loc_end;
1101 }
1102 else {
1103 C_loc.val[loc_marker[cb]] += va * vb;
1104 }
1105 }
1106 else {
1107 ptrdiff_t cb = rem_idx[gb];
1108
1109 if (rem_marker[cb] < rem_beg) {
1110 rem_marker[cb] = rem_end;
1111
1112 C_rem.col[rem_end] = gb;
1113 C_rem.val[rem_end] = va * vb;
1114
1115 ++rem_end;
1116 }
1117 else {
1118 C_rem.val[rem_marker[cb]] += va * vb;
1119 }
1120 }
1121 }
1122 }
1123 }
1124 });
1125 ARCCORE_ALINA_TOC("compute");
1126 ARCCORE_ALINA_TOC("product");
1127
1128 return std::make_shared<DistributedMatrix<Backend>>(A.comm(), c_loc, c_rem);
1129}
1130
1131/*---------------------------------------------------------------------------*/
1132/*---------------------------------------------------------------------------*/
1133
1134template <class Backend, class T>
1135void scale(DistributedMatrix<Backend>& A, T s)
1136{
1137 using build_matrix = Backend::matrix;
1138
1139 build_matrix& A_loc = *A.local();
1140 build_matrix& A_rem = *A.remote();
1141
1142 ptrdiff_t n = A_loc.nbRow();
1143
1144 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1145 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1146 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j)
1147 A_loc.val[j] *= s;
1148 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
1149 A_rem.val[j] *= s;
1150 }
1151 });
1152}
1153
1154/*---------------------------------------------------------------------------*/
1155/*---------------------------------------------------------------------------*/
1156
1157template <class Backend>
1158void sort_rows(DistributedMatrix<Backend>& A)
1159{
1160 sort_rows(*A.local());
1161 sort_rows(*A.remote());
1162}
1163
1164/*---------------------------------------------------------------------------*/
1165/*---------------------------------------------------------------------------*/
1166
1167} // namespace Arcane::Alina
1168
1169/*---------------------------------------------------------------------------*/
1170/*---------------------------------------------------------------------------*/
1171
1172namespace Arcane::Alina::backend
1173{
1174
1175/*---------------------------------------------------------------------------*/
1176/*---------------------------------------------------------------------------*/
1177
1178template <class Backend>
1179struct rows_impl<DistributedMatrix<Backend>>
1180{
1181 static size_t get(const DistributedMatrix<Backend>& A)
1182 {
1183 return A.loc_rows();
1184 }
1185};
1186
1187template <class Backend, class Alpha, class Vec1, class Beta, class Vec2>
1188struct spmv_impl<Alpha, DistributedMatrix<Backend>, Vec1, Beta, Vec2>
1189{
1190 static void apply(Alpha alpha,
1191 const DistributedMatrix<Backend>& A,
1192 const Vec1& x, Beta beta, Vec2& y)
1193 {
1194 A.mul(alpha, x, beta, y);
1195 }
1196};
1197
1198template <class Backend, class Vec1, class Vec2, class Vec3>
1199struct residual_impl<DistributedMatrix<Backend>, Vec1, Vec2, Vec3>
1200{
1201 static void apply(const Vec1& rhs,
1202 const DistributedMatrix<Backend>& A,
1203 const Vec2& x, Vec3& r)
1204 {
1205 A.residual(rhs, x, r);
1206 }
1207};
1208
1209/*---------------------------------------------------------------------------*/
1210/*---------------------------------------------------------------------------*/
1211}
1212
1213/*---------------------------------------------------------------------------*/
1214/*---------------------------------------------------------------------------*/
1215
1216namespace Arcane::Alina
1217{
1218
1219/*---------------------------------------------------------------------------*/
1220/*---------------------------------------------------------------------------*/
1221
1222// Diagonal of the matrix
1223template <class Backend>
1224std::shared_ptr<numa_vector<typename Backend::value_type>>
1225diagonal(const DistributedMatrix<Backend>& A, bool invert = false)
1226{
1227 return diagonal(*A.local(), invert);
1228}
1229
1230/*---------------------------------------------------------------------------*/
1231/*---------------------------------------------------------------------------*/
1232
1233// Estimate spectral radius of the matrix.
1234template <bool scale, class Backend>
1235typename math::scalar_of<typename Backend::value_type>::type
1236spectral_radius(const DistributedMatrix<Backend>& A, int power_iters = 0)
1237{
1238 ARCCORE_ALINA_TIC("spectral radius");
1239 typedef typename Backend::value_type value_type;
1240 typedef typename math::rhs_of<value_type>::type rhs_type;
1241 typedef typename math::scalar_of<value_type>::type scalar_type;
1242 typedef CSRMatrix<value_type> build_matrix;
1243
1244 mpi_communicator comm = A.comm();
1245
1246 const build_matrix& A_loc = *A.local();
1247 const build_matrix& A_rem = *A.remote();
1248 const CommunicationPattern<Backend>& C = A.cpat();
1249
1250 const ptrdiff_t n = A_loc.nbRow();
1251 scalar_type radius = 0;
1252
1253 // TODO: Use the code in CSRMatrixOperation.h
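 // Two estimates are implemented: for power_iters <= 0, a row-sum
 // (Gershgorin-type) upper bound is returned; otherwise a power iteration is
 // run for power_iters steps. In both cases the rows are scaled by the inverse
 // diagonal when the 'scale' template parameter is true.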
1254 if (power_iters <= 0) {
1255 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1256 scalar_type emax = 0;
1257 value_type dia = math::identity<value_type>();
1258
1259 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1260 scalar_type s = 0;
1261
1262 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
1263 ptrdiff_t c = A_loc.col[j];
1264 value_type v = A_loc.val[j];
1265
1266 s += math::norm(v);
1267
1268 if (scale && c == i)
1269 dia = v;
1270 }
1271
1272 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
1273 s += math::norm(A_rem.val[j]);
1274
1275 if (scale)
1276 s *= math::norm(math::inverse(dia));
1277
1278 emax = std::max(emax, s);
1279 }
1280
1281 Accelerator::doAtomic<Accelerator::eAtomicOperation::Max>(&radius, emax);
1282 });
1283 }
1284 else {
1285 numa_vector<rhs_type> b0(n, false), b1(n, false);
1286 numa_vector<ptrdiff_t> rem_col(A_rem.nbNonZero(), false);
1287
1288 // Fill the initial vector with random values.
1289 // Also extract the inverted matrix diagonal values.
1290 std::atomic<scalar_type> atomic_b0_loc_norm = {};
1291
1292 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1293 const int tid = TaskFactory::currentTaskThreadIndex();
1294 const int nt = ConcurrencyBase::maxAllowedThread();
1295
1296 std::mt19937 rng(comm.size * nt + tid);
1297 std::uniform_real_distribution<scalar_type> rnd(-1, 1);
1298
1299 scalar_type t_norm = 0;
1300
1301 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1302 rhs_type v = math::constant<rhs_type>(rnd(rng));
1303
1304 b0[i] = v;
1305 t_norm += math::norm(math::inner_product(v, v));
1306
1307 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j) {
1308 rem_col[j] = C.local_index(A_rem.col[j]);
1309 }
1310 }
1311
1312 // GG: Not reproducible
1313 atomic_b0_loc_norm += t_norm;
1314 });
1315 scalar_type b0_loc_norm = atomic_b0_loc_norm;
1316 scalar_type b0_norm = comm.reduceSum(b0_loc_norm);
1317
1318 // Normalize b0
1319 b0_norm = 1 / sqrt(b0_norm);
1320 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1321 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1322 b0[i] = b0_norm * b0[i];
1323 }
1324 });
1325
1326 std::vector<rhs_type> b0_send(C.send.count());
1327 std::vector<rhs_type> b0_recv(C.recv.count());
1328
1329 for (size_t i = 0, m = C.send.count(); i < m; ++i)
1330 b0_send[i] = b0[C.send.col[i]];
1331 C.exchange(b0_send.data(), b0_recv.data());
1332
1333 for (int iter = 0; iter < power_iters;) {
1334 // b1 = (D * A) * b0
1335 // b1_norm = ||b1||
1336 // radius = <b1,b0>
1337
1338 std::atomic<scalar_type> atomic_b1_loc_norm = 0;
1339 std::atomic<scalar_type> atomic_loc_radius = 0;
1340
1341 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1342 scalar_type t_norm = 0;
1343 scalar_type t_radi = 0;
1344 value_type dia = math::identity<value_type>();
1345
1346 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1347 rhs_type s = math::zero<rhs_type>();
1348
1349 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
1350 ptrdiff_t c = A_loc.col[j];
1351 value_type v = A_loc.val[j];
1352 if (scale && c == i)
1353 dia = v;
1354 s += v * b0[c];
1355 }
1356
1357 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
1358 s += A_rem.val[j] * b0_recv[rem_col[j]];
1359
1360 if (scale)
1361 s = math::inverse(dia) * s;
1362
1363 t_norm += math::norm(math::inner_product(s, s));
1364 t_radi += math::norm(math::inner_product(s, b0[i]));
1365
1366 b1[i] = s;
1367 }
1368
1369 {
1370 // GG: Not reproducible
1371 atomic_b1_loc_norm += t_norm;
1372 atomic_loc_radius += t_radi;
1373 }
1374 });
1375 scalar_type b1_loc_norm = atomic_b1_loc_norm;
1376 scalar_type loc_radius = atomic_loc_radius;
1377
1378 radius = comm.reduceSum(loc_radius);
1379
1380 if (++iter < power_iters) {
1381 scalar_type b1_norm;
1382 b1_norm = comm.reduceSum(b1_loc_norm);
1383
1384 // b0 = b1 / b1_norm
1385 b1_norm = 1 / sqrt(b1_norm);
1386 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1387 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1388 b0[i] = b1_norm * b1[i];
1389 }
1390 });
1391
1392 for (size_t i = 0, m = C.send.count(); i < m; ++i)
1393 b0_send[i] = b0[C.send.col[i]];
1394 C.exchange(b0_send.data(), b0_recv.data());
1395 }
1396 }
1397 }
1398 ARCCORE_ALINA_TOC("spectral radius");
1399
1400 return radius < 0 ? static_cast<scalar_type>(2) : radius;
1401}
1402
1403/*---------------------------------------------------------------------------*/
1404/*---------------------------------------------------------------------------*/
1405
1406} // namespace Arcane::Alina
1407
1408/*---------------------------------------------------------------------------*/
1409/*---------------------------------------------------------------------------*/
1410
1411#endif