12#ifndef ARCCORE_ALINA_MPI_DISTRIBUTED_MATRIX_H
13#define ARCCORE_ALINA_MPI_DISTRIBUTED_MATRIX_H
26#include "arccore/alina/AlinaUtils.h"
27#include "arccore/alina/MessagePassingUtils.h"
29#include "arccore/accelerator/Atomic.h"
35#include <unordered_map>
43namespace Arcane::Alina
// Describes the MPI exchange pattern between the locally-owned column range
// and columns owned by remote ranks (ghost columns). Built from the set of
// remote column indices referenced by the local part of a distributed matrix.
// NOTE(review): this listing is an extracted source view with lines elided
// between the embedded original line numbers; comments below describe only
// what is visible here.
51template <
class Backend>
52class CommunicationPattern
// Backend-provided type aliases.
56 typedef typename Backend::value_type value_type;
57 typedef typename math::rhs_of<value_type>::type rhs_type;
58 typedef typename math::scalar_of<value_type>::type scalar_type;
59 typedef typename Backend::matrix matrix;
60 typedef typename Backend::vector vector;
61 typedef typename Backend::params backend_params;
62 typedef typename Backend::col_type col_type;
63 typedef typename Backend::ptr_type ptr_type;
// Send side: for each neighbour rank nbr[i], the local column indices in
// col[ptr[i]..ptr[i+1]) whose values are gathered into 'val' and sent.
67 std::vector<ptrdiff_t> nbr;
68 std::vector<ptr_type> ptr;
69 std::vector<col_type> col;
// Communication buffer; mutable because it is filled inside const methods.
76 mutable std::vector<rhs_type> val;
// Receive side: neighbour ranks and per-neighbour offsets into 'val'.
82 std::vector<ptrdiff_t> nbr;
83 std::vector<ptr_type> ptr;
90 mutable std::vector<rhs_type> val;
// Backend vector holding received remote values (ghost part of x).
94 std::shared_ptr<vector> x_rem;
// Constructor: from the set of referenced remote columns, determines which
// rank owns each column, exchanges per-rank counts with MPI_Alltoall, and
// builds the send/receive index structures.
98 size_t n_rem_cols,
const col_type* p_rem_cols)
100 , loc_cols(n_loc_cols)
102 ARCCORE_ALINA_TIC(
"communication pattern");
// Global column ownership ranges: exclusive prefix sum of per-rank widths.
104 std::vector<ptrdiff_t> domain = comm.exclusive_sum(n_loc_cols);
105 loc_beg = domain[comm.rank];
// Sort and deduplicate the referenced remote columns.
109 std::vector<col_type> rem_cols(p_rem_cols, p_rem_cols + n_rem_cols);
111 std::sort(rem_cols.begin(), rem_cols.end());
112 rem_cols.erase(std::unique(rem_cols.begin(), rem_cols.end()), rem_cols.end());
114 ptrdiff_t ncols = rem_cols.size();
115 ptrdiff_t rnbr = 0, snbr = 0, send_size = 0;
// Per-rank receive/send element counts.
118 std::vector<int> rcounts(comm.size, 0);
119 std::vector<int> scounts(comm.size);
// Map each global remote column to (neighbour index, position) via 'idx'.
123 idx.reserve(2 * ncols);
124 for (
int i = 0, d = 0, last = -1; i < ncols; ++i) {
// Advance 'd' to the rank that owns rem_cols[i] (columns are sorted).
125 while (rem_cols[i] >= domain[d + 1])
135 idx.insert(idx.end(), std::make_pair(rem_cols[i], std::make_tuple(rnbr - 1, i)));
138 recv.val.resize(ncols);
139 recv.req.resize(rnbr);
141 recv.nbr.reserve(rnbr);
142 recv.ptr.reserve(rnbr + 1);
143 recv.ptr.push_back(0);
145 for (
int d = 0; d < comm.size; ++d) {
147 recv.nbr.push_back(d);
148 recv.ptr.push_back(recv.ptr.back() + rcounts[d]);
// Tell every rank how many of its columns we need; learn the symmetric info.
152 MPI_Alltoall(rcounts.data(), 1, MPI_INT, scounts.data(), 1, MPI_INT, comm);
154 for (ptrdiff_t d = 0; d < comm.size; ++d) {
157 send_size += scounts[d];
161 send.col.resize(send_size);
162 send.val.resize(send_size);
163 send.req.resize(snbr);
165 send.nbr.reserve(snbr);
166 send.ptr.reserve(snbr + 1);
167 send.ptr.push_back(0);
169 for (ptrdiff_t d = 0; d < comm.size; ++d) {
171 send.nbr.push_back(d);
172 send.ptr.push_back(send.ptr.back() + scounts[d]);
// Exchange the actual column lists: receive the columns each neighbour wants
// from us (into send.col) and send the columns we want from them.
178 for (
size_t i = 0; i < send.nbr.size(); ++i)
179 send.req[i] = comm.doIReceive(&send.col[send.ptr[i]], send.ptr[i + 1] - send.ptr[i],
180 send.nbr[i], tag_exc_cols);
183 for (
size_t i = 0; i < recv.nbr.size(); ++i)
184 recv.req[i] = comm.doISend(&rem_cols[recv.ptr[i]], recv.ptr[i + 1] - recv.ptr[i],
185 recv.nbr[i], tag_exc_cols);
187 ARCCORE_ALINA_TIC(
"MPI Wait");
188 comm.waitAll(recv.req);
189 comm.waitAll(send.req);
190 ARCCORE_ALINA_TOC(
"MPI Wait");
// Post-processing of the received column list (loop body elided in this
// listing; presumably converts global ids to local indices — verify).
193 for (col_type& c : send.col)
196 ARCCORE_ALINA_TOC(
"communication pattern");
// Converting copy-constructor from a pattern built for another backend:
// index structures are copied, value buffers are only sized.
199 template <
class OtherBackend>
200 CommunicationPattern(
const CommunicationPattern<OtherBackend>& C)
204 , loc_cols(C.loc_cols)
206 send.nbr = C.send.nbr;
207 send.ptr = C.send.ptr;
208 send.col = C.send.col;
209 send.val.resize(C.send.val.size());
210 send.req.resize(C.send.req.size());
212 recv.nbr = C.recv.nbr;
213 recv.ptr = C.recv.ptr;
214 recv.val.resize(C.recv.val.size());
215 recv.req.resize(C.recv.req.size());
// Allocates the backend-side structures: the remote-value vector and the
// gather operation used to pack send.val.
218 void move_to_backend(
const backend_params& bprm = backend_params())
221 x_rem = Backend::create_vector(recv.count(), bprm);
225 gather = std::make_shared<Gather>(loc_cols, send.col, bprm);
// Owner rank (neighbour index) of a remote global column.
229 int domain(ptrdiff_t col)
const
231 return std::get<0>(idx.at(col));
// Position of a remote global column in the receive buffer.
234 int local_index(ptrdiff_t col)
const
236 return std::get<1>(idx.at(col));
// Both (owner, position) for a remote column; the lookup body is partially
// elided in this listing.
239 std::tuple<int, int> remote_info(ptrdiff_t col)
const
244 std::unordered_map<ptrdiff_t, std::tuple<int, int>>::const_iterator
250 std::unordered_map<ptrdiff_t, std::tuple<int, int>>::const_iterator
// In-place renumbering of global remote column ids to receive-buffer indices.
256 size_t renumber(
size_t n, col_type* col)
const
258 for (
size_t i = 0; i < n; ++i)
259 col[i] = std::get<1>(idx.at(col[i]));
// True when this rank actually receives remote values.
263 bool needs_remote()
const
265 return !recv.val.empty();
// Starts the non-blocking ghost exchange for a vector: posts receives,
// gathers the locally-owned entries needed by neighbours, posts sends.
268 template <
class Vector>
269 void start_exchange(
const Vector& x)
const
272 for (
size_t i = 0; i < recv.nbr.size(); ++i)
273 recv.req[i] = comm.doIReceive(&recv.val[recv.ptr[i]], recv.ptr[i + 1] - recv.ptr[i],
274 recv.nbr[i], tag_exc_vals);
277 if (!send.val.empty()) {
278 (*gather)(x, send.val);
280 for (
size_t i = 0; i < send.nbr.size(); ++i)
281 send.req[i] = comm.doISend(&send.val[send.ptr[i]], send.ptr[i + 1] - send.ptr[i],
282 send.nbr[i], tag_exc_vals);
// Completes the exchange and copies received values into the backend vector.
286 void finish_exchange()
const
288 ARCCORE_ALINA_TIC(
"MPI Wait");
289 comm.waitAll(recv.req);
290 comm.waitAll(send.req);
291 ARCCORE_ALINA_TOC(
"MPI Wait");
293 if (!recv.val.empty())
294 backend::copy(recv.val, *x_rem);
// Blocking exchange of arbitrary per-column data through the same pattern.
297 template <
typename T>
298 void exchange(
const T* send_val, T* recv_val)
const
300 for (
size_t i = 0; i < recv.nbr.size(); ++i)
301 recv.req[i] = comm.doIReceive(&recv_val[recv.ptr[i]], recv.ptr[i + 1] - recv.ptr[i],
302 recv.nbr[i], tag_exc_vals);
304 for (
size_t i = 0; i < send.nbr.size(); ++i)
305 send.req[i] = comm.doISend(
// const_cast because doISend apparently takes a mutable pointer; the buffer
// is not modified by a send. TODO confirm against the MessagePassing API.
const_cast<T*
>(&send_val[send.ptr[i]]), send.ptr[i + 1] - send.ptr[i],
306 send.nbr[i], tag_exc_vals);
308 ARCCORE_ALINA_TIC(
"MPI Wait");
309 comm.waitAll(recv.req);
310 comm.waitAll(send.req);
311 ARCCORE_ALINA_TOC(
"MPI Wait");
// First globally-numbered column owned by this rank.
319 ptrdiff_t loc_col_shift()
const
// NOTE(review): 'using Gather = Backend::gather;' lacks 'typename'; this form
// is only valid from C++20 — verify the project's language standard.
326 using Gather = Backend::gather;
// MPI message tags used by this class.
328 static const int tag_set_comm = 1001;
329 static const int tag_exc_cols = 1002;
330 static const int tag_exc_vals = 1003;
// Map: global remote column -> (neighbour index, receive-buffer position).
334 std::unordered_map<ptrdiff_t, std::tuple<int, int>> idx;
335 std::shared_ptr<Gather> gather;
340 friend class CommunicationPattern;
// Distributed sparse matrix: a local block a_loc (columns owned by this rank,
// shifted to local numbering) plus a remote block a_rem (ghost columns owned
// by other ranks), with a CommunicationPattern C for the ghost exchange.
// NOTE(review): lines are elided between the embedded original line numbers
// in this listing; comments describe only what is visible.
348template <
class Backend>
349class DistributedMatrix
// Backend-provided type aliases.
353 typedef typename Backend::value_type value_type;
354 typedef typename math::rhs_of<value_type>::type rhs_type;
355 typedef typename math::scalar_of<value_type>::type scalar_type;
356 typedef typename Backend::params backend_params;
357 typedef typename Backend::matrix matrix;
359 typedef typename Backend::matrix build_matrix;
// Constructor from pre-split local/remote parts; builds the communication
// pattern when one is not supplied.
362 std::shared_ptr<build_matrix> a_loc,
363 std::shared_ptr<build_matrix> a_rem,
364 std::shared_ptr<CommPattern> c = std::shared_ptr<CommPattern>())
372 C = std::make_shared<CommPattern>(comm, a_loc->ncols, a_rem->nbNonZero(), a_rem->col);
// Remote block width is compressed to the number of distinct received columns.
375 a_rem->ncols = C->recv.count();
377 n_loc_rows = a_loc->nbRow();
378 n_loc_cols = a_loc->ncols;
379 n_loc_nonzeros = a_loc->nbNonZero() + a_rem->nbNonZero();
// Global sizes are sums over all ranks.
381 n_glob_rows = comm.reduceSum(n_loc_rows);
382 n_glob_cols = comm.reduceSum(n_loc_cols);
383 n_glob_nonzeros = comm.reduceSum(n_loc_nonzeros);
// Converting copy-constructor from a matrix built for another backend.
387 template <
class OtherBackend>
388 DistributedMatrix(
const DistributedMatrix<OtherBackend>& A)
389 : a_loc(std::make_shared<build_matrix>(*A.local()))
390 , a_rem(std::make_shared<build_matrix>(*A.remote()))
392 C = std::make_shared<CommPattern>(A.cpat());
394 this->a_rem->ncols = C->recv.count();
396 n_loc_rows = A.loc_rows();
397 n_loc_cols = A.loc_cols();
398 n_loc_nonzeros = A.loc_nonzeros();
399 n_glob_rows = A.glob_rows();
400 n_glob_cols = A.glob_cols();
401 n_glob_nonzeros = A.glob_nonzeros();
// Constructor splitting a monolithic local matrix into local/remote parts
// based on the global column ownership ranges.
404 template <
class Matrix>
407 ptrdiff_t _n_loc_cols = -1)
408 : n_loc_rows(backend::nbRow(A))
// When the caller gives no local column count, assume a square local block.
409 , n_loc_cols(_n_loc_cols < 0 ? n_loc_rows : _n_loc_cols)
410 , n_loc_nonzeros(backend::nonzeros(A))
413 std::vector<ptrdiff_t> domain = comm.exclusive_sum(n_loc_cols);
414 ptrdiff_t loc_beg = domain[comm.rank];
415 ptrdiff_t loc_end = domain[comm.rank + 1];
417 n_glob_cols = domain.back();
418 n_glob_rows = comm.reduceSum(n_loc_rows);
419 n_glob_nonzeros = comm.reduceSum(n_loc_nonzeros);
422 a_loc = std::make_shared<build_matrix>();
423 a_rem = std::make_shared<build_matrix>();
425 build_matrix& A_loc = *a_loc;
426 build_matrix& A_rem = *a_rem;
428 A_loc.set_size(n_loc_rows, n_loc_cols,
true);
429 A_rem.set_size(n_loc_rows, 0,
true);
// First pass: count nonzeros per row for each of the two parts.
432 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
433 for (
auto a = backend::row_begin(A, i); a; ++a) {
434 ptrdiff_t c = a.col();
436 if (loc_beg <= c && c < loc_end)
444 A_loc.set_nonzeros(A_loc.scan_row_sizes());
445 A_rem.set_nonzeros(A_rem.scan_row_sizes());
// Second pass: fill columns/values; local columns are shifted by loc_beg,
// remote columns keep their global ids.
448 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
449 ptrdiff_t loc_head = A_loc.ptr[i];
450 ptrdiff_t rem_head = A_rem.ptr[i];
452 for (
auto a = backend::row_begin(A, i); a; ++a) {
453 ptrdiff_t c = a.col();
454 value_type v = a.value();
456 if (loc_beg <= c && c < loc_end) {
457 A_loc.col[loc_head] = c - loc_beg;
458 A_loc.val[loc_head] = v;
462 A_rem.col[rem_head] = c;
463 A_rem.val[rem_head] = v;
470 C = std::make_shared<CommPattern>(comm, n_loc_cols, a_rem->nbNonZero(), a_rem->col);
471 a_rem->ncols = C->recv.count();
// Underlying MPI communicator.
476 return C->mpi_comm();
// Accessors for build-side / backend-side parts and cached sizes.
479 std::shared_ptr<build_matrix> local()
const
484 std::shared_ptr<build_matrix> remote()
const
489 std::shared_ptr<matrix> local_backend()
const
494 std::shared_ptr<matrix> remote_backend()
const
499 ptrdiff_t loc_rows()
const
504 ptrdiff_t loc_cols()
const
509 ptrdiff_t loc_col_shift()
const
511 return C->loc_col_shift();
514 ptrdiff_t loc_nonzeros()
const
516 return n_loc_nonzeros;
519 ptrdiff_t glob_rows()
const
524 ptrdiff_t glob_cols()
const
529 ptrdiff_t glob_nonzeros()
const
531 return n_glob_nonzeros;
// Replaces the backend-side local block.
539 void set_local(std::shared_ptr<matrix> a)
// Moves both blocks and the communication pattern to the backend. Remote
// columns are renumbered to receive-buffer indices first; when keep_src is
// set, the build-side remote block is copied before renumbering so its
// original global column ids are preserved.
544 void move_to_backend(
const backend_params& bprm = backend_params(),
bool keep_src =
false)
546 ARCCORE_ALINA_TIC(
"move to backend");
548 A_loc = Backend::copy_matrix(a_loc, bprm);
551 if (!A_rem && a_rem && a_rem->nbNonZero() > 0) {
553 auto rem_copy = std::make_shared<build_matrix>(*a_rem);
554 C->renumber(rem_copy->nbNonZero(), rem_copy->col);
555 A_rem = Backend::copy_matrix(rem_copy, bprm);
558 C->renumber(a_rem->nbNonZero(), a_rem->col);
559 A_rem = Backend::copy_matrix(a_rem, bprm);
563 C->move_to_backend(bprm);
569 ARCCORE_ALINA_TOC(
"move to backend");
// y = alpha * A * x + beta * y, overlapping the ghost exchange with the
// local part of the product.
572 template <
class A,
class VecX,
class B,
class VecY>
573 void mul(A alpha,
const VecX& x, B beta, VecY& y)
const
575 const auto one = math::identity<scalar_type>();
577 C->start_exchange(x);
580 backend::spmv(alpha, *A_loc, x, beta, y);
583 C->finish_exchange();
585 if (C->needs_remote())
586 backend::spmv(alpha, *A_rem, *C->x_rem, one, y);
// r = f - A * x, with the same communication/computation overlap.
589 template <
class Vec1,
class Vec2,
class Vec3>
590 void residual(
const Vec1& f,
const Vec2& x, Vec3& r)
const
592 const auto one = math::identity<scalar_type>();
594 C->start_exchange(x);
595 backend::residual(f, *A_loc, x, r);
597 C->finish_exchange();
599 if (C->needs_remote())
600 backend::spmv(-one, *A_rem, *C->x_rem, one, r);
// C: exchange pattern; A_loc/A_rem: backend-side blocks; a_loc/a_rem:
// build-side blocks; remaining members cache local and global sizes.
605 std::shared_ptr<CommPattern> C;
606 std::shared_ptr<matrix> A_loc, A_rem;
607 std::shared_ptr<build_matrix> a_loc, a_rem;
609 ptrdiff_t n_loc_rows, n_glob_rows;
610 ptrdiff_t n_loc_cols, n_glob_cols;
611 ptrdiff_t n_loc_nonzeros, n_glob_nonzeros;
// Transpose of a distributed matrix: the local block is transposed directly;
// the transposed remote block is redistributed to the ranks that own the
// corresponding rows, reusing the matrix' communication pattern with the
// send/receive roles reversed.
617template <
class Backend>
618std::shared_ptr<DistributedMatrix<Backend>>
621 ARCCORE_ALINA_TIC(
"MPI Transpose");
622 typedef typename Backend::value_type value_type;
624 typedef typename Backend::matrix build_matrix;
625 typedef typename Backend::col_type col_type;
// MPI tags for the count/column/value messages exchanged below.
627 static const int tag_cnt = 2001;
628 static const int tag_col = 2002;
629 static const int tag_val = 2003;
632 const CommPattern& C = A.cpat();
634 build_matrix& A_loc = *A.local();
635 build_matrix& A_rem = *A.remote();
// Dimensions of the transposed local block.
637 ptrdiff_t nrows = A_loc.ncols;
638 ptrdiff_t ncols = A_loc.nbRow();
650 std::shared_ptr<build_matrix> t_ptr;
// Transpose A_rem with its columns renumbered to receive-buffer indices; the
// column pointer is temporarily swapped to the renumbered copy and restored
// afterwards (zero-copy pointer substitution).
652 std::vector<col_type> tmp_col(A_rem.col.data(), A_rem.col.data() + A_rem.nbNonZero());
653 C.renumber(tmp_col.size(), tmp_col.data());
655 col_type* a_rem_col = tmp_col.data();
656 col_type* a_rem_col_backup = A_rem.col.data();
657 A_rem.col.setPointerZeroCopy(a_rem_col);
661 t_ptr = transpose(A_rem);
663 A_rem.col.setPointerZeroCopy(a_rem_col_backup);
666 build_matrix& t_rem = *t_ptr;
// Shift the transposed-remote column ids into this rank's global numbering.
670 ptrdiff_t loc_beg = domain[comm.rank];
671 for (
size_t i = 0; i < t_rem.nbNonZero(); ++i)
672 t_rem.col[i] += loc_beg;
// Per-row sizes of the transposed remote part, sent to the owning ranks.
675 std::vector<ptrdiff_t> row_size(t_rem.nbRow());
676 for (
size_t i = 0; i < t_rem.nbRow(); ++i)
677 row_size[i] = t_rem.ptr[i + 1] - t_rem.ptr[i];
// Note the pattern is used in reverse: receives are indexed by C.send and
// sends by C.recv, because transposed rows travel opposite to ghost columns.
681 std::vector<ptrdiff_t> rem_ptr(C.send.count() + 1);
684 for (
size_t i = 0; i < C.send.nbr.size(); ++i) {
685 ptrdiff_t beg = C.send.ptr[i];
686 ptrdiff_t end = C.send.ptr[i + 1];
688 recv_cnt_req[i] = comm.doIReceive(&rem_ptr[beg + 1], end - beg, C.send.nbr[i], tag_cnt);
691 for (
size_t i = 0; i < C.recv.nbr.size(); ++i) {
692 ptrdiff_t beg = C.recv.ptr[i];
693 ptrdiff_t end = C.recv.ptr[i + 1];
695 send_cnt_req[i] = comm.doISend(&row_size[beg], end - beg, C.recv.nbr[i], tag_cnt);
698 ARCCORE_ALINA_TIC(
"MPI Wait");
699 comm.waitAll(recv_cnt_req);
700 ARCCORE_ALINA_TOC(
"MPI Wait");
// Turn received row sizes into row offsets.
701 std::partial_sum(rem_ptr.begin(), rem_ptr.end(), rem_ptr.begin());
704 std::vector<col_type> rem_col(rem_ptr.back());
705 std::vector<value_type> rem_val(rem_ptr.back());
// Exchange the column indices and values of the transposed remote rows.
707 for (
size_t i = 0; i < C.send.nbr.size(); ++i) {
708 ptrdiff_t rbeg = C.send.ptr[i];
709 ptrdiff_t rend = C.send.ptr[i + 1];
711 ptrdiff_t cbeg = rem_ptr[rbeg];
712 ptrdiff_t cend = rem_ptr[rend];
714 recv_col_req[i] = comm.doIReceive(&rem_col[cbeg], cend - cbeg, C.send.nbr[i], tag_col);
715 recv_val_req[i] = comm.doIReceive(&rem_val[cbeg], cend - cbeg, C.send.nbr[i], tag_val);
718 for (
size_t i = 0; i < C.recv.nbr.size(); ++i) {
719 ptrdiff_t rbeg = C.recv.ptr[i];
720 ptrdiff_t rend = C.recv.ptr[i + 1];
722 ptrdiff_t cbeg = t_rem.ptr[rbeg];
723 ptrdiff_t cend = t_rem.ptr[rend];
725 send_col_req[i] = comm.doISend(&t_rem.col[cbeg], cend - cbeg, C.recv.nbr[i], tag_col);
726 send_val_req[i] = comm.doISend(&t_rem.val[cbeg], cend - cbeg, C.recv.nbr[i], tag_val);
// Assemble the remote part of the transpose from the received rows; rows are
// addressed through C.send.col (the local rows neighbours reference).
731 auto T_ptr = std::make_shared<build_matrix>();
732 build_matrix& T_rem = *T_ptr;
733 T_rem.set_size(nrows, 0,
true);
735 for (
size_t i = 0; i < C.send.count(); ++i)
736 T_rem.ptr[1 + C.send.col[i]] += rem_ptr[i + 1] - rem_ptr[i];
738 T_rem.scan_row_sizes();
739 T_rem.set_nonzeros();
743 ARCCORE_ALINA_TIC(
"MPI Wait");
744 comm.waitAll(recv_col_req);
745 comm.waitAll(recv_val_req);
746 ARCCORE_ALINA_TOC(
"MPI Wait");
748 for (
size_t i = 0; i < C.send.count(); ++i) {
749 ptrdiff_t row = C.send.col[i];
750 ptrdiff_t head = T_rem.ptr[row];
752 for (ptrdiff_t j = rem_ptr[i]; j < rem_ptr[i + 1]; ++j, ++head) {
753 T_rem.col[head] = rem_col[j];
754 T_rem.val[head] = rem_val[j];
// Row pointers were advanced while filling; the rotate below restores the
// CSR row-pointer invariant.
757 T_rem.ptr[row] = head;
760 std::rotate(T_rem.ptr.data(), T_rem.ptr.data() + nrows, T_rem.ptr.data() + nrows + 1);
// Ensure all sends completed before local buffers go out of scope.
763 ARCCORE_ALINA_TIC(
"MPI Wait");
764 comm.waitAll(send_cnt_req);
765 comm.waitAll(send_col_req);
766 comm.waitAll(send_val_req);
767 ARCCORE_ALINA_TOC(
"MPI Wait");
769 ARCCORE_ALINA_TOC(
"MPI Transpose");
771 return std::make_shared<DistributedMatrix<Backend>>(comm, transpose(A_loc), T_ptr);
// Fetches, for every ghost column referenced through the communication
// pattern C, the full corresponding row of B from its owning rank, producing
// one local matrix of all remote rows. Values are transferred only when
// need_values is set.
777template <
class Backend>
778std::shared_ptr<typename Backend::matrix>
781 bool need_values =
true)
783 typedef typename Backend::matrix build_matrix;
// MPI tags for the row-size/column/value messages.
785 static const int tag_ptr = 3001;
786 static const int tag_col = 3002;
787 static const int tag_val = 3003;
789 ARCCORE_ALINA_TIC(
"remote_rows");
792 build_matrix& B_loc = *B.local();
793 build_matrix& B_rem = *B.remote();
794 ptrdiff_t B_beg = B.loc_col_shift();
796 size_t nrecv = C.recv.nbr.size();
797 size_t nsend = C.send.nbr.size();
// One message triple (row sizes, columns, values) per neighbour we serve.
801 UniqueArray<MessagePassing::Request> send_ptr_req(nsend);
802 UniqueArray<MessagePassing::Request> send_col_req(nsend);
803 UniqueArray<MessagePassing::Request> send_val_req(nsend);
805 std::vector<build_matrix> send_rows(nsend);
// Build and send the requested rows, merging the local and remote parts of
// each row of B; local columns are shifted back to global numbering (B_beg).
807 for (
size_t k = 0; k < nsend; ++k) {
808 ptrdiff_t beg = C.send.ptr[k];
809 ptrdiff_t end = C.send.ptr[k + 1];
811 build_matrix& m = send_rows[k];
812 m.set_size(end - beg, 0,
false);
815 for (ptrdiff_t i = 0, ii = beg; ii < end; ++i, ++ii) {
816 ptrdiff_t r = C.send.col[ii];
// Width of row r = local-part width + remote-part width.
818 ptrdiff_t w = (B_loc.ptr[r + 1] - B_loc.ptr[r]) + (B_rem.ptr[r + 1] - B_rem.ptr[r]);
825 send_ptr_req[k] = comm.doISend(m.ptr.data(), m.nbRow(), C.send.nbr[k], tag_ptr);
827 m.set_nonzeros(nnz, need_values);
829 for (ptrdiff_t i = 0, ii = beg, head = 0; ii < end; ++i, ++ii) {
830 ptrdiff_t r = C.send.col[ii];
833 for (ptrdiff_t j = B_loc.ptr[r]; j < B_loc.ptr[r + 1]; ++j) {
834 m.col[head] = B_loc.col[j] + B_beg;
837 m.val[head] = B_loc.val[j];
843 for (ptrdiff_t j = B_rem.ptr[r]; j < B_rem.ptr[r + 1]; ++j) {
844 m.col[head] = B_rem.col[j];
847 m.val[head] = B_rem.val[j];
853 send_col_req[k] = comm.doISend(m.col.data(), m.nbNonZero(), C.send.nbr[k], tag_col);
855 send_val_req[k] = comm.doISend(m.val.data(), m.nbNonZero(), C.send.nbr[k], tag_val);
// Receive side: one matrix holding all remote rows, sized C.recv.count().
859 UniqueArray<MessagePassing::Request> recv_ptr_req(nrecv);
860 UniqueArray<MessagePassing::Request> recv_col_req(nrecv);
861 UniqueArray<MessagePassing::Request> recv_val_req(nrecv);
863 auto B_nbr = std::make_shared<build_matrix>();
864 B_nbr->set_size(C.recv.count(), 0,
false);
867 for (
size_t k = 0; k < nrecv; ++k) {
868 ptrdiff_t beg = C.recv.ptr[k];
869 ptrdiff_t end = C.recv.ptr[k + 1];
871 recv_ptr_req[k] = comm.doIReceive(&B_nbr->ptr[beg + 1], end - beg, C.recv.nbr[k], tag_ptr);
874 ARCCORE_ALINA_TIC(
"MPI Wait");
875 comm.waitAll(recv_ptr_req);
876 ARCCORE_ALINA_TOC(
"MPI Wait");
// Row sizes -> offsets, then receive columns/values directly in place.
878 B_nbr->set_nonzeros(B_nbr->scan_row_sizes(), need_values);
880 for (
size_t k = 0; k < nrecv; ++k) {
881 ptrdiff_t rbeg = C.recv.ptr[k];
882 ptrdiff_t rend = C.recv.ptr[k + 1];
884 ptrdiff_t cbeg = B_nbr->ptr[rbeg];
885 ptrdiff_t cend = B_nbr->ptr[rend];
887 recv_col_req[k] = comm.doIReceive(&B_nbr->col[cbeg], cend - cbeg, C.recv.nbr[k], tag_col);
890 recv_val_req[k] = comm.doIReceive(&B_nbr->val[cbeg], cend - cbeg, C.recv.nbr[k], tag_val);
893 ARCCORE_ALINA_TIC(
"MPI Wait");
894 comm.waitAll(send_ptr_req);
895 comm.waitAll(send_col_req);
896 comm.waitAll(recv_col_req);
899 comm.waitAll(send_val_req);
900 comm.waitAll(recv_val_req);
902 ARCCORE_ALINA_TOC(
"MPI Wait");
904 ARCCORE_ALINA_TOC(
"remote_rows");
// Distributed sparse matrix-matrix product C = A * B. First fetches the
// remote rows of B referenced through A's ghost columns (remote_rows), then
// performs a two-pass local SpGEMM — pass 1 counts output columns per row,
// pass 2 fills them — producing separate local/remote result blocks.
911template <
class Backend>
912std::shared_ptr<DistributedMatrix<Backend>>
915 typedef typename Backend::value_type value_type;
// NOTE(review): 'using build_matrix = Backend::matrix;' lacks 'typename';
// valid only from C++20 — verify the project's language standard.
916 using build_matrix = Backend::matrix;
917 typedef typename Backend::col_type col_type;
918 ARCCORE_ALINA_TIC(
"product");
922 build_matrix& A_loc = *A.local();
923 build_matrix& A_rem = *A.remote();
924 build_matrix& B_loc = *B.local();
925 build_matrix& B_rem = *B.remote();
927 ptrdiff_t A_rows = A.loc_rows();
928 ptrdiff_t B_cols = B.loc_cols();
// Global column range owned by this rank in B.
930 ptrdiff_t B_beg = B.loc_col_shift();
931 ptrdiff_t B_end = B_beg + B_cols;
// Rows of B owned by other ranks but needed for the product.
933 auto b_nbr = remote_rows(Acp, B);
934 build_matrix& B_nbr = *b_nbr;
// Collect every remote (global) column id appearing in B_rem or B_nbr.
938 std::vector<col_type> rem_cols(B_rem.nbNonZero() + B_nbr.nbNonZero());
940 std::copy(B_nbr.col.data(), B_nbr.col.data() + B_nbr.nbNonZero(),
941 std::copy(B_rem.col.data(), B_rem.col.data() + B_rem.nbNonZero(), rem_cols.begin()));
943 std::sort(rem_cols.begin(), rem_cols.end());
944 rem_cols.erase(std::unique(rem_cols.begin(), rem_cols.end()), rem_cols.end());
// Number the remote result columns, skipping ids that are actually local.
946 ptrdiff_t n_rem_cols = 0;
947 std::unordered_map<ptrdiff_t, int> rem_idx(2 * rem_cols.size());
948 for (ptrdiff_t c : rem_cols) {
949 if (c >= B_beg && c < B_end)
951 rem_idx[c] = n_rem_cols++;
955 auto c_loc = std::make_shared<build_matrix>();
956 auto c_rem = std::make_shared<build_matrix>();
958 build_matrix& C_loc = *c_loc;
959 build_matrix& C_rem = *c_rem;
961 C_loc.set_size(A_rows, B_cols,
false);
962 C_rem.set_size(A_rows, 0,
false);
// Pass 1 ("analyze"): count distinct output columns per row using marker
// arrays (marker == current row index means "already counted").
967 ARCCORE_ALINA_TIC(
"analyze");
969 std::vector<ptrdiff_t> loc_marker(B_end - B_beg, -1);
970 std::vector<ptrdiff_t> rem_marker(n_rem_cols, -1);
972 for (ptrdiff_t ia = begin; ia < (begin + size); ++ia) {
973 ptrdiff_t loc_cols = 0;
974 ptrdiff_t rem_cols = 0;
// Contributions via A_loc: B_loc columns are local, B_rem columns remote.
976 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
977 ptrdiff_t ca = A_loc.col[ja];
979 for (ptrdiff_t jb = B_loc.ptr[ca], eb = B_loc.ptr[ca + 1]; jb < eb; ++jb) {
980 ptrdiff_t cb = B_loc.col[jb];
982 if (loc_marker[cb] != ia) {
988 for (ptrdiff_t jb = B_rem.ptr[ca], eb = B_rem.ptr[ca + 1]; jb < eb; ++jb) {
989 ptrdiff_t cb = rem_idx[B_rem.col[jb]];
991 if (rem_marker[cb] != ia) {
// Contributions via A_rem through the fetched remote rows of B; their
// columns may be either local or remote to this rank.
998 for (ptrdiff_t ja = A_rem.ptr[ia], ea = A_rem.ptr[ia + 1]; ja < ea; ++ja) {
999 ptrdiff_t ca = Acp.local_index(A_rem.col[ja]);
1001 for (ptrdiff_t jb = B_nbr.ptr[ca], eb = B_nbr.ptr[ca + 1]; jb < eb; ++jb) {
1002 ptrdiff_t cb = B_nbr.col[jb];
1004 if (cb >= B_beg && cb < B_end) {
1007 if (loc_marker[cb] != ia) {
1008 loc_marker[cb] = ia;
1015 if (rem_marker[cb] != ia) {
1016 rem_marker[cb] = ia;
1023 C_loc.ptr[ia + 1] = loc_cols;
1024 C_rem.ptr[ia + 1] = rem_cols;
1027 ARCCORE_ALINA_TOC(
"analyze");
1029 C_loc.set_nonzeros(C_loc.scan_row_sizes());
1030 C_rem.set_nonzeros(C_rem.scan_row_sizes());
// Pass 2 ("compute"): fill columns/values; markers now hold the position of
// a column inside the current output row so duplicates accumulate in place.
1032 ARCCORE_ALINA_TIC(
"compute");
1034 std::vector<ptrdiff_t> loc_marker(B_end - B_beg, -1);
1035 std::vector<ptrdiff_t> rem_marker(n_rem_cols, -1);
1037 for (ptrdiff_t ia = begin; ia < (begin + size); ++ia) {
1038 ptrdiff_t loc_beg = C_loc.ptr[ia];
1039 ptrdiff_t rem_beg = C_rem.ptr[ia];
1040 ptrdiff_t loc_end = loc_beg;
1041 ptrdiff_t rem_end = rem_beg;
1043 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
1044 ptrdiff_t ca = A_loc.col[ja];
1045 value_type va = A_loc.val[ja];
1047 for (ptrdiff_t jb = B_loc.ptr[ca], eb = B_loc.ptr[ca + 1]; jb < eb; ++jb) {
1048 ptrdiff_t cb = B_loc.col[jb];
1049 value_type vb = B_loc.val[jb];
// New column in this row: append; otherwise accumulate into its slot.
1051 if (loc_marker[cb] < loc_beg) {
1052 loc_marker[cb] = loc_end;
1054 C_loc.col[loc_end] = cb;
1055 C_loc.val[loc_end] = va * vb;
1060 C_loc.val[loc_marker[cb]] += va * vb;
1064 for (ptrdiff_t jb = B_rem.ptr[ca], eb = B_rem.ptr[ca + 1]; jb < eb; ++jb) {
1065 ptrdiff_t gb = B_rem.col[jb];
1066 ptrdiff_t cb = rem_idx[gb];
1067 value_type vb = B_rem.val[jb];
1069 if (rem_marker[cb] < rem_beg) {
1070 rem_marker[cb] = rem_end;
1072 C_rem.col[rem_end] = gb;
1073 C_rem.val[rem_end] = va * vb;
1078 C_rem.val[rem_marker[cb]] += va * vb;
1083 for (ptrdiff_t ja = A_rem.ptr[ia], ea = A_rem.ptr[ia + 1]; ja < ea; ++ja) {
1084 ptrdiff_t ca = Acp.local_index(A_rem.col[ja]);
1085 value_type va = A_rem.val[ja];
1087 for (ptrdiff_t jb = B_nbr.ptr[ca], eb = B_nbr.ptr[ca + 1]; jb < eb; ++jb) {
1088 ptrdiff_t gb = B_nbr.col[jb];
1089 value_type vb = B_nbr.val[jb];
// Split the fetched-row contribution between the local and remote blocks.
1091 if (gb >= B_beg && gb < B_end) {
1092 ptrdiff_t cb = gb - B_beg;
1094 if (loc_marker[cb] < loc_beg) {
1095 loc_marker[cb] = loc_end;
1097 C_loc.col[loc_end] = cb;
1098 C_loc.val[loc_end] = va * vb;
1103 C_loc.val[loc_marker[cb]] += va * vb;
1107 ptrdiff_t cb = rem_idx[gb];
1109 if (rem_marker[cb] < rem_beg) {
1110 rem_marker[cb] = rem_end;
1112 C_rem.col[rem_end] = gb;
1113 C_rem.val[rem_end] = va * vb;
1118 C_rem.val[rem_marker[cb]] += va * vb;
1125 ARCCORE_ALINA_TOC(
"compute");
1126 ARCCORE_ALINA_TOC(
"product");
1128 return std::make_shared<DistributedMatrix<Backend>>(A.comm(), c_loc, c_rem);
// Per-row operation over both blocks of a distributed matrix. The function
// signature (original lines 1135-1136) is elided in this listing; the loop
// structure visits every nonzero of A_loc and A_rem row by row — presumably
// a value-scaling helper (template parameter T) — verify against the full
// source.
1134template <
class Backend,
class T>
1137 using build_matrix = Backend::matrix;
1139 build_matrix& A_loc = *A.local();
1140 build_matrix& A_rem = *A.remote();
1142 ptrdiff_t n = A_loc.nbRow();
1145 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1146 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j)
1148 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
// Sorts the column entries within each row of both blocks of a distributed
// matrix (the function signature, original lines 1158-1159, is elided in
// this listing).
1157template <
class Backend>
1160 sort_rows(*A.local());
1161 sort_rows(*A.remote());
1172namespace Arcane::Alina::backend
// Backend adaptor specializations so DistributedMatrix can be used through
// the generic backend interface (row count, spmv, residual).
// Row count adaptor: delegates to the local row count.
1178template <
class Backend>
1183 return A.loc_rows();
// Matrix-vector product adaptor: forwards to DistributedMatrix::mul.
1187template <
class Backend,
class Alpha,
class Vec1,
class Beta,
class Vec2>
1190 static void apply(Alpha alpha,
1192 const Vec1& x, Beta beta, Vec2& y)
1194 A.mul(alpha, x, beta, y);
// Residual adaptor: forwards to DistributedMatrix::residual.
1198template <
class Backend,
class Vec1,
class Vec2,
class Vec3>
1201 static void apply(
const Vec1& rhs,
1203 const Vec2& x, Vec3& r)
1205 A.residual(rhs, x, r);
1216namespace Arcane::Alina
// Diagonal of a distributed matrix: only the local block can contain diagonal
// entries, so this delegates to the single-matrix diagonal() overload.
1223template <
class Backend>
1224std::shared_ptr<numa_vector<typename Backend::value_type>>
1227 return diagonal(*A.local(), invert);
// Estimates the spectral radius of A (of inv(diag(A)) * A when the 'scale'
// template flag is set). With power_iters <= 0 a cheap Gershgorin-style
// row-sum bound is used; otherwise a power iteration with Rayleigh-quotient
// accumulation.
1234template <
bool scale,
class Backend>
1235typename math::scalar_of<typename Backend::value_type>::type
1236spectral_radius(
const DistributedMatrix<Backend>& A,
int power_iters = 0)
1238 ARCCORE_ALINA_TIC(
"spectral radius");
1239 typedef typename Backend::value_type value_type;
1240 typedef typename math::rhs_of<value_type>::type rhs_type;
1241 typedef typename math::scalar_of<value_type>::type scalar_type;
1242 typedef CSRMatrix<value_type> build_matrix;
1244 mpi_communicator comm = A.comm();
1246 const build_matrix& A_loc = *A.local();
1247 const build_matrix& A_rem = *A.remote();
1248 const CommunicationPattern<Backend>& C = A.cpat();
1250 const ptrdiff_t n = A_loc.nbRow();
1251 scalar_type radius = 0;
// Cheap bound: max over rows of the absolute row sum (local + remote parts),
// optionally scaled by the inverse of the row's diagonal entry.
1254 if (power_iters <= 0) {
1256 scalar_type emax = 0;
1257 value_type dia = math::identity<value_type>();
1259 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1262 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
1263 ptrdiff_t c = A_loc.col[j];
1264 value_type v = A_loc.val[j];
// Remember the diagonal entry when scaling is requested.
1268 if (scale && c == i)
1272 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
1273 s += math::norm(A_rem.val[j]);
1276 s *= math::norm(math::inverse(dia));
1278 emax = std::max(emax, s);
// Power-iteration branch: start from a random vector b0, globally normalized;
// remote column ids are pre-translated to receive-buffer indices (rem_col).
1290 std::atomic<scalar_type> atomic_b0_loc_norm = {};
// Per-thread RNG seeded from (rank, thread) so streams are distinct yet
// reproducible.
1296 std::mt19937 rng(comm.size * nt + tid);
1297 std::uniform_real_distribution<scalar_type> rnd(-1, 1);
1299 scalar_type t_norm = 0;
1301 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1302 rhs_type v = math::constant<rhs_type>(rnd(rng));
1305 t_norm += math::norm(math::inner_product(v, v));
1307 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j) {
1308 rem_col[j] = C.local_index(A_rem.col[j]);
// NOTE(review): operator+= on std::atomic of a floating type requires C++20
// — verify the project's language standard.
1313 atomic_b0_loc_norm += t_norm;
1315 scalar_type b0_loc_norm = atomic_b0_loc_norm;
1316 scalar_type b0_norm = comm.reduceSum(b0_loc_norm);
// Normalize b0 over all ranks.
1319 b0_norm = 1 /
sqrt(b0_norm);
1321 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1322 b0[i] = b0_norm * b0[i];
// Ghost values of b0, exchanged through the communication pattern.
1326 std::vector<rhs_type> b0_send(C.send.count());
1327 std::vector<rhs_type> b0_recv(C.recv.count());
1329 for (
size_t i = 0, m = C.send.count(); i < m; ++i)
1330 b0_send[i] = b0[C.send.col[i]];
1331 C.exchange(b0_send.data(), b0_recv.data());
// Main power-iteration loop: apply (optionally diagonally-scaled) A to b0,
// accumulating the iterate norm and the Rayleigh quotient numerator.
1333 for (
int iter = 0; iter < power_iters;) {
1338 std::atomic<scalar_type> atomic_b1_loc_norm = 0;
1339 std::atomic<scalar_type> atomic_loc_radius = 0;
1342 scalar_type t_norm = 0;
1343 scalar_type t_radi = 0;
1344 value_type dia = math::identity<value_type>();
1346 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1347 rhs_type s = math::zero<rhs_type>();
1349 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
1350 ptrdiff_t c = A_loc.col[j];
1351 value_type v = A_loc.val[j];
1352 if (scale && c == i)
// Remote contribution uses pre-exchanged ghost values of b0.
1357 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
1358 s += A_rem.val[j] * b0_recv[rem_col[j]];
1361 s = math::inverse(dia) * s;
1363 t_norm += math::norm(math::inner_product(s, s));
1364 t_radi += math::norm(math::inner_product(s, b0[i]));
1371 atomic_b1_loc_norm += t_norm;
1372 atomic_loc_radius += t_radi;
1375 scalar_type b1_loc_norm = atomic_b1_loc_norm;
1376 scalar_type loc_radius = atomic_loc_radius;
// Global Rayleigh-quotient numerator for this iteration.
1378 radius = comm.reduceSum(loc_radius);
// Prepare the next iterate: normalize b1 into b0 and refresh ghost values.
1380 if (++iter < power_iters) {
1381 scalar_type b1_norm;
1382 b1_norm = comm.reduceSum(b1_loc_norm);
1385 b1_norm = 1 /
sqrt(b1_norm);
1387 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1388 b0[i] = b1_norm * b1[i];
1392 for (
size_t i = 0, m = C.send.count(); i < m; ++i)
1393 b0_send[i] = b0[C.send.col[i]];
1394 C.exchange(b0_send.data(), b0_recv.data());
1398 ARCCORE_ALINA_TOC(
"spectral radius");
// Fall back to a safe default if the estimate came out negative.
1400 return radius < 0 ? static_cast<scalar_type>(2) : radius;
Call to handle communication pattern.
Distributed Matrix using message passing.
NUMA-aware vector container.
static Int32 maxAllowedThread()
Nombre maximum de threads autorisés pour le multi-threading.
Informations d'exécution d'une boucle.
Matrix class, to be used by user.
static Int32 currentTaskThreadIndex()
Indice (entre 0 et nbAllowedThread()-1) du thread exécutant la tâche actuelle.
Vecteur 1D de données avec sémantique par valeur (style STL).
Vector class, to be used by user.
__host__ __device__ DataType doAtomic(DataType *ptr, ValueType value)
Applique l'opération atomique Operation à la valeur à l'adresse ptr avec la valeur value.
__host__ __device__ double sqrt(double v)
Racine carrée de v.
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applique en concurrence la fonction lambda lambda_function sur l'intervalle d'itération donné par loo...
std::int32_t Int32
Type entier signé sur 32 bits.
Implementation for residual error computation.
Implementation for function returning the number of rows in a matrix.
Implementation for matrix-vector product.
Convenience wrapper around MPI_Comm.
std::vector< T > exclusive_sum(T n) const
Exclusive sum over mpi communicator.