Arcane  v4.1.10.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
DistributedCoarsening.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* DistributedCoarsening.h (C) 2000-2026 */
9/* */
10/* Distributed coarsening algorithms. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ALINA_MPI_DISTRIBUTEDCOARSENING_H
13#define ARCCORE_ALINA_MPI_DISTRIBUTEDCOARSENING_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16/*
17 * This file is based on the work on AMGCL library (version march 2026)
18 * which can be found at https://github.com/ddemidov/amgcl.
19 *
20 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>
21 * SPDX-License-Identifier: MIT
22 */
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstddef>
#include <memory>
#include <numeric>
#include <tuple>
#include <unordered_map>
#include <utility>
#include <vector>

#include "arccore/alina/BuiltinBackend.h"
#include "arccore/alina/AlinaUtils.h"
#include "arccore/alina/Coarsening.h"
#include "arccore/alina/MessagePassingUtils.h"
#include "arccore/alina/DistributedMatrix.h"
37
38/*---------------------------------------------------------------------------*/
39/*---------------------------------------------------------------------------*/
40
41namespace Arcane::Alina
42{
43
44/*---------------------------------------------------------------------------*/
45/*---------------------------------------------------------------------------*/
49template <class Backend>
50struct DistributedPMISAggregation
51{
52 typedef typename Backend::value_type value_type;
53 typedef typename math::scalar_of<value_type>::type scalar_type;
54 typedef DistributedMatrix<Backend> matrix;
55 typedef CommunicationPattern<Backend> CommPattern;
56 using build_matrix = Backend::matrix;
57 using col_type = Backend::col_type;
58 using ptr_type = Backend::ptr_type;
60 using bool_matrix = bool_backend::matrix;
61
62 struct params
63 {
66
67 // Strong connectivity threshold
68 double eps_strong = 0.08;
69
70 // Block size for non-scalar problems.
71 Int32 block_size = 1;
72
73 params() = default;
74
75 params(const PropertyTree& p)
76 : ARCCORE_ALINA_PARAMS_IMPORT_CHILD(p, nullspace)
77 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, eps_strong)
78 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, block_size)
79 {
80 p.check_params({ "nullspace", "eps_strong", "block_size" });
81 }
82
83 void get(PropertyTree& p, const std::string& path) const
84 {
85 ARCCORE_ALINA_PARAMS_EXPORT_CHILD(p, path, nullspace);
86 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, eps_strong);
87 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, block_size);
88 }
89
90 } & prm;
91
92 std::shared_ptr<DistributedMatrix<bool_backend>> conn;
93 std::shared_ptr<matrix> p_tent;
94
95 DistributedPMISAggregation(const matrix& A, params& prm)
96 : prm(prm)
97 {
98 ptrdiff_t n = A.loc_rows();
99 std::vector<ptrdiff_t> state(n);
100 std::vector<int> owner(n);
101
102 if (prm.block_size == 1) {
103 conn = conn_strength(A, prm.eps_strong);
104
105 ptrdiff_t naggr = aggregates(*conn, state, owner);
106 p_tent = tentative_prolongation(A.comm(), n, naggr, state, owner);
107 }
108 else {
109 typedef typename math::scalar_of<value_type>::type scalar;
110 using sbackend = BuiltinBackend<scalar, col_type, ptr_type>;
111
112 ptrdiff_t np = n / prm.block_size;
113
114 assert(np * prm.block_size == n && "Matrix size should be divisible by block_size");
115
116 DistributedMatrix<sbackend> A_pw(A.comm(),
117 pointwise_matrix(*A.local(), prm.block_size),
118 pointwise_matrix(*A.remote(), prm.block_size));
119
120 auto conn_pw = conn_strength(A_pw, prm.eps_strong);
121
122 std::vector<ptrdiff_t> state_pw(np);
123 std::vector<int> owner_pw(np);
124
125 ptrdiff_t naggr = aggregates(*conn_pw, state_pw, owner_pw);
126
127 conn = std::make_shared<DistributedMatrix<bool_backend>>(A.comm(),
128 expand_conn(*A.local(), *A_pw.local(), *conn_pw->local(), prm.block_size),
129 expand_conn(*A.remote(), *A_pw.remote(), *conn_pw->remote(), prm.block_size));
130
131 arccoreParallelFor(0, np, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
132 for (ptrdiff_t ip = begin; ip < (begin + size); ++ip) {
133 ptrdiff_t i = ip * prm.block_size;
134 ptrdiff_t s = state_pw[ip];
135 int o = owner_pw[ip];
136
137 for (unsigned k = 0; k < prm.block_size; ++k) {
138 state[i + k] = (s < 0) ? s : (s * prm.block_size + k);
139 owner[i + k] = o;
140 }
141 }
142 });
143
144 p_tent = tentative_prolongation(A.comm(), n, naggr * prm.block_size, state, owner);
145 }
146 }
147
148 std::shared_ptr<DistributedMatrix<bool_backend>>
149 squared_interface(const DistributedMatrix<bool_backend>& A)
150 {
151 const CommunicationPattern<bool_backend>& C = A.cpat();
152
153 bool_matrix& A_loc = *A.local();
154 bool_matrix& A_rem = *A.remote();
155
156 ptrdiff_t A_rows = A.loc_rows();
157
158 ptrdiff_t A_beg = A.loc_col_shift();
159 ptrdiff_t A_end = A_beg + A_rows;
160
161 auto a_nbr = remote_rows(C, A, false);
162 bool_matrix& A_nbr = *a_nbr;
163
164 // Build mapping from global to local column numbers in the remote part of
165 // the square matrix.
166 std::vector<ptrdiff_t> rem_cols(A_rem.nbNonZero() + A_nbr.nbNonZero());
167
168 std::copy(A_nbr.col.data(), A_nbr.col.data() + A_nbr.nbNonZero(),
169 std::copy(A_rem.col.data(), A_rem.col.data() + A_rem.nbNonZero(), rem_cols.begin()));
170
171 std::sort(rem_cols.begin(), rem_cols.end());
172 rem_cols.erase(std::unique(rem_cols.begin(), rem_cols.end()), rem_cols.end());
173
174 ptrdiff_t n_rem_cols = 0;
175 std::unordered_map<ptrdiff_t, int> rem_idx(2 * rem_cols.size());
176 for (ptrdiff_t c : rem_cols) {
177 if (c >= A_beg && c < A_end)
178 continue;
179 rem_idx[c] = n_rem_cols++;
180 }
181
182 // Build the product.
183 auto s_loc = std::make_shared<bool_matrix>();
184 auto s_rem = std::make_shared<bool_matrix>();
185
186 bool_matrix& S_loc = *s_loc;
187 bool_matrix& S_rem = *s_rem;
188
189 S_loc.set_size(A_rows, A_rows, false);
190 S_rem.set_size(A_rows, 0, false);
191
192 S_loc.ptr[0] = 0;
193 S_rem.ptr[0] = 0;
194
195 ARCCORE_ALINA_TIC("analyze");
196 arccoreParallelFor(0, A_rows, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
197 std::vector<ptrdiff_t> loc_marker(A_rows, -1);
198 std::vector<ptrdiff_t> rem_marker(n_rem_cols, -1);
199
200 for (ptrdiff_t ia = begin; ia < (begin + size); ++ia) {
201 ptrdiff_t loc_cols = 0;
202 ptrdiff_t rem_cols = 0;
203
204 for (ptrdiff_t ja = A_rem.ptr[ia], ea = A_rem.ptr[ia + 1]; ja < ea; ++ja) {
205 ptrdiff_t ca = C.local_index(A_rem.col[ja]);
206
207 for (ptrdiff_t jb = A_nbr.ptr[ca], eb = A_nbr.ptr[ca + 1]; jb < eb; ++jb) {
208 ptrdiff_t cb = A_nbr.col[jb];
209
210 if (cb >= A_beg && cb < A_end) {
211 cb -= A_beg;
212
213 if (loc_marker[cb] != ia) {
214 loc_marker[cb] = ia;
215 ++loc_cols;
216 }
217 }
218 else {
219 cb = rem_idx[cb];
220
221 if (rem_marker[cb] != ia) {
222 rem_marker[cb] = ia;
223 ++rem_cols;
224 }
225 }
226 }
227 }
228
229 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
230 ptrdiff_t ca = A_loc.col[ja];
231
232 for (ptrdiff_t jb = A_rem.ptr[ca], eb = A_rem.ptr[ca + 1]; jb < eb; ++jb) {
233 ptrdiff_t cb = rem_idx[A_rem.col[jb]];
234
235 if (rem_marker[cb] != ia) {
236 rem_marker[cb] = ia;
237 ++rem_cols;
238 }
239 }
240 }
241
242 if (rem_cols) {
243 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
244 ptrdiff_t ca = A_loc.col[ja];
245
246 for (ptrdiff_t jb = A_loc.ptr[ca], eb = A_loc.ptr[ca + 1]; jb < eb; ++jb) {
247 ptrdiff_t cb = A_loc.col[jb];
248
249 if (loc_marker[cb] != ia) {
250 loc_marker[cb] = ia;
251 ++loc_cols;
252 }
253 }
254 }
255 }
256
257 S_rem.ptr[ia + 1] = rem_cols;
258 S_loc.ptr[ia + 1] = rem_cols ? loc_cols : 0;
259 }
260 });
261 ARCCORE_ALINA_TOC("analyze");
262
263 S_loc.set_nonzeros(S_loc.scan_row_sizes(), false);
264 S_rem.set_nonzeros(S_rem.scan_row_sizes(), false);
265
266 ARCCORE_ALINA_TIC("compute");
267 arccoreParallelFor(0, A_rows, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
268 std::vector<ptrdiff_t> loc_marker(A_rows, -1);
269 std::vector<ptrdiff_t> rem_marker(n_rem_cols, -1);
270
271 for (ptrdiff_t ia = begin; ia < (begin + size); ++ia) {
272 ptrdiff_t loc_beg = S_loc.ptr[ia];
273 ptrdiff_t rem_beg = S_rem.ptr[ia];
274 ptrdiff_t loc_end = loc_beg;
275 ptrdiff_t rem_end = rem_beg;
276
277 if (rem_beg == S_rem.ptr[ia + 1])
278 continue;
279
280 for (ptrdiff_t ja = A_loc.ptr[ia], ea = A_loc.ptr[ia + 1]; ja < ea; ++ja) {
281 ptrdiff_t ca = A_loc.col[ja];
282
283 for (ptrdiff_t jb = A_loc.ptr[ca], eb = A_loc.ptr[ca + 1]; jb < eb; ++jb) {
284 ptrdiff_t cb = A_loc.col[jb];
285
286 if (loc_marker[cb] < loc_beg) {
287 loc_marker[cb] = loc_end;
288 S_loc.col[loc_end] = cb;
289 ++loc_end;
290 }
291 }
292
293 for (ptrdiff_t jb = A_rem.ptr[ca], eb = A_rem.ptr[ca + 1]; jb < eb; ++jb) {
294 ptrdiff_t gb = A_rem.col[jb];
295 ptrdiff_t cb = rem_idx[gb];
296
297 if (rem_marker[cb] < rem_beg) {
298 rem_marker[cb] = rem_end;
299 S_rem.col[rem_end] = gb;
300 ++rem_end;
301 }
302 }
303 }
304
305 for (ptrdiff_t ja = A_rem.ptr[ia], ea = A_rem.ptr[ia + 1]; ja < ea; ++ja) {
306 ptrdiff_t ca = C.local_index(A_rem.col[ja]);
307
308 for (ptrdiff_t jb = A_nbr.ptr[ca], eb = A_nbr.ptr[ca + 1]; jb < eb; ++jb) {
309 ptrdiff_t gb = A_nbr.col[jb];
310
311 if (gb >= A_beg && gb < A_end) {
312 ptrdiff_t cb = gb - A_beg;
313
314 if (loc_marker[cb] < loc_beg) {
315 loc_marker[cb] = loc_end;
316 S_loc.col[loc_end] = cb;
317 ++loc_end;
318 }
319 }
320 else {
321 ptrdiff_t cb = rem_idx[gb];
322
323 if (rem_marker[cb] < rem_beg) {
324 rem_marker[cb] = rem_end;
325 S_rem.col[rem_end] = gb;
326 ++rem_end;
327 }
328 }
329 }
330 }
331 }
332 });
333 ARCCORE_ALINA_TOC("compute");
334
335 return std::make_shared<DistributedMatrix<bool_backend>>(A.comm(), s_loc, s_rem);
336 }
337
338 template <class B>
339 std::shared_ptr<DistributedMatrix<bool_backend>>
340 conn_strength(const DistributedMatrix<B>& A, scalar_type eps_strong)
341 {
342 typedef typename B::value_type val_type;
343 typedef CSRMatrix<val_type> B_matrix;
344
345 ARCCORE_ALINA_TIC("conn_strength");
346 ptrdiff_t n = A.loc_rows();
347
348 const B_matrix& A_loc = *A.local();
349 const B_matrix& A_rem = *A.remote();
350 const CommunicationPattern<B>& C = A.cpat();
351
352 scalar_type eps_squared = eps_strong * eps_strong;
353
354 auto d = diagonal(A_loc);
355 numa_vector<val_type>& D = *d;
356
357 std::vector<val_type> D_loc(C.send.count());
358 std::vector<val_type> D_rem(C.recv.count());
359
360 for (size_t i = 0, nv = C.send.count(); i < nv; ++i)
361 D_loc[i] = D[C.send.col[i]];
362
363 C.exchange(&D_loc[0], &D_rem[0]);
364
365 auto s_loc = std::make_shared<bool_matrix>();
366 auto s_rem = std::make_shared<bool_matrix>();
367
368 bool_matrix& S_loc = *s_loc;
369 bool_matrix& S_rem = *s_rem;
370
371 S_loc.set_size(n, n, true);
372 S_rem.set_size(n, 0, true);
373
374 S_loc.val.resize(A_loc.nbNonZero());
375 S_rem.val.resize(A_rem.nbNonZero());
376
377 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
378 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
379 val_type eps_dia_i = eps_squared * D[i];
380
381 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
382 ptrdiff_t c = A_loc.col[j];
383 val_type v = A_loc.val[j];
384
385 if ((S_loc.val[j] = (c == i || (eps_dia_i * D[c] < v * v))))
386 ++S_loc.ptr[i + 1];
387 }
388
389 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j) {
390 ptrdiff_t c = C.local_index(A_rem.col[j]);
391 val_type v = A_rem.val[j];
392
393 if ((S_rem.val[j] = (eps_dia_i * D_rem[c] < v * v)))
394 ++S_rem.ptr[i + 1];
395 }
396 }
397 });
398
399 S_loc.setNbNonZero(S_loc.scan_row_sizes());
400 S_rem.setNbNonZero(S_rem.scan_row_sizes());
401
402 S_loc.col.resize(S_loc.nbNonZero());
403 S_rem.col.resize(S_rem.nbNonZero());
404
405 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
406 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
407 ptrdiff_t loc_head = S_loc.ptr[i];
408 ptrdiff_t rem_head = S_rem.ptr[i];
409
410 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j)
411 if (S_loc.val[j])
412 S_loc.col[loc_head++] = A_loc.col[j];
413
414 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
415 if (S_rem.val[j])
416 S_rem.col[rem_head++] = A_rem.col[j];
417 }
418 });
419 ARCCORE_ALINA_TOC("conn_strength");
420
421 return std::make_shared<DistributedMatrix<bool_backend>>(A.comm(), s_loc, s_rem);
422 }
423
424 ptrdiff_t aggregates(const DistributedMatrix<bool_backend>& A,
425 std::vector<ptrdiff_t>& loc_state,
426 std::vector<int>& loc_owner)
427 {
428 ARCCORE_ALINA_TIC("PMIS");
429 static const int tag_exc_cnt = 4001;
430 static const int tag_exc_pts = 4002;
431
432 const bool_matrix& A_loc = *A.local();
433 const bool_matrix& A_rem = *A.remote();
434
435 ptrdiff_t n = A_loc.nbRow();
436
437 mpi_communicator comm = A.comm();
438
439 // 1. Get symbolic square of the connectivity matrix.
440 ARCCORE_ALINA_TIC("symbolic square");
441 auto S = squared_interface(A);
442 const bool_matrix& S_loc = *S->local();
443 const bool_matrix& S_rem = *S->remote();
444 const CommunicationPattern<bool_backend>& Sp = S->cpat();
445 ARCCORE_ALINA_TOC("symbolic square");
446
447 // 2. Apply PMIS algorithm to the symbolic square.
448 ptrdiff_t n_undone = 0;
449 std::vector<ptrdiff_t> rem_state(Sp.recv.count(), DistributedPMISAggregation::undone);
450 std::vector<int> rem_owner(Sp.recv.count(), -1);
451 std::vector<ptrdiff_t> send_state(Sp.send.count());
452 std::vector<int> send_owner(Sp.send.count());
453
454 // Remove lonely nodes.
455 std::atomic<ptrdiff_t> atomic_n_undone = 0;
456 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
457 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
458 ptrdiff_t wl = A_loc.ptr[i + 1] - A_loc.ptr[i];
459 ptrdiff_t wr = S_rem.ptr[i + 1] - S_rem.ptr[i];
460
461 if (wl + wr == 1) {
462 loc_state[i] = DistributedPMISAggregation::deleted;
463 ++atomic_n_undone;
464 }
465 else {
466 loc_state[i] = DistributedPMISAggregation::undone;
467 }
468
469 loc_owner[i] = -1;
470 }
471 });
472
473 n_undone = n - atomic_n_undone;
474
475 // Exchange state
476 for (ptrdiff_t i = 0, m = Sp.send.count(); i < m; ++i)
477 send_state[i] = loc_state[Sp.send.col[i]];
478 Sp.exchange(&send_state[0], &rem_state[0]);
479
480 std::vector<std::vector<ptrdiff_t>> send_pts(Sp.recv.nbr.size());
481 std::vector<ptrdiff_t> recv_pts;
482
483 UniqueArray<MessagePassing::Request> send_cnt_req(Sp.recv.nbr.size());
484 UniqueArray<MessagePassing::Request> send_pts_req(Sp.recv.nbr.size());
485
486 ptrdiff_t naggr = 0;
487
488 std::vector<ptrdiff_t> nbr;
489
490 while (true) {
491 for (size_t i = 0; i < Sp.recv.nbr.size(); ++i)
492 send_pts[i].clear();
493
494 if (n_undone) {
495 for (ptrdiff_t i = 0; i < n; ++i) {
496 if (loc_state[i] != DistributedPMISAggregation::undone)
497 continue;
498
499 if (S_rem.ptr[i + 1] > S_rem.ptr[i]) {
500 // Boundary points
501 bool selectable = true;
502 for (ptrdiff_t j = S_rem.ptr[i], e = S_rem.ptr[i + 1]; j < e; ++j) {
503 int d, c;
504 std::tie(d, c) = Sp.remote_info(S_rem.col[j]);
505
506 if (rem_state[c] == DistributedPMISAggregation::undone && Sp.recv.nbr[d] > comm.rank) {
507 selectable = false;
508 break;
509 }
510 }
511
512 if (!selectable)
513 continue;
514
515 ptrdiff_t id = naggr++;
516 loc_owner[i] = comm.rank;
517 loc_state[i] = id;
518 --n_undone;
519
520 // A gives immediate neighbors
521 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
522 ptrdiff_t c = A_loc.col[j];
523 if (c != i) {
524 if (loc_state[c] == DistributedPMISAggregation::undone)
525 --n_undone;
526 loc_owner[c] = comm.rank;
527 loc_state[c] = id;
528 }
529 }
530
531 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j) {
532 ptrdiff_t c = A_rem.col[j];
533 int d, k;
534 std::tie(d, k) = Sp.remote_info(c);
535
536 rem_state[k] = id;
537
538 send_pts[d].push_back(c);
539 send_pts[d].push_back(id);
540 }
541
542 // S gives removed neighbors
543 for (ptrdiff_t j = S_loc.ptr[i], e = S_loc.ptr[i + 1]; j < e; ++j) {
544 ptrdiff_t c = S_loc.col[j];
545 if (c != i && loc_state[c] == DistributedPMISAggregation::undone) {
546 loc_owner[c] = comm.rank;
547 loc_state[c] = id;
548 --n_undone;
549 }
550 }
551
552 for (ptrdiff_t j = S_rem.ptr[i], e = S_rem.ptr[i + 1]; j < e; ++j) {
553 ptrdiff_t c = S_rem.col[j];
554 int d, k;
555 std::tie(d, k) = Sp.remote_info(c);
556
557 if (rem_state[k] == DistributedPMISAggregation::undone) {
558 rem_state[k] = id;
559 send_pts[d].push_back(c);
560 send_pts[d].push_back(id);
561 }
562 }
563 }
564 else {
565 // Inner points
566 ptrdiff_t id = naggr++;
567 loc_owner[i] = comm.rank;
568 loc_state[i] = id;
569 --n_undone;
570
571 nbr.clear();
572
573 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
574 ptrdiff_t c = A_loc.col[j];
575
576 if (c != i && loc_state[c] != DistributedPMISAggregation::deleted) {
577 if (loc_state[c] == DistributedPMISAggregation::undone)
578 --n_undone;
579 loc_owner[c] = comm.rank;
580 loc_state[c] = id;
581 nbr.push_back(c);
582 }
583 }
584
585 for (ptrdiff_t k : nbr) {
586 for (ptrdiff_t j = A_loc.ptr[k], e = A_loc.ptr[k + 1]; j < e; ++j) {
587 ptrdiff_t c = A_loc.col[j];
588 if (c != k && loc_state[c] == DistributedPMISAggregation::undone) {
589 loc_owner[c] = comm.rank;
590 loc_state[c] = id;
591 --n_undone;
592 }
593 }
594 }
595 }
596 }
597 }
598
599 for (size_t i = 0; i < Sp.recv.nbr.size(); ++i) {
600 int npts = send_pts[i].size();
601 send_cnt_req[i] = comm.doISend(&npts, 1, Sp.recv.nbr[i], tag_exc_cnt);
602
603 if (!npts)
604 continue;
605 send_pts_req[i] = comm.doISend(&send_pts[i][0], npts, Sp.recv.nbr[i], tag_exc_pts);
606 }
607
608 for (size_t i = 0; i < Sp.send.nbr.size(); ++i) {
609 int npts;
610 comm.doReceive(&npts, 1, Sp.send.nbr[i], tag_exc_cnt);
611
612 if (!npts)
613 continue;
614 recv_pts.resize(npts);
615 comm.doReceive(&recv_pts[0], npts, Sp.send.nbr[i], tag_exc_pts);
616
617 for (int k = 0; k < npts; k += 2) {
618 ptrdiff_t c = recv_pts[k] - Sp.loc_col_shift();
619 ptrdiff_t id = recv_pts[k + 1];
620
621 if (loc_state[c] == DistributedPMISAggregation::undone)
622 --n_undone;
623
624 loc_owner[c] = Sp.send.nbr[i];
625 loc_state[c] = id;
626 }
627 }
628
629 for (size_t i = 0; i < Sp.recv.nbr.size(); ++i) {
630 int npts = send_pts[i].size();
631 comm.wait(send_cnt_req[i]);
632 if (npts == 0)
633 continue;
634 comm.wait(send_pts_req[i]);
635 }
636
637 for (ptrdiff_t i = 0, m = Sp.send.count(); i < m; ++i)
638 send_state[i] = loc_state[Sp.send.col[i]];
639 Sp.exchange(&send_state[0], &rem_state[0]);
640
641 if (0 == comm.reduceSum(n_undone))
642 break;
643 }
644
645 // Some of the aggregates could potentially vanish during expansion
646 // step (*) above. We need to exclude those and renumber the rest.
647 ARCCORE_ALINA_TIC("drop empty aggregates");
648 for (ptrdiff_t i = 0, m = Sp.send.count(); i < m; ++i)
649 send_owner[i] = loc_owner[Sp.send.col[i]];
650 Sp.exchange(&send_owner[0], &rem_owner[0]);
651
652 std::vector<ptrdiff_t> new_id(naggr + 1, 0);
653 for (ptrdiff_t i = 0; i < n; ++i) {
654 if (loc_owner[i] == comm.rank && loc_state[i] >= 0)
655 new_id[loc_state[i] + 1] = 1;
656 }
657
658 for (size_t i = 0; i < Sp.recv.count(); ++i) {
659 if (rem_owner[i] == comm.rank && rem_state[i] >= 0)
660 new_id[rem_state[i] + 1] = 1;
661 }
662
663 std::partial_sum(new_id.begin(), new_id.end(), new_id.begin());
664
665 if (comm.reduceSum(naggr - new_id.back()) > 0) {
666 naggr = new_id.back();
667
668 for (ptrdiff_t i = 0; i < n; ++i) {
669 if (loc_owner[i] == comm.rank && loc_state[i] >= 0) {
670 loc_state[i] = new_id[loc_state[i]];
671 }
672 }
673
674 for (size_t i = 0; i < Sp.recv.nbr.size(); ++i) {
675 send_pts[i].clear();
676 }
677
678 for (auto p = Sp.remote_begin(); p != Sp.remote_end(); ++p) {
679 ptrdiff_t c = p->first;
680
681 int d, k;
682 std::tie(d, k) = p->second;
683
684 if (rem_owner[k] == comm.rank && rem_state[k] >= 0) {
685 send_pts[d].push_back(c);
686 send_pts[d].push_back(new_id[rem_state[k]]);
687 }
688 }
689
690 for (size_t i = 0; i < Sp.recv.nbr.size(); ++i) {
691 int npts = send_pts[i].size();
692 send_cnt_req[i] = comm.doISend(&npts, 1, Sp.recv.nbr[i], tag_exc_cnt);
693
694 if (!npts)
695 continue;
696 send_pts_req[i] = comm.doISend(&send_pts[i][0], npts, Sp.recv.nbr[i], tag_exc_pts);
697 }
698
699 for (size_t i = 0; i < Sp.send.nbr.size(); ++i) {
700 int npts;
701 comm.doReceive(&npts, 1, Sp.send.nbr[i], tag_exc_cnt);
702
703 if (!npts)
704 continue;
705 recv_pts.resize(npts);
706 comm.doReceive(&recv_pts[0], npts, Sp.send.nbr[i], tag_exc_pts);
707
708 for (int k = 0; k < npts; k += 2) {
709 ptrdiff_t c = recv_pts[k] - Sp.loc_col_shift();
710 ptrdiff_t id = recv_pts[k + 1];
711
712 loc_state[c] = id;
713 }
714 }
715
716 for (size_t i = 0; i < Sp.recv.nbr.size(); ++i) {
717 int npts = send_pts[i].size();
718 comm.wait(send_cnt_req[i]);
719 if (!npts)
720 continue;
721 comm.wait(send_pts_req[i]);
722 }
723 }
724
725 ARCCORE_ALINA_TOC("drop empty aggregates");
726 ARCCORE_ALINA_TOC("PMIS");
727
728 return naggr;
729 }
730
731 std::shared_ptr<matrix>
732 tentative_prolongation(mpi_communicator comm, ptrdiff_t n, ptrdiff_t naggr,
733 std::vector<ptrdiff_t>& state, std::vector<int>& owner)
734 {
735 auto p_loc = std::make_shared<build_matrix>();
736 auto p_rem = std::make_shared<build_matrix>();
737 build_matrix& P_loc = *p_loc;
738 build_matrix& P_rem = *p_rem;
739
740 ARCCORE_ALINA_TIC("tentative prolongation");
741
742 if (int null_cols = prm.nullspace.cols) {
743 ptrdiff_t nba = naggr / prm.block_size;
744
745 std::vector<ptrdiff_t> fdom = comm.exclusive_sum(n);
746 std::vector<ptrdiff_t> cdom = comm.exclusive_sum(naggr);
747
748 std::vector<int> scounts(comm.size, 0);
749 std::vector<int> rcounts(comm.size);
750
751 // Precompute the shape of the prolongation operator.
752 // Each row contains exactly nullspace.cols non-zero entries.
753 // Rows that do not belong to any aggregate are empty.
754 P_loc.set_size(n, null_cols * nba, true);
755 P_rem.set_size(n, 0, true);
756
757 // Also count the number of local DOFs in local aggregates
758 ptrdiff_t loc_dofs = 0;
759
760 for (ptrdiff_t i = 0; i < n; ++i) {
761 if (state[i] == DistributedPMISAggregation::deleted)
762 continue;
763
764 if (owner[i] == comm.rank) {
765 P_loc.ptr[i + 1] = null_cols;
766 ++loc_dofs;
767 }
768 else {
769 P_rem.ptr[i + 1] = null_cols;
770 ++scounts[owner[i]];
771 }
772 }
773
774 // Setup the exchange
775 MPI_Request req;
776 MPI_Ialltoall(scounts.data(), 1, MPI_INT,
777 rcounts.data(), 1, MPI_INT,
778 comm, &req);
779
780 P_loc.set_nonzeros(P_loc.scan_row_sizes());
781 P_rem.set_nonzeros(P_rem.scan_row_sizes());
782
783 MPI_Wait(&req, MPI_STATUS_IGNORE);
784
785 int snbr = 0;
786 int rnbr = 0;
787 for (int i = 0; i < comm.size; ++i) {
788 if (scounts[i])
789 ++snbr;
790 if (rcounts[i])
791 ++rnbr;
792 }
793
794 std::vector<int> send_nbr;
795 send_nbr.reserve(snbr);
796 std::vector<int> recv_nbr;
797 recv_nbr.reserve(rnbr);
798 std::vector<int> send_ptr;
799 send_ptr.reserve(snbr + 1);
800 send_ptr.push_back(0);
801 std::vector<int> recv_ptr;
802 recv_ptr.reserve(rnbr + 1);
803 recv_ptr.push_back(0);
804
805 for (int i = 0; i < comm.size; ++i) {
806 if (scounts[i]) {
807 send_nbr.push_back(i);
808 send_ptr.push_back(send_ptr.back() + scounts[i]);
809 }
810 if (rcounts[i]) {
811 recv_nbr.push_back(i);
812 recv_ptr.push_back(recv_ptr.back() + rcounts[i]);
813 }
814 }
815
816 int send_dofs = send_ptr.back();
817 int recv_dofs = recv_ptr.back();
818
819 std::vector<ptrdiff_t> send_agg(send_dofs); // IDs of the aggregates we are sending
820 std::vector<ptrdiff_t> send_dof(send_dofs); // DOFs included in the aggregates
821 std::vector<double> send_row(send_dofs * null_cols); // Rows of the nullspace matrix corresponding to the DOFs
822
823 std::vector<ptrdiff_t> recv_agg(recv_dofs); // IDs of the aggregates we are receiving
824 std::vector<ptrdiff_t> recv_dof(recv_dofs); // DOFs included in the aggregates
825 std::vector<double> recv_row(recv_dofs * null_cols); // Rows of the nullspace matrix corresponding to the DOFs
826
827 // Prepare the data to send
828 std::vector<ptrdiff_t> send_rank_ptr(comm.size + 1);
829 send_rank_ptr[0] = 0;
830 std::partial_sum(scounts.begin(), scounts.end(), send_rank_ptr.begin() + 1);
831 for (ptrdiff_t i = 0; i < n; ++i) {
832 auto s = state[i];
833 auto o = owner[i];
834
835 if (s == DistributedPMISAggregation::deleted)
836 continue;
837 if (o == comm.rank)
838 continue;
839
840 auto head = send_rank_ptr[o]++;
841
842 send_agg[head] = s;
843 send_dof[head] = i + fdom[comm.rank];
844 std::copy_n(&prm.nullspace.B[i * null_cols], null_cols, &send_row[head * null_cols]);
845 }
846
847 // Exchange the data
848 UniqueArray<MessagePassing::Request> send_req(3 * snbr);
849 UniqueArray<MessagePassing::Request> recv_req(3 * rnbr);
850
851 for (int i = 0; i < rnbr; ++i) {
852 int n = recv_nbr[i];
853 int p = recv_ptr[i];
854 int w = recv_ptr[i + 1] - p;
855
856 MessagePassing::Request* req = &recv_req[3 * i];
857
858 req[0] = comm.doIReceive(&recv_agg[p], w, n, tag_exc_agg);
859 req[1] = comm.doIReceive(&recv_dof[p], w, n, tag_exc_dof);
860 req[2] = comm.doIReceive(&recv_row[null_cols * p], null_cols * w, n, tag_exc_row);
861 }
862
863 for (int i = 0; i < snbr; ++i) {
864 int n = send_nbr[i];
865 int p = send_ptr[i];
866 int w = send_ptr[i + 1] - p;
867
868 MessagePassing::Request* req = &send_req[3 * i];
869
870 req[0] = comm.doISend(&send_agg[p], w, n, tag_exc_agg);
871 req[1] = comm.doISend(&send_dof[p], w, n, tag_exc_dof);
872 req[2] = comm.doISend(&send_row[null_cols * p], null_cols * w, n, tag_exc_row);
873 }
874
875 ARCCORE_ALINA_TIC("MPI Wait");
876 comm.waitAll(recv_req);
877 comm.waitAll(send_req);
878 ARCCORE_ALINA_TOC("MPI Wait");
879
880 // Sort the fine-level points by the aggregate number.
881 // The order vector contains tuples of (aggr, dof, src, dst),
882 // where src points to a row in B, and dst points to a row in P
883 std::vector<std::tuple<ptrdiff_t, ptrdiff_t, double*, value_type*>> order;
884 order.reserve(loc_dofs + recv_dofs);
885 for (ptrdiff_t i = 0; i < n; ++i) {
886 auto s = state[i];
887 auto o = owner[i];
888
889 if (s == DistributedPMISAggregation::deleted)
890 continue;
891 if (o != comm.rank)
892 continue;
893
894 order.emplace_back(s / prm.block_size, i + fdom[comm.rank],
895 &prm.nullspace.B[i * null_cols], &P_loc.val[P_loc.ptr[i]]);
896 }
897 for (ptrdiff_t i = 0; i < recv_dofs; ++i) {
898 order.emplace_back(recv_agg[i] / prm.block_size, recv_dof[i],
899 &recv_row[i * null_cols], nullptr);
900 }
901 std::sort(order.begin(), order.end());
902
903 std::vector<ptrdiff_t> aggr_ptr(nba + 1, 0);
904 for (size_t i = 0; i < order.size(); ++i)
905 ++aggr_ptr[std::get<0>(order[i]) + 1];
906 std::partial_sum(aggr_ptr.begin(), aggr_ptr.end(), aggr_ptr.begin());
907
908 // Compute the tentative prolongation operator and null-space vectors
909 // for the coarser level.
910 std::vector<double> Bnew;
911 Bnew.resize(nba * null_cols * null_cols);
912
913 arccoreParallelFor(0, nba, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
914 Alina::detail::QRFactorization<double> qr;
915 std::vector<double> Bpart;
916
917 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
918 auto aggr_beg = aggr_ptr[i];
919 auto aggr_end = aggr_ptr[i + 1];
920 auto d = aggr_end - aggr_beg;
921
922 Bpart.resize(d * null_cols);
923
924 for (ptrdiff_t j = aggr_beg, r = 0; j < aggr_end; ++j, ++r) {
925 auto src = std::get<2>(order[j]);
926 for (int c = 0; c < null_cols; ++c)
927 Bpart[r + d * c] = src[c];
928 }
929
930 qr.factorize(d, null_cols, &Bpart[0], Alina::detail::col_major);
931
932 for (ptrdiff_t r = 0, k = i * null_cols * null_cols; r < null_cols; ++r)
933 for (int c = 0; c < null_cols; ++c, ++k)
934 Bnew[k] = qr.R(r, c);
935
936 for (ptrdiff_t j = aggr_beg, r = 0; j < aggr_end; ++j, ++r) {
937 auto src = std::get<2>(order[j]);
938 auto dst = std::get<3>(order[j]);
939
940 if (dst) {
941 // TODO: this is just a workaround to make non-scalar value
942 // types compile. Most probably this won't actually work.
943 for (int c = 0; c < null_cols; ++c)
944 dst[c] = qr.Q(r, c) * math::identity<value_type>();
945 }
946 else {
947 for (int c = 0; c < null_cols; ++c)
948 src[c] = qr.Q(r, c);
949 }
950 }
951 }
952 });
953
954 // Exchange the computed rows of the prolongation operator with the
955 // owners.
956 for (int i = 0; i < snbr; ++i) {
957 int n = send_nbr[i];
958 int p = send_ptr[i];
959 int w = send_ptr[i + 1] - p;
960 send_req[i] = comm.doIReceive(&send_row[null_cols * p], null_cols * w, n, tag_exc_row);
961 }
962
963 for (int i = 0; i < rnbr; ++i) {
964 int n = recv_nbr[i];
965 int p = recv_ptr[i];
966 int w = recv_ptr[i + 1] - p;
967 recv_req[i] = comm.doISend(&recv_row[null_cols * p], null_cols * w, n, tag_exc_row);
968 }
969
970 // Fill column numbers
971 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
972 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
973 ptrdiff_t s = state[i];
974 if (s == DistributedPMISAggregation::deleted)
975 continue;
976
977 int d = owner[i];
978 if (d == comm.rank) {
979 auto col = &P_loc.col[P_loc.ptr[i]];
980 for (int j = 0; j < null_cols; ++j) {
981 col[j] = null_cols * s / prm.block_size + j;
982 }
983 }
984 else {
985 auto col = &P_rem.col[P_rem.ptr[i]];
986 for (int j = 0; j < null_cols; ++j) {
987 col[j] = null_cols * (s + cdom[d]) / prm.block_size + j;
988 }
989 }
990 }
991 });
992
993 ARCCORE_ALINA_TIC("MPI Wait");
994 comm.waitAll(send_req);
995 comm.waitAll(recv_req);
996 ARCCORE_ALINA_TOC("MPI Wait");
997
998 // Use the P rows computed by the neighbors
999 for (ptrdiff_t k = 0; k < send_dofs; ++k) {
1000 auto i = send_dof[k] - fdom[comm.rank];
1001 auto src = &send_row[k * null_cols];
1002 auto dst = &P_rem.val[P_rem.ptr[i]];
1003
1004 for (ptrdiff_t j = 0; j < null_cols; ++j) {
1005 dst[j] = src[j] * math::identity<value_type>();
1006 }
1007 }
1008
1009 std::swap(prm.nullspace.B, Bnew);
1010 }
1011 else {
1012 std::vector<ptrdiff_t> dom = comm.exclusive_sum(naggr);
1013
1014 P_loc.set_size(n, naggr, true);
1015 P_rem.set_size(n, 0, true);
1016
1017 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1018 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1019 if (state[i] == DistributedPMISAggregation::deleted)
1020 continue;
1021
1022 if (owner[i] == comm.rank) {
1023 ++P_loc.ptr[i + 1];
1024 }
1025 else {
1026 ++P_rem.ptr[i + 1];
1027 }
1028 }
1029 });
1030
1031 P_loc.set_nonzeros(P_loc.scan_row_sizes());
1032 P_rem.set_nonzeros(P_rem.scan_row_sizes());
1033
1034 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1035 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1036 ptrdiff_t s = state[i];
1037 if (s == DistributedPMISAggregation::deleted)
1038 continue;
1039
1040 int d = owner[i];
1041 if (d == comm.rank) {
1042 P_loc.col[P_loc.ptr[i]] = s;
1043 P_loc.val[P_loc.ptr[i]] = math::identity<value_type>();
1044 }
1045 else {
1046 P_rem.col[P_rem.ptr[i]] = s + dom[d];
1047 P_rem.val[P_rem.ptr[i]] = math::identity<value_type>();
1048 }
1049 }
1050 });
1051 }
1052 ARCCORE_ALINA_TOC("tentative prolongation");
1053
1054 return std::make_shared<matrix>(comm, p_loc, p_rem);
1055 }
1056
//! Expands a point-wise (block-collapsed) connectivity matrix to the scalar system.
/*!
 * \a Ap is the matrix \a A with each block_size x block_size block collapsed
 * to a single entry, and \a Cp carries one strength-of-connection flag per
 * block entry of \a Ap. Returns a boolean matrix C whose nonzeros follow the
 * sparsity pattern of \a A, where every scalar nonzero inherits the flag of
 * the block it belongs to; only the flagged (strong) entries get a column
 * index in C.
 *
 * NOTE(review): the merge below assumes the column indices inside each row
 * of \a A are sorted, and that every scalar column of \a A falls inside some
 * block column present in the corresponding row of \a Ap (both presumably
 * guaranteed by the block compression) -- TODO confirm against the caller.
 */
template <class pw_matrix>
std::shared_ptr<bool_matrix>
expand_conn(const build_matrix& A, const pw_matrix& Ap, const bool_matrix& Cp,
            unsigned block_size) const
{
  ptrdiff_t np = Cp.nbRow(); // number of point-wise (block) rows
  ptrdiff_t n = np * block_size; // number of scalar rows

  auto c = std::make_shared<bool_matrix>();
  bool_matrix& C = *c;

  // C.val is indexed by A's nonzero positions: C.val[k] tells whether the
  // k-th nonzero of A is a strong connection.
  C.set_size(n, n, true);
  C.val.resize(A.nbNonZero());

  // Pass 1: mark every scalar nonzero with its block flag, and count the
  // strong entries per scalar row in C.ptr (shifted by one for the scan).
  arccoreParallelFor(0, np, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
    // Per-scalar-row cursors within the current block row:
    // j[k] = current read position in row ia+k of A, e[k] = end of that row.
    std::vector<ptrdiff_t> j(block_size);
    std::vector<ptrdiff_t> e(block_size);

    for (ptrdiff_t ip = begin; ip < (begin + size); ++ip) {
      ptrdiff_t ia = ip * block_size; // first scalar row of block row ip

      for (unsigned k = 0; k < block_size; ++k) {
        j[k] = A.ptr[ia + k];
        e[k] = A.ptr[ia + k + 1];
      }

      // Walk the block row; for each block column cp, consume all scalar
      // columns of A that fall before the end of that block.
      for (ptrdiff_t jp = Ap.ptr[ip], ep = Ap.ptr[ip + 1]; jp < ep; ++jp) {
        ptrdiff_t cp = Ap.col[jp];
        bool sp = Cp.val[jp]; // strength flag of the whole block

        // One-past-last scalar column belonging to block column cp.
        ptrdiff_t col_end = (cp + 1) * block_size;

        for (unsigned k = 0; k < block_size; ++k) {
          ptrdiff_t beg = j[k];
          ptrdiff_t end = e[k];

          while (beg < end && A.col[beg] < col_end) {
            C.val[beg++] = sp;

            if (sp)
              ++C.ptr[ia + k + 1];
          }

          j[k] = beg;
        }
      }
    }
  });

  // Turn the per-row counts into row offsets and allocate the columns.
  C.setNbNonZero(C.scan_row_sizes());
  C.col.resize(C.nbNonZero());

  // Pass 2: identical traversal, now writing the column indices of the
  // strong entries (h[k] is the write head of scalar row ia+k in C).
  arccoreParallelFor(0, np, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
    std::vector<ptrdiff_t> j(block_size);
    std::vector<ptrdiff_t> e(block_size);
    std::vector<ptrdiff_t> h(block_size);

    for (ptrdiff_t ip = begin; ip < (begin + size); ++ip) {
      ptrdiff_t ia = ip * block_size;

      for (unsigned k = 0; k < block_size; ++k) {
        j[k] = A.ptr[ia + k];
        e[k] = A.ptr[ia + k + 1];
        h[k] = C.ptr[ia + k];
      }

      for (ptrdiff_t jp = Ap.ptr[ip], ep = Ap.ptr[ip + 1]; jp < ep; ++jp) {
        ptrdiff_t cp = Ap.col[jp];
        bool sp = Cp.val[jp];

        ptrdiff_t col_end = (cp + 1) * block_size;

        for (unsigned k = 0; k < block_size; ++k) {
          ptrdiff_t beg = j[k];
          ptrdiff_t end = e[k];
          ptrdiff_t hed = h[k];

          while (beg < end && A.col[beg] < col_end) {
            if (sp)
              C.col[hed++] = A.col[beg];
            ++beg;
          }

          j[k] = beg;
          h[k] = hed;
        }
      }
    }
  });

  return c;
}
1149
private:

  // Aggregate state markers stored in the per-row state array:
  // 'deleted' rows are skipped when building the tentative prolongation
  // (see the state[i] == deleted checks above); 'undone' presumably marks
  // rows not yet assigned to an aggregate -- TODO confirm.
  static const int undone = -2;
  static const int deleted = -1;

  // MPI message tags used to keep the aggregate / dof / row exchanges
  // between neighbor ranks from matching each other's messages.
  static const int tag_exc_agg = 4011;
  static const int tag_exc_dof = 4012;
  static const int tag_exc_row = 4013;
1158};
1159
1160/*---------------------------------------------------------------------------*/
1161/*---------------------------------------------------------------------------*/
/*!
 * \brief Distributed non-smoothed aggregation coarsening scheme.
 *
 * The prolongation P is the tentative prolongation produced by
 * DistributedPMISAggregation and the restriction R is its transpose.
 * The coarse operator is the Galerkin product scaled by 1/over_interp.
 */
template <class Backend>
struct DistributedAggregationCoarsening
{
  typedef typename Backend::value_type value_type;
  typedef typename math::scalar_of<value_type>::type scalar_type;
  using build_matrix = Backend::matrix;

  //! Parameters of the coarsening scheme.
  struct params
  {
    // aggregation params
    typedef typename DistributedPMISAggregation<Backend>::params aggr_params;
    aggr_params aggr;

    // Over-interpolation factor: the Galerkin coarse operator is scaled
    // by 1/over_interp (see coarse_operator below).
    float over_interp = 1.5f;

    params() = default;

    //! Reads the parameters from the property tree \a p.
    params(const PropertyTree& p)
    : ARCCORE_ALINA_PARAMS_IMPORT_CHILD(p, aggr)
    , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, over_interp)
    {
      p.check_params({ "aggr", "over_interp" });
    }

    //! Exports the parameters to the property tree \a p under \a path.
    void get(Alina::PropertyTree& p, const std::string& path) const
    {
      ARCCORE_ALINA_PARAMS_EXPORT_CHILD(p, path, aggr);
      ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, over_interp);
    }
  } prm;

  DistributedAggregationCoarsening(const params& prm = params())
  : prm(prm)
  {}

  //! Returns the prolongation operator P (the tentative prolongation of the
  //! aggregation) and the restriction operator R = P^T for the matrix \a A.
  std::tuple<std::shared_ptr<DistributedMatrix<Backend>>,
             std::shared_ptr<DistributedMatrix<Backend>>>
  transfer_operators(const DistributedMatrix<Backend>& A)
  {
    DistributedPMISAggregation<Backend> aggr(A, prm.aggr);
    return std::make_tuple(aggr.p_tent, transpose(*aggr.p_tent));
  }

  //! Computes the coarse operator (1/over_interp) * R * A * P.
  std::shared_ptr<DistributedMatrix<Backend>>
  coarse_operator(const DistributedMatrix<Backend>& A,
                  const DistributedMatrix<Backend>& P,
                  const DistributedMatrix<Backend>& R) const
  {
    return detail::scaled_galerkin(A, P, R, 1 / prm.over_interp);
  }
};
1230
1231/*---------------------------------------------------------------------------*/
1232/*---------------------------------------------------------------------------*/
1233
1234template <class Backend>
1235unsigned block_size(const DistributedAggregationCoarsening<Backend>& c)
1236{
1237 return c.prm.aggr.block_size;
1238}
1239
1240/*---------------------------------------------------------------------------*/
1241/*---------------------------------------------------------------------------*/
1245template <class Backend>
1246struct DistributedSmoothedAggregationCoarsening
1247{
1248 typedef typename Backend::value_type value_type;
1249 typedef typename math::scalar_of<value_type>::type scalar_type;
1250 using build_matrix = Backend::matrix;
1251 using col_type = Backend::col_type;
1252 using ptr_type = Backend::ptr_type;
1253 using bool_backend = BuiltinBackend<char, col_type, ptr_type>;
1254 using bool_matrix = bool_backend::matrix;
1255
1256 struct params
1257 {
1258 // aggregation params
1259 typedef typename DistributedPMISAggregation<Backend>::params aggr_params;
1260 aggr_params aggr;
1261
1263 scalar_type relax = 1.0;
1264
1265 // Estimate the matrix spectral radius.
1266 // This usually improves convergence rate and results in faster solves,
1267 // but costs some time during setup.
1268 bool estimate_spectral_radius = false;
1269
1270 // Number of power iterations to apply for the spectral radius
1271 // estimation. Use Gershgorin disk theorem when power_iters = 0.
1272 int power_iters = 0;
1273
1274 params() = default;
1275
1276 params(const PropertyTree& p)
1277 : ARCCORE_ALINA_PARAMS_IMPORT_CHILD(p, aggr)
1278 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, relax)
1279 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, estimate_spectral_radius)
1280 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, power_iters)
1281 {
1282 p.check_params({ "aggr", "relax", "estimate_spectral_radius", "power_iters" });
1283 }
1284
1285 void get(PropertyTree& p, const std::string& path) const
1286 {
1287 ARCCORE_ALINA_PARAMS_EXPORT_CHILD(p, path, aggr);
1288 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, relax);
1289 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, estimate_spectral_radius);
1290 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, power_iters);
1291 }
1292 } prm;
1293
1294 DistributedSmoothedAggregationCoarsening(const params& prm = params())
1295 : prm(prm)
1296 {}
1297
1298 std::tuple<std::shared_ptr<DistributedMatrix<Backend>>,
1299 std::shared_ptr<DistributedMatrix<Backend>>>
1300 transfer_operators(const DistributedMatrix<Backend>& A)
1301 {
1302 typedef DistributedMatrix<Backend> DM;
1303 using build_matrix = Backend::matrix;
1304
1305 DistributedPMISAggregation<Backend> aggr(A, prm.aggr);
1306 prm.aggr.eps_strong *= 0.5;
1307
1308 mpi_communicator comm = A.comm();
1309 const build_matrix& A_loc = *A.local();
1310 const build_matrix& A_rem = *A.remote();
1311
1312 bool_matrix& S_loc = *aggr.conn->local();
1313 bool_matrix& S_rem = *aggr.conn->remote();
1314
1315 ARCCORE_ALINA_TIC("filtered matrix");
1316 ptrdiff_t n = A.loc_rows();
1317
1318 scalar_type omega = prm.relax;
1319 if (prm.estimate_spectral_radius) {
1320 omega *= static_cast<scalar_type>(4.0 / 3) / spectral_radius<true>(A, prm.power_iters);
1321 }
1322 else {
1323 omega *= static_cast<scalar_type>(2.0 / 3);
1324 }
1325
1326 auto af_loc = std::make_shared<build_matrix>();
1327 auto af_rem = std::make_shared<build_matrix>();
1328
1329 build_matrix& Af_loc = *af_loc;
1330 build_matrix& Af_rem = *af_rem;
1331
1332 numa_vector<value_type> Af_loc_val(S_loc.nbNonZero(), false);
1333 numa_vector<value_type> Af_rem_val(S_rem.nbNonZero(), false);
1334
1335 Af_loc.own_data = false;
1336 Af_loc.setNbRow(S_loc.nbRow());
1337 Af_loc.ncols = S_loc.ncols;
1338 Af_loc.setNbNonZero(S_loc.nbNonZero());
1339 Af_loc.ptr.setPointerZeroCopy(S_loc.ptr.data());
1340 Af_loc.col.setPointerZeroCopy(S_loc.col.data());
1341 Af_loc.val.setPointerZeroCopy(Af_loc_val.data());
1342
1343 Af_rem.own_data = false;
1344 Af_rem.setNbRow(S_rem.nbRow());
1345 Af_rem.ncols = S_rem.ncols;
1346 Af_rem.setNbNonZero(S_rem.nbNonZero());
1347 Af_rem.ptr.setPointerZeroCopy(S_rem.ptr.data());
1348 Af_rem.col.setPointerZeroCopy(S_rem.col.data());
1349 Af_rem.val.setPointerZeroCopy(Af_rem_val.data());
1350
1351 numa_vector<value_type> Df(n, false);
1352
1353 arccoreParallelFor(0, n, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
1354 for (ptrdiff_t i = begin; i < (begin + size); ++i) {
1355
1356 ptrdiff_t loc_head = Af_loc.ptr[i];
1357 ptrdiff_t rem_head = Af_rem.ptr[i];
1358
1359 value_type dia_f = math::zero<value_type>();
1360
1361 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j)
1362 if (A_loc.col[j] == i || !S_loc.val[j])
1363 dia_f += A_loc.val[j];
1364
1365 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j)
1366 if (!S_rem.val[j])
1367 dia_f += A_rem.val[j];
1368
1369 dia_f = -omega * math::inverse(dia_f);
1370
1371 for (ptrdiff_t j = A_loc.ptr[i], e = A_loc.ptr[i + 1]; j < e; ++j) {
1372 if (A_loc.col[j] == i) {
1373 Af_loc.val[loc_head++] = (1 - omega) * math::identity<value_type>();
1374 }
1375 else if (S_loc.val[j]) {
1376 Af_loc.val[loc_head++] = dia_f * A_loc.val[j];
1377 }
1378 }
1379
1380 for (ptrdiff_t j = A_rem.ptr[i], e = A_rem.ptr[i + 1]; j < e; ++j) {
1381 if (S_rem.val[j]) {
1382 Af_rem.val[rem_head++] = dia_f * A_rem.val[j];
1383 }
1384 }
1385 }
1386 });
1387
1388 auto Af = std::make_shared<DM>(comm, af_loc, af_rem);
1389 ARCCORE_ALINA_TOC("filtered matrix");
1390
1391 // 5. Smooth tentative prolongation with the filtered matrix.
1392 ARCCORE_ALINA_TIC("smoothing");
1393 auto P = product(*Af, *aggr.p_tent);
1394 ARCCORE_ALINA_TOC("smoothing");
1395
1396 return std::make_tuple(P, transpose(*P));
1397 }
1398
1399 std::shared_ptr<DistributedMatrix<Backend>>
1400 coarse_operator(const DistributedMatrix<Backend>& A,
1401 const DistributedMatrix<Backend>& P,
1402 const DistributedMatrix<Backend>& R) const
1403 {
1404 return detail::galerkin(A, P, R);
1405 }
1406};
1407
1408/*---------------------------------------------------------------------------*/
1409/*---------------------------------------------------------------------------*/
1410
1411template <class Backend>
1412unsigned block_size(const DistributedSmoothedAggregationCoarsening<Backend>& c)
1413{
1414 return c.prm.aggr.block_size;
1415}
1416
1417/*---------------------------------------------------------------------------*/
1418/*---------------------------------------------------------------------------*/
1419
1420} // namespace Arcane::Alina
1421
1422/*---------------------------------------------------------------------------*/
1423/*---------------------------------------------------------------------------*/
1424
1425#endif
Call to handle communication pattern.
Distributed Matrix using message passing.
__host__ __device__ Real3x3 transpose(const Real3x3 &t)
Transpose la matrice.
Definition MathUtils.h:258
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applique en concurrence la fonction lambda lambda_function sur l'intervalle d'itération donné par loo...
Definition ParallelFor.h:85
std::int32_t Int32
Type entier signé sur 32 bits.
Distributed non-smoothed aggregation coarsening scheme.
nullspace_params nullspace
Near nullspace parameters.
Distributed smoothed aggregation coarsening scheme.