Arcane  v4.1.10.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
ILUSolverImpl.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* ILUSolverImpl.h (C) 2000-2026 */
9/* */
10/* Solver obtained as a result of an incomplete LU factorization. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ALINA_ILUSOLVERIMPL_H
13#define ARCCORE_ALINA_ILUSOLVERIMPL_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16/*
17 * This file is based on the AMGCL library (version of March 2026)
18 * which can be found at https://github.com/ddemidov/amgcl.
19 *
20 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>
21 * SPDX-License-Identifier: MIT
22 */
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
26#include "arccore/alina/ValueTypeInterface.h"
27#include "arccore/alina/BuiltinBackend.h"
28#include "arccore/alina/HybridBuiltinBackend.h"
29#include "arccore/alina/AlinaUtils.h"
30
31/*---------------------------------------------------------------------------*/
32/*---------------------------------------------------------------------------*/
33
34namespace Arcane::Alina::Impl
35{
36
37/*---------------------------------------------------------------------------*/
38/*---------------------------------------------------------------------------*/
/*
 * \brief Generic (backend-agnostic) ILU triangular solver.
 *
 * Holds the factors of an incomplete LU factorization — strict lower part L,
 * strict upper part U and diagonal D (presumably the *inverted* diagonal,
 * since solve() multiplies by D — TODO confirm against the factorization
 * code) — and applies the preconditioner to a vector in place.
 *
 * The two triangular systems are solved only approximately, with a fixed
 * number of damped Jacobi iterations, so that the whole algorithm is
 * expressed through backend primitives (residual/axpby/vmul) and works on
 * any Backend.
 */
43template <class Backend>
44class ILUSolver
45{
46 public:

 // Backend type aliases so the solver code stays backend-agnostic.
48 typedef typename Backend::params backend_params;
49 typedef typename Backend::value_type value_type;
50 typedef typename Backend::col_type col_type;
51 typedef typename Backend::ptr_type ptr_type;
52 typedef typename Backend::matrix matrix;
53 typedef typename Backend::vector vector;
54 typedef typename Backend::matrix_diagonal matrix_diagonal;
55 typedef typename BuiltinBackend<value_type, col_type, ptr_type>::matrix build_matrix;
56 typedef typename math::scalar_of<value_type>::type scalar_type;
57
 // Solver parameters.
58 struct params
59 {
 // NOTE(review): the declaration of 'iters' (Int32 iters — number of
 // Jacobi iterations per triangular sweep) is referenced below but
 // missing from this view; presumably lost in extraction.
62
 // Damping factor applied to each Jacobi iteration.
64 double damping = 0.72;
65
66 params() = default;
67
 // Import parameters from a property tree; rejects unknown keys.
68 params(const PropertyTree& p)
69 : ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, iters)
70 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, damping)
71 {
72 p.check_params( { "iters", "damping" });
73 }
74
 // Export parameters to a property tree under 'path'.
75 void get(PropertyTree& p, const std::string& path) const
76 {
77 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, iters);
78 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, damping);
79 }
80
81 } prm;
82
83 public:
84
 // Copies the host-side factors L, U, D into backend storage and
 // allocates the two temporary vectors used by solve().
85 ILUSolver(std::shared_ptr<build_matrix> L,
86 std::shared_ptr<build_matrix> U,
87 std::shared_ptr<numa_vector<value_type>> D,
88 const params& prm = params(),
89 const backend_params& bprm = backend_params())
90 : prm(prm)
91 , L(Backend::copy_matrix(L, bprm))
92 , U(Backend::copy_matrix(U, bprm))
93 , D(Backend::copy_vector(D, bprm))
94 , t1(Backend::create_vector(backend::nbRow(*L), bprm))
95 , t2(Backend::create_vector(backend::nbRow(*L), bprm))
96 {}
97
 // Applies the preconditioner in place: an approximate lower-triangular
 // solve followed by an approximate upper-triangular solve (with the
 // diagonal D folded in), each performed as prm.iters damped Jacobi
 // iterations.
98 template <class Vector>
99 void solve(Vector& x)
100 {
101 vector* y0 = t1.get();
102 vector* y1 = t2.get();
103
 // y0 = damping * x (initial guess for the lower sweep).
104 backend::axpby(prm.damping, x, 0.0, *y0);
105 for (unsigned i = 0; i < prm.iters; ++i) {
 // y1 = x - L * y0
106 backend::residual(x, *L, *y0, *y1);
 // y0 = damping * y1 + (1 - damping) * y0
107 backend::axpby(prm.damping, *y1, (1 - prm.damping), *y0);
108 }
109
 // Upper sweep, diagonal folded in: x = damping * D .* y0.
110 backend::vmul(prm.damping, *D, *y0, 0.0, x);
111 for (unsigned i = 0; i < prm.iters; ++i) {
 // y1 = y0 - U * x
112 backend::residual(*y0, *U, x, *y1);
 // x = damping * D .* y1 + (1 - damping) * x
113 backend::vmul(prm.damping, *D, *y1, (1 - prm.damping), x);
114 }
115 }
116
 // Backend memory footprint of the stored factors and temporaries (bytes).
117 size_t bytes() const
118 {
119 return backend::bytes(*L) +
120 backend::bytes(*U) +
121 backend::bytes(*D) +
122 backend::bytes(*t1) +
123 backend::bytes(*t2);
124 }
125
126 private:

 // Backend-side copies of the factors and the two Jacobi temporaries.
128 std::shared_ptr<matrix> L;
129 std::shared_ptr<matrix> U;
130 std::shared_ptr<matrix_diagonal> D;
131 std::shared_ptr<vector> t1, t2;
132};
133
134/*---------------------------------------------------------------------------*/
135/*---------------------------------------------------------------------------*/
/*
 * \brief ILU triangular solver specialized for the CPU BuiltinBackend.
 *
 * Unlike the generic version, the triangular systems are solved exactly:
 * either with plain serial forward/backward substitution, or with a
 * multi-threaded level-scheduling solver (sptr_solve) when enough threads
 * are available (see params::serial).
 */
142template <class value_type, class col_type, class ptr_type>
143class ILUSolver<BuiltinBackend<value_type, col_type, ptr_type>>
144{
145 public:

 // NOTE(review): a 'typedef BuiltinBackend<value_type, col_type, ptr_type>
 // Backend;' is used below but its declaration (source line 147) is
 // missing from this view; presumably lost in extraction.
148 typedef typename Backend::params backend_params;
149 typedef typename Backend::matrix matrix;
150 typedef typename Backend::vector vector;
151 typedef typename Backend::matrix_diagonal matrix_diagonal;
152 typedef typename BuiltinBackend<value_type, col_type, ptr_type>::matrix build_matrix;
153 typedef typename Backend::rhs_type rhs_type;
154 typedef typename math::scalar_of<value_type>::type scalar_type;
155
 // Solver parameters.
156 struct params
157 {
 // Use the serial fallback instead of the level-scheduled parallel solver.
159 bool serial;
160
 // Default: go parallel only when at least 4 threads are allowed.
161 params()
162 : serial(ConcurrencyBase::maxAllowedThread() < 4)
163 {}
164
 // Import parameters from a property tree; rejects unknown keys.
165 params(const PropertyTree& p)
166 : ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, serial)
167 {
168 p.check_params({ "serial" });
169 }
170
 // Export parameters to a property tree under 'path'.
171 void get(PropertyTree& p, const std::string& path) const
172 {
173 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, serial);
174 }
175 } prm;
176
 // Builds either the serial or the level-scheduled representation from the
 // ILU factors: strict lower L, strict upper U, and diagonal D (inverted —
 // serial_solve() multiplies by D instead of dividing).
177 ILUSolver(std::shared_ptr<build_matrix> L,
178 std::shared_ptr<build_matrix> U,
179 std::shared_ptr<numa_vector<value_type>> D,
180 const params& prm = params(),
181 const backend_params& = backend_params())
182 : prm(prm)
183 {
184 if (prm.serial)
185 serial_init(L, U, D);
186 else
187 parallel_init(L, U, D);
188 }
189
 // In-place application of the preconditioner.
190 template <class Vector>
191 void solve(Vector& x)
192 {
193 if (prm.serial)
194 serial_solve(x);
195 else
196 parallel_solve(x);
197 }
198
 // Memory footprint (bytes) of whichever representation was built.
199 size_t bytes() const
200 {
201 size_t b = 0;
202
203 if (L)
204 b += backend::bytes(*L);
205 if (U)
206 b += backend::bytes(*U);
207 if (D)
208 b += backend::bytes(*D);
209
210 if (lower)
211 b += lower->bytes();
212 if (upper)
213 b += upper->bytes();
214
215 return b;
216 }
217
218 private:

220 // copies of the input matrices for the fallback (serial)
221 // implementation:
222 std::shared_ptr<matrix> L;
223 std::shared_ptr<matrix> U;
224 std::shared_ptr<matrix_diagonal> D;
225
 // Serial setup just keeps references to the factors; no reordering.
226 void serial_init(std::shared_ptr<build_matrix> L,
227 std::shared_ptr<build_matrix> U,
228 std::shared_ptr<matrix_diagonal> D)
229 {
230 this->L = L;
231 this->U = U;
232 this->D = D;
233 }
234
 // Exact forward substitution with the unit lower factor L, then backward
 // substitution with the upper factor U and diagonal scaling by D.
235 template <class Vector>
236 void serial_solve(Vector& x)
237 {
238 const size_t n = backend::nbRow(*L);
239
240 const matrix& L = *(this->L);
241 const matrix& U = *(this->U);
242 const matrix_diagonal& D = *(this->D);
243
 // Forward sweep: x <- (I + L)^{-1} x.
244 for (size_t i = 0; i < n; i++) {
245 for (ptrdiff_t j = L.ptr[i], e = L.ptr[i + 1]; j < e; ++j)
246 x[i] -= L.val[j] * x[L.col[j]];
247 }
248
 // Backward sweep: x[i] <- D[i] * (x[i] - sum U[i,j] x[j]),
 // with the diagonal stored inverted in D.
249 for (size_t i = n; i-- > 0;) {
250 for (ptrdiff_t j = U.ptr[i], e = U.ptr[i + 1]; j < e; ++j)
251 x[i] -= U.val[j] * x[U.col[j]];
252 x[i] = D[i] * x[i];
253 }
254 }
255
256 // OpenMP solver for sparse triangular systems.
257 // The solver uses level scheduling approach.
258 // Each level (a set of matrix rows that can be computed independently)
259 // is split into tasks, a task per thread, and the matrix data is
260 // distributed across threads to improve cache and NUMA locality.
261 template <bool lower>
262 struct sptr_solve
263 {
264 // a task is a set of rows that can be computed independently by a
265 // single thread.
266 struct task
267 {
268 ptrdiff_t beg, end; // rows to process
269
270 task(ptrdiff_t beg, ptrdiff_t end)
271 : beg(beg)
272 , end(end)
273 {}
274 };
275
276 int nthreads;
277
278 // thread-specific storage:
 // NOTE(review): the declarations of the per-thread members 'tasks',
 // 'ptr', 'col', 'val' (and 'D' for the upper solver) — source lines
 // 279-282/284 — are missing from this view; presumably lost in
 // extraction. They are initialized in the constructor below.
283 UniqueArray<UniqueArray<ptrdiff_t>> ord; // rows ordered by levels
285 Int32 m_nb_level = 0;
286
 // Builds the level schedule for triangular matrix A (direction chosen
 // by the 'lower' template flag). _D is only read by the upper solver,
 // which copies the diagonal per thread for locality.
287 template <class Matrix>
288 sptr_solve(const Matrix& A, const value_type* _D = 0)
289 : nthreads(ConcurrencyBase::maxAllowedThread())
290 , tasks(nthreads)
291 , ptr(nthreads)
292 , col(nthreads)
293 , val(nthreads)
294 , ord(nthreads)
295 {
296 ptrdiff_t n = A.nbRow();
297 ptrdiff_t nlev = 0;
298
299 UniqueArray<ptrdiff_t> level(n, 0);
300 UniqueArray<ptrdiff_t> order(n, 0);
301
302 // 1. split rows into levels.
 // Rows are visited in dependency order (forward for the lower
 // factor, backward for the upper one); a row's level is one more
 // than the deepest level among the rows it depends on.
303 ptrdiff_t beg = lower ? 0 : n - 1;
304 ptrdiff_t end = lower ? n : -1;
305 ptrdiff_t inc = lower ? 1 : -1;
306
307 for (ptrdiff_t i = beg; i != end; i += inc) {
308 ptrdiff_t l = level[i];
309
310 for (auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j)
311 l = std::max(l, level[A.col[j]] + 1);
312
313 level[i] = l;
314 nlev = std::max(nlev, l + 1);
315 }
316 m_nb_level = nlev;
317
318 // 2. reorder matrix rows.
 // Counting sort of the rows by level: start[lev] becomes the
 // offset of level 'lev' inside 'order'.
319 UniqueArray<ptrdiff_t> start(nlev + 1, 0);
320
321 for (ptrdiff_t i = 0; i < n; ++i)
322 ++start[level[i] + 1];
323
324 std::partial_sum(start.begin(), start.end(), start.begin());
325
326 for (ptrdiff_t i = 0; i < n; ++i)
327 order[start[level[i]]++] = i;
328
 // Restore 'start' (shift right by one) after it was consumed above.
329 std::rotate(start.begin(), start.end() - 1, start.end());
330 start[0] = 0;
331
332 // 3. Organize matrix rows into tasks.
333 // Each level is split into nthreads tasks.
334 UniqueArray<ptrdiff_t> thread_rows(nthreads, 0);
335 UniqueArray<ptrdiff_t> thread_cols(nthreads, 0);
336
337 // TODO: Use a grain size of 1 to use all threads
338 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
339 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
340 tasks[tid].reserve(nlev);
341
342 for (ptrdiff_t lev = 0; lev < nlev; ++lev) {
343 // split each level into tasks.
344 ptrdiff_t lev_size = start[lev + 1] - start[lev];
345 ptrdiff_t chunk_size = (lev_size + nthreads - 1) / nthreads;
346
347 ptrdiff_t beg = std::min(tid * chunk_size, lev_size);
348 ptrdiff_t end = std::min(beg + chunk_size, lev_size);
349
350 beg += start[lev];
351 end += start[lev];
352
353 tasks[tid].push_back(task(beg, end));
354
355 // count rows and nonzeros in the current task
356 thread_rows[tid] += end - beg;
357 for (ptrdiff_t i = beg; i < end; ++i) {
358 ptrdiff_t j = order[i];
359 thread_cols[tid] += A.ptr[j + 1] - A.ptr[j];
360 }
361 }
362 }
363 });
364
365 // 4. reorganize matrix data for better cache and NUMA locality.
 // Each thread copies the rows of its own tasks into thread-local
 // CRS-like arrays (ptr/col/val), so the solve phase only touches
 // memory the thread itself wrote.
366 if (!lower)
367 D.resize(nthreads);
368
369 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
370 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {

372 col[tid].reserve(thread_cols[tid]);
373 val[tid].reserve(thread_cols[tid]);
374 ord[tid].reserve(thread_rows[tid]);
375 ptr[tid].reserve(thread_rows[tid] + 1);
376 ptr[tid].push_back(0);
377
378 if (!lower)
379 D[tid].reserve(thread_rows[tid]);
380
381 for (task& t : tasks[tid]) {
382 ptrdiff_t loc_beg = ptr[tid].size() - 1;
383 ptrdiff_t loc_end = loc_beg;
384
385 for (ptrdiff_t r = t.beg; r < t.end; ++r, ++loc_end) {
386 ptrdiff_t i = order[r];
387 if (!lower)
388 D[tid].push_back(_D[i]);
389
390 ord[tid].push_back(i);
391
392 for (auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j) {
393 col[tid].push_back(A.col[j]);
394 val[tid].push_back(A.val[j]);
395 }
396
397 ptr[tid].push_back(col[tid].size());
398 }
399
 // Rebase the task to thread-local row indices for solve().
400 t.beg = loc_beg;
401 t.end = loc_end;
402 }
403 }
404 });
405 }
406
 // Triangular solve, level by level: all rows of one level are
 // independent, so each level is processed in parallel before moving
 // on to the next one.
407 template <class Vector>
408 void solve(Vector& x) const
409 {
410 const Int32 nb_level = m_nb_level;
411 for (Int32 lev = 0; lev < nb_level; ++lev) {
412 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
413 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
414 //for (ptrdiff_t tid = 0; tid < nthreads; ++tid) {
415 const task& t = tasks[tid][lev];
416 for (ptrdiff_t r = t.beg; r < t.end; ++r) {
417 ptrdiff_t i = ord[tid][r];
418 ptrdiff_t beg = ptr[tid][r];
419 ptrdiff_t end = ptr[tid][r + 1];

 // Dot product of the thread-local row with x.
421 rhs_type X = math::zero<rhs_type>();
422 for (ptrdiff_t j = beg; j < end; ++j)
423 X += val[tid][j] * x[col[tid][j]];

 // Unit diagonal for the lower factor; for the upper factor
 // the (inverted) diagonal entry is applied.
425 if (lower)
426 x[i] -= X;
427 else
428 x[i] = D[tid][r] * (x[i] - X);
429 }
430 }
431 });
432 }
433 }
434
 // Memory used by the per-thread schedule and matrix copies, in bytes.
435 size_t
436 bytes() const
437 {
438 size_t b = 0;
439
440 for (int i = 0; i < nthreads; ++i) {
441 b += sizeof(task) * tasks[i].size();
442 b += backend::bytes(ptr[i]);
443 b += backend::bytes(col[i]);
444 b += backend::bytes(val[i]);
445 b += backend::bytes(ord[i]);
446
447 if (!lower)
448 b += backend::bytes(D[i]);
449 }
450
451 return b;
452 }
453 };
454
 // Level-scheduled solvers for the parallel implementation (null when
 // the serial fallback is used).
455 std::shared_ptr<sptr_solve<true>> lower;
456 std::shared_ptr<sptr_solve<false>> upper;
457
 // Parallel setup: builds one level-scheduled solver per factor.
458 void parallel_init(std::shared_ptr<build_matrix> L,
459 std::shared_ptr<build_matrix> U,
460 std::shared_ptr<numa_vector<value_type>> D)
461 {
462 lower = std::make_shared<sptr_solve<true>>(*L, D->data());
463 upper = std::make_shared<sptr_solve<false>>(*U, D->data());
464 }
465
 // Forward solve with L, then backward solve with U (diagonal folded in).
466 template <class Vector>
467 void parallel_solve(Vector& x)
468 {
469 lower->solve(x);
470 upper->solve(x);
471 }
472};
473
474/*---------------------------------------------------------------------------*/
475/*---------------------------------------------------------------------------*/
476
/*
 * \brief ILU solver for the hybrid builtin backend.
 *
 * Simply reuses the BuiltinBackend implementation, instantiated on the
 * scalar type underlying the block value type.
 */
477template <class Block, class Col, class Ptr>
478class ILUSolver<backend::HybridBuiltinBackend<Block, Col, Ptr>>
479: public ILUSolver<BuiltinBackend<typename math::scalar_of<Block>::type, Col, Ptr>>
480{
481 typedef ILUSolver<BuiltinBackend<typename math::scalar_of<Block>::type, Col, Ptr>> Base;

483 public:

 // Inherit all of the base solver's constructors unchanged.
485 using Base::Base;
486};
487
488/*---------------------------------------------------------------------------*/
489/*---------------------------------------------------------------------------*/
490
491} // namespace Arcane::Alina::Impl
492
493/*---------------------------------------------------------------------------*/
494/*---------------------------------------------------------------------------*/
495
496#endif
Integer size() const
Nombre d'éléments du vecteur.
NUMA-aware vector container.
Definition NumaVector.h:42
void resize(Int64 s)
Change le nombre d'éléments du tableau à s.
void reserve(Int64 new_capacity)
Réserve le mémoire pour new_capacity éléments.
void push_back(ConstReferenceType val)
Ajoute l'élément val à la fin du tableau.
Informations de base pour la gestion du multi-threading.
Informations d'exécution d'une boucle.
Matrix class, to be used by user.
Vecteur 1D de données avec sémantique par valeur (style STL).
Vector class, to be used by user.
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applique en concurrence la fonction lambda lambda_function sur l'intervalle d'itération donné par loo...
Definition ParallelFor.h:85
std::int32_t Int32
Type entier signé sur 32 bits.
Int32 iters
Number of Jacobi iterations.