Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
ILUSolverImpl.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* ILUSolverImpl.h (C) 2000-2026 */
9/* */
10/* Solver for obtained as a result of an incomplete LU factorization. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ALINA_ILUSOLVERIMPL_H
13#define ARCCORE_ALINA_ILUSOLVERIMPL_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16/*
17 * This file is based on the work on AMGCL library (version march 2026)
18 * which can be found at https://github.com/ddemidov/amgcl.
19 *
20 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>
21 * SPDX-License-Identifier: MIT
22 */
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
26#include "arccore/alina/ValueTypeInterface.h"
27#include "arccore/alina/BuiltinBackend.h"
28#include "arccore/alina/HybridBuiltinBackend.h"
29#include "arccore/alina/AlinaUtils.h"
30
31/*---------------------------------------------------------------------------*/
32/*---------------------------------------------------------------------------*/
33
34namespace Arcane::Alina::Impl
35{
36
37/*---------------------------------------------------------------------------*/
38/*---------------------------------------------------------------------------*/
43template <class Backend>
44class ILUSolver
45{
46 public:
47
48 typedef typename Backend::params backend_params;
49 typedef typename Backend::value_type value_type;
50 typedef typename Backend::col_type col_type;
51 typedef typename Backend::ptr_type ptr_type;
52 typedef typename Backend::matrix matrix;
53 typedef typename Backend::vector vector;
54 typedef typename Backend::matrix_diagonal matrix_diagonal;
55 typedef typename BuiltinBackend<value_type, col_type, ptr_type>::matrix build_matrix;
56 typedef typename math::scalar_of<value_type>::type scalar_type;
57
58 struct params
59 {
62
64 double damping = 0.72;
65
66 params() = default;
67
68 params(const PropertyTree& p)
69 : ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, iters)
70 , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, damping)
71 {
72 p.check_params( { "iters", "damping" });
73 }
74
75 void get(PropertyTree& p, const std::string& path) const
76 {
77 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, iters);
78 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, damping);
79 }
80
81 } prm;
82
83 public:
84
85 ILUSolver(std::shared_ptr<build_matrix> L,
86 std::shared_ptr<build_matrix> U,
87 std::shared_ptr<numa_vector<value_type>> D,
88 const params& prm = params(),
89 const backend_params& bprm = backend_params())
90 : prm(prm)
91 , L(Backend::copy_matrix(L, bprm))
92 , U(Backend::copy_matrix(U, bprm))
93 , D(Backend::copy_vector(D, bprm))
94 , t1(Backend::create_vector(backend::nbRow(*L), bprm))
95 , t2(Backend::create_vector(backend::nbRow(*L), bprm))
96 {}
97
98 template <class Vector>
99 void solve(Vector& x)
100 {
101 vector* y0 = t1.get();
102 vector* y1 = t2.get();
103
104 backend::axpby(prm.damping, x, 0.0, *y0);
105 for (unsigned i = 0; i < prm.iters; ++i) {
106 backend::residual(x, *L, *y0, *y1);
107 backend::axpby(prm.damping, *y1, (1 - prm.damping), *y0);
108 }
109
110 backend::vmul(prm.damping, *D, *y0, 0.0, x);
111 for (unsigned i = 0; i < prm.iters; ++i) {
112 backend::residual(*y0, *U, x, *y1);
113 backend::vmul(prm.damping, *D, *y1, (1 - prm.damping), x);
114 }
115 }
116
117 size_t bytes() const
118 {
119 return backend::bytes(*L) +
120 backend::bytes(*U) +
121 backend::bytes(*D) +
122 backend::bytes(*t1) +
123 backend::bytes(*t2);
124 }
125
126 private:
127
128 std::shared_ptr<matrix> L;
129 std::shared_ptr<matrix> U;
130 std::shared_ptr<matrix_diagonal> D;
131 std::shared_ptr<vector> t1, t2;
132};
133
134/*---------------------------------------------------------------------------*/
135/*---------------------------------------------------------------------------*/
142template <class value_type, class col_type, class ptr_type>
143class ILUSolver<BuiltinBackend<value_type, col_type, ptr_type>>
144{
145 public:
146
148 typedef typename Backend::params backend_params;
149 typedef typename Backend::matrix matrix;
150 typedef typename Backend::vector vector;
151 typedef typename Backend::matrix_diagonal matrix_diagonal;
152 typedef typename BuiltinBackend<value_type, col_type, ptr_type>::matrix build_matrix;
153 typedef typename Backend::rhs_type rhs_type;
154 typedef typename math::scalar_of<value_type>::type scalar_type;
155
156 struct params
157 {
159 bool serial;
160
161 params()
162 : serial(ConcurrencyBase::maxAllowedThread() < 4)
163 {}
164
165 params(const PropertyTree& p)
166 : ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, serial)
167 {
168 p.check_params({ "serial" });
169 }
170
171 void get(PropertyTree& p, const std::string& path) const
172 {
173 ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, serial);
174 }
175 } prm;
176
177 ILUSolver(std::shared_ptr<build_matrix> L,
178 std::shared_ptr<build_matrix> U,
179 std::shared_ptr<numa_vector<value_type>> D,
180 const params& prm = params(),
181 const backend_params& = backend_params())
182 : prm(prm)
183 {
184 if (prm.serial)
185 serial_init(L, U, D);
186 else
187 parallel_init(L, U, D);
188 }
189
190 template <class Vector>
191 void solve(Vector& x)
192 {
193 if (prm.serial)
194 serial_solve(x);
195 else
196 parallel_solve(x);
197 }
198
199 size_t bytes() const
200 {
201 size_t b = 0;
202
203 if (L)
204 b += backend::bytes(*L);
205 if (U)
206 b += backend::bytes(*U);
207 if (D)
208 b += backend::bytes(*D);
209
210 if (lower)
211 b += lower->bytes();
212 if (upper)
213 b += upper->bytes();
214
215 return b;
216 }
217
218 private:
219
220 // copies of the input matrices for the fallback (serial)
221 // implementation:
222 std::shared_ptr<matrix> L;
223 std::shared_ptr<matrix> U;
224 std::shared_ptr<matrix_diagonal> D;
225
226 void serial_init(std::shared_ptr<build_matrix> L,
227 std::shared_ptr<build_matrix> U,
228 std::shared_ptr<matrix_diagonal> D)
229 {
230 this->L = L;
231 this->U = U;
232 this->D = D;
233 }
234
235 template <class Vector>
236 void serial_solve(Vector& x)
237 {
238 const size_t n = backend::nbRow(*L);
239
240 const matrix& L = *(this->L);
241 const matrix& U = *(this->U);
242 const matrix_diagonal& D = *(this->D);
243
244 for (size_t i = 0; i < n; i++) {
245 for (ptrdiff_t j = L.ptr[i], e = L.ptr[i + 1]; j < e; ++j)
246 x[i] -= L.val[j] * x[L.col[j]];
247 }
248
249 for (size_t i = n; i-- > 0;) {
250 for (ptrdiff_t j = U.ptr[i], e = U.ptr[i + 1]; j < e; ++j)
251 x[i] -= U.val[j] * x[U.col[j]];
252 x[i] = D[i] * x[i];
253 }
254 }
255
256 // OpenMP solver for sparse triangular systems.
257 // The solver uses level scheduling approach.
258 // Each level (a set of matrix rows that can be computed independently)
259 // is split into tasks, a task per thread, and the matrix data is
260 // distributed across threads to improve cache and NUMA locality.
261 template <bool lower>
262 struct sptr_solve
263 {
264 // a task is a set of rows that can be computed independently by a
265 // single thread.
266 struct task
267 {
268 ptrdiff_t beg, end; // rows to process
269
270 task(ptrdiff_t beg, ptrdiff_t end)
271 : beg(beg)
272 , end(end)
273 {}
274 };
275
276 int nthreads;
277
278 // thread-specific storage:
283 UniqueArray<UniqueArray<ptrdiff_t>> ord; // rows ordered by levels
285 Int32 m_nb_level = 0;
286
287 template <class Matrix>
288 sptr_solve(const Matrix& A, const value_type* _D = 0)
289 : nthreads(ConcurrencyBase::maxAllowedThread())
290 , tasks(nthreads)
291 , ptr(nthreads)
292 , col(nthreads)
293 , val(nthreads)
294 , ord(nthreads)
295 {
296 ptrdiff_t n = A.nbRow();
297 ptrdiff_t nlev = 0;
298
299 UniqueArray<ptrdiff_t> level(n, 0);
300 UniqueArray<ptrdiff_t> order(n, 0);
301
302 // 1. split rows into levels.
303 ptrdiff_t beg = lower ? 0 : n - 1;
304 ptrdiff_t end = lower ? n : -1;
305 ptrdiff_t inc = lower ? 1 : -1;
306
307 for (ptrdiff_t i = beg; i != end; i += inc) {
308 ptrdiff_t l = level[i];
309
310 for (auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j)
311 l = std::max(l, level[A.col[j]] + 1);
312
313 level[i] = l;
314 nlev = std::max(nlev, l + 1);
315 }
316 m_nb_level = nlev;
317
318 // 2. reorder matrix rows.
319 UniqueArray<ptrdiff_t> start(nlev + 1, 0);
320
321 for (ptrdiff_t i = 0; i < n; ++i)
322 ++start[level[i] + 1];
323
324 std::partial_sum(start.begin(), start.end(), start.begin());
325
326 for (ptrdiff_t i = 0; i < n; ++i)
327 order[start[level[i]]++] = i;
328
329 std::rotate(start.begin(), start.end() - 1, start.end());
330 start[0] = 0;
331
332 // 3. Organize matrix rows into tasks.
333 // Each level is split into nthreads tasks.
334 UniqueArray<ptrdiff_t> thread_rows(nthreads, 0);
335 UniqueArray<ptrdiff_t> thread_cols(nthreads, 0);
336
337 // TODO: Use a grain size of 1 to use all threads
338 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
339 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
340 tasks[tid].reserve(nlev);
341
342 for (ptrdiff_t lev = 0; lev < nlev; ++lev) {
343 // split each level into tasks.
344 ptrdiff_t lev_size = start[lev + 1] - start[lev];
345 ptrdiff_t chunk_size = (lev_size + nthreads - 1) / nthreads;
346
347 ptrdiff_t beg = std::min(tid * chunk_size, lev_size);
348 ptrdiff_t end = std::min(beg + chunk_size, lev_size);
349
350 beg += start[lev];
351 end += start[lev];
352
353 tasks[tid].push_back(task(beg, end));
354
355 // count rows and nonzeros in the current task
356 thread_rows[tid] += end - beg;
357 for (ptrdiff_t i = beg; i < end; ++i) {
358 ptrdiff_t j = order[i];
359 thread_cols[tid] += A.ptr[j + 1] - A.ptr[j];
360 }
361 }
362 }
363 });
364
365 // 4. reorganize matrix data for better cache and NUMA locality.
366 if (!lower)
367 D.resize(nthreads);
368
369 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
370 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
371
372 col[tid].reserve(thread_cols[tid]);
373 val[tid].reserve(thread_cols[tid]);
374 ord[tid].reserve(thread_rows[tid]);
375 ptr[tid].reserve(thread_rows[tid] + 1);
376 ptr[tid].push_back(0);
377
378 if (!lower)
379 D[tid].reserve(thread_rows[tid]);
380
381 for (task& t : tasks[tid]) {
382 ptrdiff_t loc_beg = ptr[tid].size() - 1;
383 ptrdiff_t loc_end = loc_beg;
384
385 for (ptrdiff_t r = t.beg; r < t.end; ++r, ++loc_end) {
386 ptrdiff_t i = order[r];
387 if (!lower)
388 D[tid].push_back(_D[i]);
389
390 ord[tid].push_back(i);
391
392 for (auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j) {
393 col[tid].push_back(A.col[j]);
394 val[tid].push_back(A.val[j]);
395 }
396
397 ptr[tid].push_back(col[tid].size());
398 }
399
400 t.beg = loc_beg;
401 t.end = loc_end;
402 }
403 }
404 });
405 }
406
407 template <class Vector>
408 void solve(Vector& x) const
409 {
410 const Int32 nb_level = m_nb_level;
411 for (Int32 lev = 0; lev < nb_level; ++lev) {
412 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
413 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
414 //for (ptrdiff_t tid = 0; tid < nthreads; ++tid) {
415 const task& t = tasks[tid][lev];
416 for (ptrdiff_t r = t.beg; r < t.end; ++r) {
417 ptrdiff_t i = ord[tid][r];
418 ptrdiff_t beg = ptr[tid][r];
419 ptrdiff_t end = ptr[tid][r + 1];
420
421 rhs_type X = math::zero<rhs_type>();
422 for (ptrdiff_t j = beg; j < end; ++j)
423 X += val[tid][j] * x[col[tid][j]];
424
425 if (lower)
426 x[i] -= X;
427 else
428 x[i] = D[tid][r] * (x[i] - X);
429 }
430 }
431 });
432 }
433 }
434
435 size_t
436 bytes() const
437 {
438 size_t b = 0;
439
440 for (int i = 0; i < nthreads; ++i) {
441 b += sizeof(task) * tasks[i].size();
442 b += backend::bytes(ptr[i]);
443 b += backend::bytes(col[i]);
444 b += backend::bytes(val[i]);
445 b += backend::bytes(ord[i]);
446
447 if (!lower)
448 b += backend::bytes(D[i]);
449 }
450
451 return b;
452 }
453 };
454
455 std::shared_ptr<sptr_solve<true>> lower;
456 std::shared_ptr<sptr_solve<false>> upper;
457
458 void parallel_init(std::shared_ptr<build_matrix> L,
459 std::shared_ptr<build_matrix> U,
460 std::shared_ptr<numa_vector<value_type>> D)
461 {
462 lower = std::make_shared<sptr_solve<true>>(*L, D->data());
463 upper = std::make_shared<sptr_solve<false>>(*U, D->data());
464 }
465
466 template <class Vector>
467 void parallel_solve(Vector& x)
468 {
469 lower->solve(x);
470 upper->solve(x);
471 }
472};
473
474/*---------------------------------------------------------------------------*/
475/*---------------------------------------------------------------------------*/
476
477template <class Block, class Col, class Ptr>
478class ILUSolver<backend::HybridBuiltinBackend<Block, Col, Ptr>>
479: public ILUSolver<BuiltinBackend<typename math::scalar_of<Block>::type, Col, Ptr>>
480{
481 typedef ILUSolver<BuiltinBackend<typename math::scalar_of<Block>::type, Col, Ptr>> Base;
482
483 public:
484
485 using Base::Base;
486};
487
488/*---------------------------------------------------------------------------*/
489/*---------------------------------------------------------------------------*/
490
491} // namespace Arcane::Alina::detail
492
493/*---------------------------------------------------------------------------*/
494/*---------------------------------------------------------------------------*/
495
496#endif
Integer size() const
Number of elements in the vector.
NUMA-aware vector container.
Definition NumaVector.h:42
void resize(Int64 s)
Changes the number of elements in the array to s.
void reserve(Int64 new_capacity)
Reserves memory for new_capacity elements.
void push_back(ConstReferenceType val)
Adds the element val to the end of the array.
Basic information for multi-threading management.
Loop execution information.
Matrix class, to be used by user.
1D data vector with value semantics (STL style).
Vector class, to be used by user.
void arccoreParallelFor(const ComplexForLoopRanges< RankValue > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applies the lambda function lambda_function concurrently over the iteration interval given by loop_ra...
Definition ParallelFor.h:87
std::int32_t Int32
Signed integer type of 32 bits.
Int32 iters
Number of Jacobi iterations.