d8/d93/ILUSolverImpl_8h_source.html

// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-

//-----------------------------------------------------------------------------

// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)

// See the top-level COPYRIGHT file for details.

// SPDX-License-Identifier: Apache-2.0

//-----------------------------------------------------------------------------

/*---------------------------------------------------------------------------*/

/* ILUSolverImpl.h                                             (C) 2000-2026 */

/*                                                                           */

/* Solver for systems obtained as a result of an incomplete LU factorization.*/

/*---------------------------------------------------------------------------*/

#ifndef ARCCORE_ALINA_ILUSOLVERIMPL_H

#define ARCCORE_ALINA_ILUSOLVERIMPL_H

/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

/*

 * This file is based on the AMGCL library (version of March 2026),

 * which can be found at https://github.com/ddemidov/amgcl.

 *

 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>

 * SPDX-License-Identifier: MIT

 */

/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#include "arccore/alina/ValueTypeInterface.h"

#include "arccore/alina/BuiltinBackend.h"

#include "arccore/alina/HybridBuiltinBackend.h"

#include "arccore/alina/AlinaUtils.h"


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


namespace Arcane::Alina::Impl

{


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <class Backend>


class ILUSolver

{

 public:


  typedef typename Backend::params backend_params;

  typedef typename Backend::value_type value_type;

  typedef typename Backend::col_type col_type;

  typedef typename Backend::ptr_type ptr_type;

  typedef typename Backend::matrix matrix;

  typedef typename Backend::vector vector;

  typedef typename Backend::matrix_diagonal matrix_diagonal;

  typedef typename BuiltinBackend<value_type, col_type, ptr_type>::matrix build_matrix;

  typedef typename math::scalar_of<value_type>::type scalar_type;


  struct params

  {

    Int32 iters = 2;


    double damping = 0.72;


    params() = default;


    params(const PropertyTree& p)

    : ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, iters)

    , ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, damping)

    {

      p.check_params( { "iters", "damping" });

    }


    void get(PropertyTree& p, const std::string& path) const

    {

      ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, iters);

      ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, damping);

    }


  } prm;


 public:


  ILUSolver(std::shared_ptr<build_matrix> L,

            std::shared_ptr<build_matrix> U,

            std::shared_ptr<numa_vector<value_type>> D,

            const params& prm = params(),

            const backend_params& bprm = backend_params())

  : prm(prm)

  , L(Backend::copy_matrix(L, bprm))

  , U(Backend::copy_matrix(U, bprm))

  , D(Backend::copy_vector(D, bprm))

  , t1(Backend::create_vector(backend::nbRow(*L), bprm))

  , t2(Backend::create_vector(backend::nbRow(*L), bprm))

  {}


  template <class Vector>

  void solve(Vector& x)

  {

    vector* y0 = t1.get();

    vector* y1 = t2.get();


    backend::axpby(prm.damping, x, 0.0, *y0);

    for (unsigned i = 0; i < prm.iters; ++i) {

      backend::residual(x, *L, *y0, *y1);

      backend::axpby(prm.damping, *y1, (1 - prm.damping), *y0);

    }


    backend::vmul(prm.damping, *D, *y0, 0.0, x);

    for (unsigned i = 0; i < prm.iters; ++i) {

      backend::residual(*y0, *U, x, *y1);

      backend::vmul(prm.damping, *D, *y1, (1 - prm.damping), x);

    }

  }


  size_t bytes() const

  {

    return backend::bytes(*L) +

    backend::bytes(*U) +

    backend::bytes(*D) +

    backend::bytes(*t1) +

    backend::bytes(*t2);

  }


 private:


  std::shared_ptr<matrix> L;

  std::shared_ptr<matrix> U;

  std::shared_ptr<matrix_diagonal> D;

  std::shared_ptr<vector> t1, t2;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/

template <class value_type, class col_type, class ptr_type>


class ILUSolver<BuiltinBackend<value_type, col_type, ptr_type>>

{

 public:


  typedef BuiltinBackend<value_type, col_type, ptr_type> Backend;

  typedef typename Backend::params backend_params;

  typedef typename Backend::matrix matrix;

  typedef typename Backend::vector vector;

  typedef typename Backend::matrix_diagonal matrix_diagonal;

  typedef typename BuiltinBackend<value_type, col_type, ptr_type>::matrix build_matrix;

  typedef typename Backend::rhs_type rhs_type;

  typedef typename math::scalar_of<value_type>::type scalar_type;


  // Parameters of the solver for the builtin backend.
  struct params
  {
    // Use serial version of the algorithm.
    bool serial;

    // Default: the serial version is selected when
    // ConcurrencyBase::maxAllowedThread() is below 4.
    params()
    : serial(ConcurrencyBase::maxAllowedThread() < 4)
    {}

    // Reads 'serial' from the property tree and hands the list of known
    // parameter names to check_params() (presumably to detect unknown
    // keys - TODO confirm).
    params(const PropertyTree& p)
    : ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, serial)
    {
      p.check_params({ "serial" });
    }

    // Exports 'serial' to the property tree p under path.
    void get(PropertyTree& p, const std::string& path) const
    {
      ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, serial);
    }
  } prm;


  ILUSolver(std::shared_ptr<build_matrix> L,

            std::shared_ptr<build_matrix> U,

            std::shared_ptr<numa_vector<value_type>> D,

            const params& prm = params(),

            const backend_params& = backend_params())

  : prm(prm)

  {

    if (prm.serial)

      serial_init(L, U, D);

    else

      parallel_init(L, U, D);

  }


  // Applies the L and U solves in place on x, using the implementation
  // selected by prm.serial.
  template <class Vector>
  void solve(Vector& x)
  {
    if (!prm.serial) {
      parallel_solve(x);
      return;
    }

    serial_solve(x);
  }


  // Memory footprint of the solver: only the structures that have actually
  // been set up (serial and/or parallel) are accounted for.
  size_t bytes() const
  {
    size_t total = 0;

    // Data of the serial implementation.
    if (L) {
      total += backend::bytes(*L);
    }
    if (U) {
      total += backend::bytes(*U);
    }
    if (D) {
      total += backend::bytes(*D);
    }

    // Data of the parallel implementation.
    if (lower) {
      total += lower->bytes();
    }
    if (upper) {
      total += upper->bytes();
    }

    return total;
  }


 private:


  // shared references to the input matrices (no deep copy is made), used by
  // the fallback (serial) implementation:

  std::shared_ptr<matrix> L;

  std::shared_ptr<matrix> U;

  std::shared_ptr<matrix_diagonal> D;


  // Setup of the serial implementation: the solver simply shares ownership
  // of the factors and of the diagonal it is given.
  void serial_init(std::shared_ptr<build_matrix> L,
                   std::shared_ptr<build_matrix> U,
                   std::shared_ptr<matrix_diagonal> D)
  {
    // The parameters hide the members of the same name, hence 'this->'.
    this->D = D;
    this->U = U;
    this->L = L;
  }


  template <class Vector>

  void serial_solve(Vector& x)

  {

    const size_t n = backend::nbRow(*L);


    const matrix& L = *(this->L);

    const matrix& U = *(this->U);

    const matrix_diagonal& D = *(this->D);


    for (size_t i = 0; i < n; i++) {

      for (ptrdiff_t j = L.ptr[i], e = L.ptr[i + 1]; j < e; ++j)

        x[i] -= L.val[j] * x[L.col[j]];

    }


    for (size_t i = n; i-- > 0;) {

      for (ptrdiff_t j = U.ptr[i], e = U.ptr[i + 1]; j < e; ++j)

        x[i] -= U.val[j] * x[U.col[j]];

      x[i] = D[i] * x[i];

    }

  }


  // Multi-threaded solver for sparse triangular systems.

  // The solver uses level scheduling approach.

  // Each level (a set of matrix rows that can be computed independently)

  // is split into tasks, a task per thread, and the matrix data is

  // distributed across threads to improve cache and NUMA locality.

  template <bool lower>


  struct sptr_solve

  {

    // a task is a set of rows that can be computed independently by a

    // single thread.


    // Half-open range [beg, end) of rows handled as one unit of work.
    struct task
    {
      ptrdiff_t beg; // first row to process
      ptrdiff_t end; // one past the last row to process

      task(ptrdiff_t beg, ptrdiff_t end)
      : beg{ beg }
      , end{ end }
      {}
    };


    int nthreads;


    // thread-specific storage:

    UniqueArray<UniqueArray<task>> tasks;

    UniqueArray<UniqueArray<ptrdiff_t>> ptr;

    UniqueArray<UniqueArray<ptrdiff_t>> col;

    UniqueArray<UniqueArray<value_type>> val;

    UniqueArray<UniqueArray<ptrdiff_t>> ord; // rows ordered by levels

    UniqueArray<UniqueArray<value_type>> D;

    Int32 m_nb_level = 0;


    // Builds the per-thread data used by solve().
    //
    // A  : triangular factor, read through nbRow() and its ptr / col / val
    //      arrays (row pointers, column indices, values).
    // _D : per-row coefficients; only dereferenced when 'lower' is false, in
    //      which case it must provide one entry per row of A.
    //
    // The number of per-thread slots is ConcurrencyBase::maxAllowedThread().
    template <class Matrix>
    sptr_solve(const Matrix& A, const value_type* _D = 0)
    : nthreads(ConcurrencyBase::maxAllowedThread())
    , tasks(nthreads)
    , ptr(nthreads)
    , col(nthreads)
    , val(nthreads)
    , ord(nthreads)
    {
      ptrdiff_t n = A.nbRow();
      ptrdiff_t nlev = 0;

      // level[i]: level assigned to row i.
      // order   : row indices sorted by increasing level.
      UniqueArray<ptrdiff_t> level(n, 0);
      UniqueArray<ptrdiff_t> order(n, 0);

      // 1. split rows into levels.
      //    Rows are visited in increasing order for the lower factor and in
      //    decreasing order otherwise.
      ptrdiff_t beg = lower ? 0 : n - 1;
      ptrdiff_t end = lower ? n : -1;
      ptrdiff_t inc = lower ? 1 : -1;

      for (ptrdiff_t i = beg; i != end; i += inc) {
        ptrdiff_t l = level[i];

        // A row is placed one level after the deepest row it references
        // through its column indices.
        for (auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j)
          l = std::max(l, level[A.col[j]] + 1);

        level[i] = l;
        nlev = std::max(nlev, l + 1);
      }
      m_nb_level = nlev;

      // 2. reorder matrix rows (counting sort on the level).
      UniqueArray<ptrdiff_t> start(nlev + 1, 0);

      // start[l + 1] = number of rows of level l ...
      for (ptrdiff_t i = 0; i < n; ++i)
        ++start[level[i] + 1];

      // ... then start[l] = index in 'order' of the first row of level l.
      std::partial_sum(start.begin(), start.end(), start.begin());

      // Filling 'order' moves every start[l] to the end of level l, i.e. to
      // the beginning of level l + 1.
      for (ptrdiff_t i = 0; i < n; ++i)
        order[start[level[i]]++] = i;

      // Shift the offsets one slot to the right and reset the first one to
      // recover the beginning of each level.
      std::rotate(start.begin(), start.end() - 1, start.end());
      start[0] = 0;

      // 3. Organize matrix rows into tasks.
      //    Each level is split into nthreads tasks.
      UniqueArray<ptrdiff_t> thread_rows(nthreads, 0);
      UniqueArray<ptrdiff_t> thread_cols(nthreads, 0);

      // TODO: Use a grain size of 1 to use all threads
      // Each iteration only writes the entries of index tid of 'tasks',
      // 'thread_rows' and 'thread_cols'.
      arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
        for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
          tasks[tid].reserve(nlev);

          for (ptrdiff_t lev = 0; lev < nlev; ++lev) {
            // split each level into tasks.
            // Chunks hold ceil(lev_size / nthreads) rows; slot tid takes the
            // tid-th chunk, which may be empty.
            ptrdiff_t lev_size = start[lev + 1] - start[lev];
            ptrdiff_t chunk_size = (lev_size + nthreads - 1) / nthreads;

            ptrdiff_t beg = std::min(tid * chunk_size, lev_size);
            ptrdiff_t end = std::min(beg + chunk_size, lev_size);

            // From offsets inside the level to positions in 'order'.
            beg += start[lev];
            end += start[lev];

            tasks[tid].push_back(task(beg, end));

            // count rows and nonzeros in the current task
            thread_rows[tid] += end - beg;
            for (ptrdiff_t i = beg; i < end; ++i) {
              ptrdiff_t j = order[i];
              thread_cols[tid] += A.ptr[j + 1] - A.ptr[j];
            }
          }
        }
      });

      // 4. reorganize matrix data for better cache and NUMA locality.
      //    The rows of each slot are copied, task after task, into the
      //    slot's own ptr / col / val / ord (and D) arrays.
      if (!lower)
        D.resize(nthreads);

      arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
        for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {

          // Sizes were counted in step 3.
          col[tid].reserve(thread_cols[tid]);
          val[tid].reserve(thread_cols[tid]);
          ord[tid].reserve(thread_rows[tid]);
          ptr[tid].reserve(thread_rows[tid] + 1);
          ptr[tid].push_back(0);

          if (!lower)
            D[tid].reserve(thread_rows[tid]);

          for (task& t : tasks[tid]) {
            // Local index of the first row of this task: number of rows
            // already stored for this slot.
            ptrdiff_t loc_beg = ptr[tid].size() - 1;
            ptrdiff_t loc_end = loc_beg;

            for (ptrdiff_t r = t.beg; r < t.end; ++r, ++loc_end) {
              ptrdiff_t i = order[r];
              if (!lower)
                D[tid].push_back(_D[i]);

              // Remember which row of A the local row corresponds to.
              ord[tid].push_back(i);

              for (auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j) {
                col[tid].push_back(A.col[j]);
                val[tid].push_back(A.val[j]);
              }

              ptr[tid].push_back(col[tid].size());
            }

            // From now on the task designates rows of the slot-local arrays
            // instead of positions in 'order'.
            t.beg = loc_beg;
            t.end = loc_end;
          }
        }
      });
    }


    template <class Vector>

    void solve(Vector& x) const

    {

      const Int32 nb_level = m_nb_level;

      for (Int32 lev = 0; lev < nb_level; ++lev) {

        arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {

          for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {

            //for (ptrdiff_t tid = 0; tid < nthreads; ++tid) {

            const task& t = tasks[tid][lev];

            for (ptrdiff_t r = t.beg; r < t.end; ++r) {

              ptrdiff_t i = ord[tid][r];

              ptrdiff_t beg = ptr[tid][r];

              ptrdiff_t end = ptr[tid][r + 1];


              rhs_type X = math::zero<rhs_type>();

              for (ptrdiff_t j = beg; j < end; ++j)

                X += val[tid][j] * x[col[tid][j]];


              if (lower)

                x[i] -= X;

              else

                x[i] = D[tid][r] * (x[i] - X);

            }

          }

        });

      }

    }


    // Memory used by the per-thread arrays, summed over all slots.
    size_t bytes() const
    {
      size_t total = 0;

      for (int tid = 0; tid < nthreads; ++tid) {
        total += sizeof(task) * tasks[tid].size();
        total += backend::bytes(ptr[tid]);
        total += backend::bytes(col[tid]);
        total += backend::bytes(val[tid]);
        total += backend::bytes(ord[tid]);

        // D is only filled for the upper factor.
        if (!lower) {
          total += backend::bytes(D[tid]);
        }
      }

      return total;
    }

  };


  std::shared_ptr<sptr_solve<true>> lower;

  std::shared_ptr<sptr_solve<false>> upper;


  void parallel_init(std::shared_ptr<build_matrix> L,

                     std::shared_ptr<build_matrix> U,

                     std::shared_ptr<numa_vector<value_type>> D)

  {

    lower = std::make_shared<sptr_solve<true>>(*L, D->data());

    upper = std::make_shared<sptr_solve<false>>(*U, D->data());

  }


  template <class Vector>

  void parallel_solve(Vector& x)

  {

    lower->solve(x);

    upper->solve(x);

  }

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


template <class Block, class Col, class Ptr>


class ILUSolver<backend::HybridBuiltinBackend<Block, Col, Ptr>>

: public ILUSolver<BuiltinBackend<typename math::scalar_of<Block>::type, Col, Ptr>>

{

  typedef ILUSolver<BuiltinBackend<typename math::scalar_of<Block>::type, Col, Ptr>> Base;


 public:


  using Base::Base;

};


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


} // namespace Arcane::Alina::Impl


/*---------------------------------------------------------------------------*/

/*---------------------------------------------------------------------------*/


#endif

Arcane::AbstractArray::size
Integer size() const
Number of elements in the vector.
Definition AbstractArray.h:303

Arcane::Alina::PropertyTree
Definition AlinaUtils.h:107

Arcane::Alina::numa_vector
NUMA-aware vector container.
Definition NumaVector.h:42

Arcane::Array::resize
void resize(Int64 s)
Changes the number of elements in the array to s.
Definition arccore/src/common/arccore/common/Array.h:250

Arcane::Array::reserve
void reserve(Int64 new_capacity)
Reserves memory for new_capacity elements.
Definition arccore/src/common/arccore/common/Array.h:276

Arcane::Array::push_back
void push_back(ConstReferenceType val)
Adds the element val to the end of the array.
Definition arccore/src/common/arccore/common/Array.h:497

Arcane::ConcurrencyBase
Basic information for multi-threading management.
Definition ConcurrencyBase.h:32

Arcane::ForLoopRunInfo
Loop execution information.
Definition ForLoopRunInfo.h:38

Arcane::Matrix
Matrix class, to be used by user.
Definition matrix/Matrix.h:36

Arcane::UniqueArray
1D data vector with value semantics (STL style).
Definition arccore/src/common/arccore/common/Array.h:890

Arcane::Vector
Vector class, to be used by user.
Definition matrix/Vector.h:36

Arcane::arccoreParallelFor
void arccoreParallelFor(const ComplexForLoopRanges< RankValue, IndexType_ > &loop_ranges, const ForLoopRunInfo &run_info, const LambdaType &lambda_function, const ReducerArgs &... reducer_args)
Applies the lambda function lambda_function concurrently over the iteration interval given by loop_ra...
Definition ParallelFor.h:86

Arcane::Int32
std::int32_t Int32
Signed integer type of 32 bits.
Definition ArccoreGlobal.h:233

Arcane::Alina::BuiltinBackend
Definition BuiltinBackend.h:60

Arcane::Alina::BuiltinBackend< value_type, col_type, ptr_type >::params
Alina::detail::empty_params params
Definition BuiltinBackend.h:77

Arcane::Alina::params

Arcane::Alina::Impl::ILUSolver::params::iters
Int32 iters
Number of Jacobi iterations.
Definition ILUSolverImpl.h:61

Arcane::Alina::Impl::ILUSolver::params::damping
double damping
Damping factor.
Definition ILUSolverImpl.h:64

Arcane::Alina::Impl::ILUSolver< BuiltinBackend< value_type, col_type, ptr_type > >::params
Definition ILUSolverImpl.h:157

Arcane::Alina::Impl::ILUSolver< BuiltinBackend< value_type, col_type, ptr_type > >::params::serial
bool serial
Use serial version of the algorithm.
Definition ILUSolverImpl.h:159