270 task(ptrdiff_t beg, ptrdiff_t end)
285 Int32 m_nb_level = 0;
287 template <
class Matrix>
288 sptr_solve(
const Matrix& A,
const value_type* _D = 0)
296 ptrdiff_t n = A.nbRow();
303 ptrdiff_t beg = lower ? 0 : n - 1;
304 ptrdiff_t end = lower ? n : -1;
305 ptrdiff_t inc = lower ? 1 : -1;
307 for (ptrdiff_t i = beg; i != end; i += inc) {
308 ptrdiff_t l = level[i];
310 for (
auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j)
311 l = std::max(l, level[A.col[j]] + 1);
314 nlev = std::max(nlev, l + 1);
321 for (ptrdiff_t i = 0; i < n; ++i)
322 ++start[level[i] + 1];
324 std::partial_sum(start.begin(), start.end(), start.begin());
326 for (ptrdiff_t i = 0; i < n; ++i)
327 order[start[level[i]]++] = i;
329 std::rotate(start.begin(), start.end() - 1, start.end());
339 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
342 for (ptrdiff_t lev = 0; lev < nlev; ++lev) {
344 ptrdiff_t lev_size = start[lev + 1] - start[lev];
345 ptrdiff_t chunk_size = (lev_size + nthreads - 1) / nthreads;
347 ptrdiff_t beg = std::min(tid * chunk_size, lev_size);
348 ptrdiff_t end = std::min(beg + chunk_size, lev_size);
356 thread_rows[tid] += end - beg;
357 for (ptrdiff_t i = beg; i < end; ++i) {
358 ptrdiff_t j = order[i];
359 thread_cols[tid] += A.ptr[j + 1] - A.ptr[j];
369 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
370 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
372 col[tid].
reserve(thread_cols[tid]);
373 val[tid].
reserve(thread_cols[tid]);
374 ord[tid].
reserve(thread_rows[tid]);
375 ptr[tid].
reserve(thread_rows[tid] + 1);
379 D[tid].
reserve(thread_rows[tid]);
381 for (task& t : tasks[tid]) {
382 ptrdiff_t loc_beg = ptr[tid].
size() - 1;
383 ptrdiff_t loc_end = loc_beg;
385 for (ptrdiff_t r = t.beg; r < t.end; ++r, ++loc_end) {
386 ptrdiff_t i = order[r];
392 for (
auto j = A.ptr[i]; j < A.ptr[i + 1]; ++j) {
407 template <
class Vector>
408 void solve(Vector& x)
const
410 const Int32 nb_level = m_nb_level;
411 for (Int32 lev = 0; lev < nb_level; ++lev) {
412 arccoreParallelFor(0, nthreads, ForLoopRunInfo{}, [&](Int32 begin, Int32 size) {
413 for (ptrdiff_t tid = begin; tid < (begin + size); ++tid) {
415 const task& t = tasks[tid][lev];
416 for (ptrdiff_t r = t.beg; r < t.end; ++r) {
417 ptrdiff_t i = ord[tid][r];
418 ptrdiff_t beg = ptr[tid][r];
419 ptrdiff_t end = ptr[tid][r + 1];
421 rhs_type X = math::zero<rhs_type>();
422 for (ptrdiff_t j = beg; j < end; ++j)
423 X += val[tid][j] * x[col[tid][j]];
428 x[i] = D[tid][r] * (x[i] - X);
440 for (
int i = 0; i < nthreads; ++i) {
441 b +=
sizeof(task) * tasks[i].size();
442 b += backend::bytes(ptr[i]);
443 b += backend::bytes(col[i]);
444 b += backend::bytes(val[i]);
445 b += backend::bytes(ord[i]);
448 b += backend::bytes(D[i]);