Arcane  v4.1.10.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
CudaBackend.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaBackend.h (C) 2000-2026 */
9/* */
10/* backend using Cuda runtime. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ALINA_CUDABACKEND_H
13#define ARCCORE_ALINA_CUDABACKEND_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16/*
17 * This file is based on the work on AMGCL library (version march 2026)
18 * which can be found at https://github.com/ddemidov/amgcl.
19 *
20 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>
21 * SPDX-License-Identifier: MIT
22 */
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
26#include <type_traits>
27#include <memory>
28
29#include "arccore/alina/BuiltinBackend.h"
30#include "arccore/alina/SkylineLUSolver.h"
31#include "arccore/alina/AlinaUtils.h"
32
33#include <thrust/device_vector.h>
34#include <thrust/fill.h>
35#include <thrust/copy.h>
36#include <thrust/gather.h>
37#include <thrust/scatter.h>
38#include <thrust/for_each.h>
39#include <thrust/inner_product.h>
40#include <cusparse_v2.h>
41
42/*---------------------------------------------------------------------------*/
43/*---------------------------------------------------------------------------*/
44
45namespace Arcane::Alina::solver
46{
51template <class T>
52struct cuda_skyline_lu : SkylineLUSolver<T>
53{
54 typedef SkylineLUSolver<T> Base;
55
56 mutable std::vector<T> _rhs, _x;
57
58 template <class Matrix, class Params>
59 cuda_skyline_lu(const Matrix& A, const Params&)
60 : Base(*A)
61 , _rhs(backend::nbRow(*A))
62 , _x(backend::nbRow(*A))
63 {}
64
65 template <class Vec1, class Vec2>
66 void operator()(const Vec1& rhs, Vec2& x) const
67 {
68 thrust::copy(rhs.begin(), rhs.end(), _rhs.begin());
69 static_cast<const Base*>(this)->operator()(_rhs, _x);
70 thrust::copy(_x.begin(), _x.end(), x.begin());
71 }
72
73 size_t bytes() const
74 {
75 return backend::bytes(*static_cast<const Base*>(this)) +
76 backend::bytes(_rhs) +
77 backend::bytes(_x);
78 }
79};
80
81}
82
83namespace Arcane::Alina::backend::detail
84{
85
86inline void
87cuda_check(cusparseStatus_t rc, const char* file, int line)
88{
89 if (rc != CUSPARSE_STATUS_SUCCESS) {
90 std::ostringstream msg;
91 msg << "CUDA error " << rc << " at \"" << file << ":" << line;
92 precondition(false, msg.str());
93 }
94}
95
96inline void
97cuda_check(cudaError_t rc, const char* file, int line)
98{
99 if (rc != cudaSuccess) {
100 std::ostringstream msg;
101 msg << "CUDA error " << rc << " at \"" << file << ":" << line;
102 precondition(false, msg.str());
103 }
104}
105
106#define ARCCORE_ALINA_CALL_CUDA(rc) \
107 Arcane::Alina::backend::detail::cuda_check(rc, __FILE__, __LINE__)
108
110{
111 void operator()(cusparseMatDescr_t handle)
112 {
113 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyMatDescr(handle));
114 }
115
116 void operator()(cusparseSpMatDescr_t handle)
117 {
118 ARCCORE_ALINA_CALL_CUDA(cusparseDestroySpMat(handle));
119 }
120
121 void operator()(cusparseDnVecDescr_t handle)
122 {
123 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyDnVec(handle));
124 }
125
126 void operator()(cudaEvent_t handle)
127 {
128 ARCCORE_ALINA_CALL_CUDA(cudaEventDestroy(handle));
129 }
130
131 void operator()(csrilu02Info_t handle)
132 {
133 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyCsrilu02Info(handle));
134 }
135
136 void operator()(cusparseSpSVDescr_t handle)
137 {
138 ARCCORE_ALINA_CALL_CUDA(cusparseSpSV_destroyDescr(handle));
139 }
140};
141
142template <typename real>
143cudaDataType cuda_datatype()
144{
145 if (sizeof(real) == sizeof(float))
146 return CUDA_R_32F;
147 else
148 return CUDA_R_64F;
149}
150
151template <typename real>
152cusparseDnVecDescr_t cuda_vector_description(thrust::device_vector<real>& x)
153{
154 cusparseDnVecDescr_t desc;
155 ARCCORE_ALINA_CALL_CUDA(cusparseCreateDnVec(&desc,
156 x.size(),
157 thrust::raw_pointer_cast(&x[0]),
158 cuda_datatype<real>()));
159 return desc;
160}
161
162template <typename real> cusparseDnVecDescr_t
163cuda_vector_description(const thrust::device_vector<real>&& x)
164{
165 cusparseDnVecDescr_t desc;
166 ARCCORE_ALINA_CALL_CUDA(
167 cusparseCreateDnVec(&desc,
168 x.size(),
169 thrust::raw_pointer_cast(&x[0]),
170 cuda_datatype<real>()));
171 return desc;
172}
173
174template <typename real> cusparseSpMatDescr_t
175cuda_matrix_description(size_t nrows,
176 size_t ncols,
177 size_t nnz,
178 thrust::device_vector<int>& ptr,
179 thrust::device_vector<int>& col,
180 thrust::device_vector<real>& val)
181{
182 cusparseSpMatDescr_t desc;
183 ARCCORE_ALINA_CALL_CUDA(
184 cusparseCreateCsr(&desc,
185 nrows,
186 ncols,
187 nnz,
188 thrust::raw_pointer_cast(&ptr[0]),
189 thrust::raw_pointer_cast(&col[0]),
190 thrust::raw_pointer_cast(&val[0]),
191 CUSPARSE_INDEX_32I,
192 CUSPARSE_INDEX_32I,
193 CUSPARSE_INDEX_BASE_ZERO,
194 detail::cuda_datatype<real>()));
195 return desc;
196}
197}
198
199/*---------------------------------------------------------------------------*/
200/*---------------------------------------------------------------------------*/
201
202namespace Arcane::Alina::backend
203{
204
205/*---------------------------------------------------------------------------*/
206/*---------------------------------------------------------------------------*/
207
209template <typename real>
210class cuda_matrix
211{
212 public:
213
214 typedef real value_type;
215
216 cuda_matrix(size_t n, size_t m,
217 const ptrdiff_t* p_ptr,
218 const ptrdiff_t* p_col,
219 const real* p_val,
220 cusparseHandle_t handle)
221 : nrows(n)
222 , ncols(m)
223 , nnz(p_ptr[n])
224 , handle(handle)
225 , ptr(p_ptr, p_ptr + n + 1)
226 , col(p_col, p_col + nnz)
227 , val(p_val, p_val + nnz)
228 {
229 desc.reset(detail::cuda_matrix_description(nrows, ncols, nnz, ptr, col, val),
231 }
232
233 void spmv(real alpha, thrust::device_vector<real> const& x,
234 real beta, thrust::device_vector<real>& y) const
235 {
236 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> xdesc(
237 detail::cuda_vector_description(const_cast<thrust::device_vector<real>&>(x)),
239 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> ydesc(
240 detail::cuda_vector_description(y),
242
243 size_t buf_size;
244 ARCCORE_ALINA_CALL_CUDA(
245 cusparseSpMV_bufferSize(handle,
246 CUSPARSE_OPERATION_NON_TRANSPOSE,
247 &alpha,
248 desc.get(),
249 xdesc.get(),
250 &beta,
251 ydesc.get(),
252 detail::cuda_datatype<real>(),
253 CUSPARSE_SPMV_CSR_ALG1,
254 &buf_size));
255
256 if (buf.size() < buf_size)
257 buf.resize(buf_size);
258
259 ARCCORE_ALINA_CALL_CUDA(
260 cusparseSpMV(handle,
261 CUSPARSE_OPERATION_NON_TRANSPOSE,
262 &alpha,
263 desc.get(),
264 xdesc.get(),
265 &beta,
266 ydesc.get(),
267 detail::cuda_datatype<real>(),
268 CUSPARSE_SPMV_CSR_ALG1,
269 thrust::raw_pointer_cast(&buf[0])));
270 }
271
272 size_t rows() const { return nrows; }
273 size_t cols() const { return ncols; }
274 size_t nonzeros() const { return nnz; }
275 size_t bytes() const
276 {
277 return sizeof(int) * (nrows + 1) +
278 sizeof(int) * nnz +
279 sizeof(real) * nnz;
280 }
281
282 public:
283
284 size_t nrows, ncols, nnz;
285
286 cusparseHandle_t handle;
287
288 std::shared_ptr<std::remove_pointer<cusparseSpMatDescr_t>::type> desc;
289
290 thrust::device_vector<int> ptr;
291 thrust::device_vector<int> col;
292 thrust::device_vector<real> val;
293
294 mutable thrust::device_vector<char> buf;
295};
296
297/*---------------------------------------------------------------------------*/
298/*---------------------------------------------------------------------------*/
299
300/*---------------------------------------------------------------------------*/
301/*---------------------------------------------------------------------------*/
302
304
310template <typename real, class DirectSolver = solver::cuda_skyline_lu<real>>
311struct cuda
312{
313 static_assert(std::is_same<real, float>::value ||
314 std::is_same<real, double>::value,
315 "Unsupported value type for cuda backend");
316
317 typedef real value_type;
318 typedef ptrdiff_t col_type;
319 typedef ptrdiff_t ptr_type;
320 typedef cuda_matrix<real> matrix;
321 typedef thrust::device_vector<real> vector;
322 typedef thrust::device_vector<real> matrix_diagonal;
323 typedef DirectSolver direct_solver;
324
325 struct provides_row_iterator : std::false_type
326 {};
327
329 struct params
330 {
332 cusparseHandle_t cusparse_handle = nullptr;
333
334 params(cusparseHandle_t handle = nullptr)
335 : cusparse_handle(handle)
336 {}
337
338 params(const PropertyTree& p)
339 //: ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, cusparse_handle)
340 {
341 //check_params(p, { "cusparse_handle" });
342 }
343
344 void get(PropertyTree& p, const std::string& path) const
345 {
346 //ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, cusparse_handle);
347 }
348 };
349
350 static std::string name() { return "cuda"; }
351
353 static std::shared_ptr<matrix>
354 copy_matrix(std::shared_ptr<typename BuiltinBackend<real>::matrix> A, const params& prm)
355 {
356 return std::make_shared<matrix>(backend::nbRow(*A), backend::nbColumn(*A),
357 A->ptr, A->col, A->val, prm.cusparse_handle);
358 }
359
361 static std::shared_ptr<vector>
362 copy_vector(typename BuiltinBackend<real>::vector const& x, const params&)
363 {
364 return std::make_shared<vector>(x.data(), x.data() + x.size());
365 }
366
368 static std::shared_ptr<vector>
369 copy_vector(std::shared_ptr<typename BuiltinBackend<real>::vector> x, const params& prm)
370 {
371 return copy_vector(*x, prm);
372 }
373
375 static std::shared_ptr<vector>
376 create_vector(size_t size, const params&)
377 {
378 return std::make_shared<vector>(size);
379 }
380
382 static std::shared_ptr<direct_solver>
383 create_solver(std::shared_ptr<typename BuiltinBackend<real>::matrix> A, const params& prm)
384 {
385 return std::make_shared<direct_solver>(A, prm);
386 }
387
388 struct gather
389 {
390 thrust::device_vector<ptrdiff_t> I;
391 mutable thrust::device_vector<value_type> T;
392
393 gather(size_t src_size, const std::vector<ptrdiff_t>& I, const params&)
394 : I(I)
395 , T(I.size())
396 {}
397
398 void operator()(const vector& src, vector& dst) const
399 {
400 thrust::gather(I.begin(), I.end(), src.begin(), dst.begin());
401 }
402
403 void operator()(const vector& vec, std::vector<value_type>& vals) const
404 {
405 thrust::gather(I.begin(), I.end(), vec.begin(), T.begin());
406 thrust::copy(T.begin(), T.end(), vals.begin());
407 }
408 };
409
410 struct scatter
411 {
412 thrust::device_vector<ptrdiff_t> I;
413
414 scatter(size_t size, const std::vector<ptrdiff_t>& I, const params&)
415 : I(I)
416 {}
417
418 void operator()(const vector& src, vector& dst) const
419 {
420 thrust::scatter(src.begin(), src.end(), I.begin(), dst.begin());
421 }
422 };
423};
424
425//---------------------------------------------------------------------------
426// Backend interface implementation
427//---------------------------------------------------------------------------
428template <typename V>
429struct bytes_impl<thrust::device_vector<V>>
430{
431 static size_t get(const thrust::device_vector<V>& v)
432 {
433 return v.size() * sizeof(V);
434 }
435};
436
437template <typename Alpha, typename Beta, typename V>
438struct spmv_impl<Alpha, cuda_matrix<V>, thrust::device_vector<V>,
439 Beta, thrust::device_vector<V>>
440{
441 typedef cuda_matrix<V> matrix;
442 typedef thrust::device_vector<V> vector;
443
444 static void apply(Alpha alpha, const matrix& A, const vector& x,
445 Beta beta, vector& y)
446 {
447 A.spmv(alpha, x, beta, y);
448 }
449};
450
451template <typename V>
453 thrust::device_vector<V>,
454 thrust::device_vector<V>,
455 thrust::device_vector<V>>
456{
457 typedef cuda_matrix<V> matrix;
458 typedef thrust::device_vector<V> vector;
459
460 static void apply(const vector& rhs, const matrix& A, const vector& x,
461 vector& r)
462 {
463 thrust::copy(rhs.begin(), rhs.end(), r.begin());
464 A.spmv(-1, x, 1, r);
465 }
466};
467
468template <typename V>
469struct clear_impl<thrust::device_vector<V>>
470{
471 typedef thrust::device_vector<V> vector;
472
473 static void apply(vector& x)
474 {
475 thrust::fill(x.begin(), x.end(), V());
476 }
477};
478
479template <class V, class T>
480struct copy_impl<V, thrust::device_vector<T>>
481{
482 static void apply(const V& x, thrust::device_vector<T>& y)
483 {
484 thrust::copy(x.begin(), x.end(), y.begin());
485 }
486};
487
488template <class T, class V>
489struct copy_impl<thrust::device_vector<T>, V>
490{
491 static void apply(const thrust::device_vector<T>& x, V& y)
492 {
493 thrust::copy(x.begin(), x.end(), y.begin());
494 }
495};
496
497template <class T1, class T2>
498struct copy_impl<thrust::device_vector<T1>, thrust::device_vector<T2>>
499{
500 static void apply(const thrust::device_vector<T1>& x, thrust::device_vector<T2>& y)
501 {
502 thrust::copy(x.begin(), x.end(), y.begin());
503 }
504};
505
506template <typename V>
507struct inner_product_impl<thrust::device_vector<V>,
508 thrust::device_vector<V>>
509{
510 typedef thrust::device_vector<V> vector;
511
512 static V get(const vector& x, const vector& y)
513 {
514 return thrust::inner_product(x.begin(), x.end(), y.begin(), V());
515 }
516};
517
518template <typename A, typename B, typename V>
519struct axpby_impl<A, thrust::device_vector<V>,
520 B, thrust::device_vector<V>>
521{
522 typedef thrust::device_vector<V> vector;
523
524 struct functor
525 {
526 A a;
527 B b;
528 functor(A a, B b)
529 : a(a)
530 , b(b)
531 {}
532
533 template <class Tuple>
534 __host__ __device__ void operator()(Tuple t) const
535 {
536 using thrust::get;
537
538 if (b)
539 get<1>(t) = a * get<0>(t) + b * get<1>(t);
540 else
541 get<1>(t) = a * get<0>(t);
542 }
543 };
544
545 static void apply(A a, const vector& x, B b, vector& y)
546 {
547 thrust::for_each(thrust::make_zip_iterator(
548 thrust::make_tuple(x.begin(), y.begin())),
549 thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
550 functor(a, b));
551 }
552};
553
554template <typename A, typename B, typename C, typename V>
555struct axpbypcz_impl<A, thrust::device_vector<V>,
556 B, thrust::device_vector<V>,
557 C, thrust::device_vector<V>>
558{
559 typedef thrust::device_vector<V> vector;
560
561 struct functor
562 {
563 A a;
564 B b;
565 C c;
566
567 functor(A a, B b, C c)
568 : a(a)
569 , b(b)
570 , c(c)
571 {}
572
573 template <class Tuple>
574 __host__ __device__ void operator()(Tuple t) const
575 {
576 using thrust::get;
577
578 if (c)
579 get<2>(t) = a * get<0>(t) + b * get<1>(t) + c * get<2>(t);
580 else
581 get<2>(t) = a * get<0>(t) + b * get<1>(t);
582 }
583 };
584
585 static void apply(A a, const vector& x,
586 B b, const vector& y,
587 C c, vector& z)
588 {
589 thrust::for_each(
590 thrust::make_zip_iterator(
591 thrust::make_tuple(
592 x.begin(), y.begin(), z.begin())),
593 thrust::make_zip_iterator(
594 thrust::make_tuple(
595 x.end(), y.end(), z.end())),
596 functor(a, b, c));
597 }
598};
599
600template <typename A, typename B, typename V>
601struct vmul_impl<A, thrust::device_vector<V>, thrust::device_vector<V>,
602 B, thrust::device_vector<V>>
603{
604 typedef thrust::device_vector<V> vector;
605
606 struct functor
607 {
608 A a;
609 B b;
610 functor(A a, B b)
611 : a(a)
612 , b(b)
613 {}
614
615 template <class Tuple>
616 __host__ __device__ void operator()(Tuple t) const
617 {
618 using thrust::get;
619
620 if (b)
621 get<2>(t) = a * get<0>(t) * get<1>(t) + b * get<2>(t);
622 else
623 get<2>(t) = a * get<0>(t) * get<1>(t);
624 }
625 };
626
627 static void apply(A a, const vector& x, const vector& y, B b, vector& z)
628 {
629 thrust::for_each(thrust::make_zip_iterator(
630 thrust::make_tuple(
631 x.begin(), y.begin(), z.begin())),
632 thrust::make_zip_iterator(
633 thrust::make_tuple(
634 x.end(), y.end(), z.end())),
635 functor(a, b));
636 }
637};
638
639class cuda_event
640{
641 public:
642
643 cuda_event()
644 : e(create_event(), backend::detail::cuda_deleter())
645 {}
646
647 float operator-(cuda_event tic) const
648 {
649 float delta;
650 cudaEventSynchronize(e.get());
651 cudaEventElapsedTime(&delta, tic.e.get(), e.get());
652 return delta / 1000.0f;
653 }
654
655 private:
656
657 std::shared_ptr<std::remove_pointer<cudaEvent_t>::type> e;
658
659 static cudaEvent_t create_event()
660 {
661 cudaEvent_t e;
662 cudaEventCreate(&e);
663 cudaEventRecord(e, 0);
664 return e;
665 }
666};
667
669{
670 typedef cuda_event value_type;
671
672 static const char* units() { return "s"; }
673
674 cuda_event current() const
675 {
676 return cuda_event();
677 }
678};
679
680} // namespace Arcane::Alina::backend
681
682#endif
CUSPARSE matrix in CSR format.
Matrix class, to be used by user.
Implementation for linear combination of two vectors.
Implementation for linear combination of three vectors.
Implementation for function returning number of bytes allocated for a matrix/vector.
Implementation for zeroing out a vector.
Implementation for vector copy.
cusparseHandle_t cusparse_handle
CUSPARSE handle.
static std::shared_ptr< vector > create_vector(size_t size, const params &)
Create vector of the specified size.
static std::shared_ptr< vector > copy_vector(std::shared_ptr< typename BuiltinBackend< real >::vector > x, const params &prm)
Copy vector from builtin backend.
static std::shared_ptr< direct_solver > create_solver(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Create direct solver for coarse level.
static std::shared_ptr< matrix > copy_matrix(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Copy matrix from builtin backend.
static std::shared_ptr< vector > copy_vector(typename BuiltinBackend< real >::vector const &x, const params &)
Copy vector from builtin backend.
Implementation for inner product.
Implementation for residual error compuatation.
Implementation for matrix-vector product.
Implementation for element-wize vector product.