Arcane  4.1.11.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
CudaBackend.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*---------------------------------------------------------------------------*/
8/* CudaBackend.h (C) 2000-2026 */
9/* */
10/* backend using Cuda runtime. */
11/*---------------------------------------------------------------------------*/
12#ifndef ARCCORE_ALINA_CUDABACKEND_H
13#define ARCCORE_ALINA_CUDABACKEND_H
14/*---------------------------------------------------------------------------*/
15/*---------------------------------------------------------------------------*/
16/*
17 * This file is based on the work on AMGCL library (version march 2026)
18 * which can be found at https://github.com/ddemidov/amgcl.
19 *
20 * Copyright (c) 2012-2022 Denis Demidov <dennis.demidov@gmail.com>
21 * SPDX-License-Identifier: MIT
22 */
23/*---------------------------------------------------------------------------*/
24/*---------------------------------------------------------------------------*/
25
26#include <type_traits>
27#include <memory>
28
29#include "arccore/alina/BuiltinBackend.h"
30#include "arccore/alina/SkylineLUSolver.h"
31#include "arccore/alina/AlinaUtils.h"
32
33#include <thrust/device_vector.h>
34#include <thrust/fill.h>
35#include <thrust/copy.h>
36#include <thrust/gather.h>
37#include <thrust/scatter.h>
38#include <thrust/for_each.h>
39#include <thrust/inner_product.h>
40#include <cusparse_v2.h>
41
42/*---------------------------------------------------------------------------*/
43/*---------------------------------------------------------------------------*/
44
45namespace Arcane::Alina::solver
46{
51template <class T>
52struct cuda_skyline_lu : SkylineLUSolver<T>
53{
54 typedef SkylineLUSolver<T> Base;
55
56 mutable std::vector<T> _rhs, _x;
57
58 template <class Matrix, class Params>
59 cuda_skyline_lu(const Matrix& A, const Params&)
60 : Base(*A)
61 , _rhs(backend::nbRow(*A))
62 , _x(backend::nbRow(*A))
63 {}
64
65 template <class Vec1, class Vec2>
66 void operator()(const Vec1& rhs, Vec2& x) const
67 {
68 thrust::copy(rhs.begin(), rhs.end(), _rhs.begin());
69 static_cast<const Base*>(this)->operator()(_rhs, _x);
70 thrust::copy(_x.begin(), _x.end(), x.begin());
71 }
72
73 size_t bytes() const
74 {
75 return backend::bytes(*static_cast<const Base*>(this)) +
76 backend::bytes(_rhs) +
77 backend::bytes(_x);
78 }
79};
80
81}
82
83namespace Arcane::Alina::backend::detail
84{
85
86inline void
87cuda_check(cusparseStatus_t rc, const char* file, int line)
88{
89 if (rc != CUSPARSE_STATUS_SUCCESS) {
90 std::ostringstream msg;
91 msg << "CUDA error " << rc << " at \"" << file << ":" << line;
92 precondition(false, msg.str());
93 }
94}
95
96inline void
97cuda_check(cudaError_t rc, const char* file, int line)
98{
99 if (rc != cudaSuccess) {
100 std::ostringstream msg;
101 msg << "CUDA error " << rc << " at \"" << file << ":" << line;
102 precondition(false, msg.str());
103 }
104}
105
106#define ARCCORE_ALINA_CALL_CUDA(rc) \
107 Arcane::Alina::backend::detail::cuda_check(rc, __FILE__, __LINE__)
108
110{
111 void operator()(cusparseMatDescr_t handle)
112 {
113 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyMatDescr(handle));
114 }
115
116 void operator()(cusparseSpMatDescr_t handle)
117 {
118 ARCCORE_ALINA_CALL_CUDA(cusparseDestroySpMat(handle));
119 }
120
121 void operator()(cusparseDnVecDescr_t handle)
122 {
123 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyDnVec(handle));
124 }
125
126 void operator()(cudaEvent_t handle)
127 {
128 ARCCORE_ALINA_CALL_CUDA(cudaEventDestroy(handle));
129 }
130
131 void operator()(csrilu02Info_t handle)
132 {
133 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyCsrilu02Info(handle));
134 }
135
136 void operator()(cusparseSpSVDescr_t handle)
137 {
138 ARCCORE_ALINA_CALL_CUDA(cusparseSpSV_destroyDescr(handle));
139 }
140};
141
142template <typename real>
143cudaDataType cuda_datatype()
144{
145 if (sizeof(real) == sizeof(float))
146 return CUDA_R_32F;
147 else
148 return CUDA_R_64F;
149}
150
151template <typename real>
152cusparseDnVecDescr_t cuda_vector_description(thrust::device_vector<real>& x)
153{
154 cusparseDnVecDescr_t desc;
155 ARCCORE_ALINA_CALL_CUDA(cusparseCreateDnVec(&desc,
156 x.size(),
157 thrust::raw_pointer_cast(&x[0]),
158 cuda_datatype<real>()));
159 return desc;
160}
161
162template <typename real> cusparseDnVecDescr_t
163cuda_vector_description(const thrust::device_vector<real>&& x)
164{
165 cusparseDnVecDescr_t desc;
166 ARCCORE_ALINA_CALL_CUDA(
167 cusparseCreateDnVec(&desc,
168 x.size(),
169 thrust::raw_pointer_cast(&x[0]),
170 cuda_datatype<real>()));
171 return desc;
172}
173
174template <typename real> cusparseSpMatDescr_t
175cuda_matrix_description(size_t nrows,
176 size_t ncols,
177 size_t nnz,
178 thrust::device_vector<int>& ptr,
179 thrust::device_vector<int>& col,
180 thrust::device_vector<real>& val)
181{
182 cusparseSpMatDescr_t desc;
183 ARCCORE_ALINA_CALL_CUDA(
184 cusparseCreateCsr(&desc,
185 nrows,
186 ncols,
187 nnz,
188 thrust::raw_pointer_cast(&ptr[0]),
189 thrust::raw_pointer_cast(&col[0]),
190 thrust::raw_pointer_cast(&val[0]),
191 CUSPARSE_INDEX_32I,
192 CUSPARSE_INDEX_32I,
193 CUSPARSE_INDEX_BASE_ZERO,
194 detail::cuda_datatype<real>()));
195 return desc;
196}
197}
198
199/*---------------------------------------------------------------------------*/
200/*---------------------------------------------------------------------------*/
201
202namespace Arcane::Alina::backend
203{
204
205/*---------------------------------------------------------------------------*/
206/*---------------------------------------------------------------------------*/
207
209template <typename real>
210class cuda_matrix
211{
212 public:
213
214 typedef real value_type;
215
216 cuda_matrix(size_t n, size_t m,
217 const ptrdiff_t* p_ptr,
218 const ptrdiff_t* p_col,
219 const real* p_val,
220 cusparseHandle_t handle)
221 : nrows(n)
222 , ncols(m)
223 , nnz(p_ptr[n])
224 , handle(handle)
225 , ptr(p_ptr, p_ptr + n + 1)
226 , col(p_col, p_col + nnz)
227 , val(p_val, p_val + nnz)
228 {
229 desc.reset(detail::cuda_matrix_description(nrows, ncols, nnz, ptr, col, val),
231 }
232
233 void spmv(real alpha, thrust::device_vector<real> const& x,
234 real beta, thrust::device_vector<real>& y) const
235 {
236 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> xdesc(
237 detail::cuda_vector_description(const_cast<thrust::device_vector<real>&>(x)),
239 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> ydesc(
240 detail::cuda_vector_description(y),
242
243 size_t buf_size;
244 ARCCORE_ALINA_CALL_CUDA(
245 cusparseSpMV_bufferSize(handle,
246 CUSPARSE_OPERATION_NON_TRANSPOSE,
247 &alpha,
248 desc.get(),
249 xdesc.get(),
250 &beta,
251 ydesc.get(),
252 detail::cuda_datatype<real>(),
253 CUSPARSE_SPMV_CSR_ALG1,
254 &buf_size));
255
256 if (buf.size() < buf_size)
257 buf.resize(buf_size);
258
259 ARCCORE_ALINA_CALL_CUDA(
260 cusparseSpMV(handle,
261 CUSPARSE_OPERATION_NON_TRANSPOSE,
262 &alpha,
263 desc.get(),
264 xdesc.get(),
265 &beta,
266 ydesc.get(),
267 detail::cuda_datatype<real>(),
268 CUSPARSE_SPMV_CSR_ALG1,
269 thrust::raw_pointer_cast(&buf[0])));
270 }
271
272 size_t nbRow() const { return nrows; }
273 size_t nbColumn() const { return ncols; }
274 size_t nbNonZero() const { return nnz; }
275
276 size_t rows() const { return nrows; }
277 size_t cols() const { return ncols; }
278 size_t nonzeros() const { return nnz; }
279 size_t bytes() const
280 {
281 return sizeof(int) * (nrows + 1) +
282 sizeof(int) * nnz +
283 sizeof(real) * nnz;
284 }
285
286 public:
287
288 size_t nrows, ncols, nnz;
289
290 cusparseHandle_t handle;
291
292 std::shared_ptr<std::remove_pointer<cusparseSpMatDescr_t>::type> desc;
293
294 thrust::device_vector<int> ptr;
295 thrust::device_vector<int> col;
296 thrust::device_vector<real> val;
297
298 mutable thrust::device_vector<char> buf;
299};
300
301/*---------------------------------------------------------------------------*/
302/*---------------------------------------------------------------------------*/
303
304/*---------------------------------------------------------------------------*/
305/*---------------------------------------------------------------------------*/
306
308
314template <typename real, class DirectSolver = solver::cuda_skyline_lu<real>>
315struct cuda
316{
317 static_assert(std::is_same<real, float>::value ||
318 std::is_same<real, double>::value,
319 "Unsupported value type for cuda backend");
320
321 typedef real value_type;
322 typedef ptrdiff_t col_type;
323 typedef ptrdiff_t ptr_type;
324 typedef cuda_matrix<real> matrix;
325 typedef thrust::device_vector<real> vector;
326 typedef thrust::device_vector<real> matrix_diagonal;
327 typedef DirectSolver direct_solver;
328
329 struct provides_row_iterator : std::false_type
330 {};
331
333 struct params
334 {
336 cusparseHandle_t cusparse_handle = nullptr;
337
338 params(cusparseHandle_t handle = nullptr)
339 : cusparse_handle(handle)
340 {}
341
342 params(const PropertyTree& p)
343 //: ARCCORE_ALINA_PARAMS_IMPORT_VALUE(p, cusparse_handle)
344 {
345 //check_params(p, { "cusparse_handle" });
346 }
347
348 void get(PropertyTree& p, const std::string& path) const
349 {
350 //ARCCORE_ALINA_PARAMS_EXPORT_VALUE(p, path, cusparse_handle);
351 }
352 };
353
354 static std::string name() { return "cuda"; }
355
357 static std::shared_ptr<matrix>
358 copy_matrix(std::shared_ptr<typename BuiltinBackend<real>::matrix> A, const params& prm)
359 {
360 return std::make_shared<matrix>(backend::nbRow(*A), backend::nbColumn(*A),
361 A->ptr, A->col, A->val, prm.cusparse_handle);
362 }
363
365 static std::shared_ptr<vector>
366 copy_vector(typename BuiltinBackend<real>::vector const& x, const params&)
367 {
368 return std::make_shared<vector>(x.data(), x.data() + x.size());
369 }
370
372 static std::shared_ptr<vector>
373 copy_vector(std::shared_ptr<typename BuiltinBackend<real>::vector> x, const params& prm)
374 {
375 return copy_vector(*x, prm);
376 }
377
379 static std::shared_ptr<vector>
380 create_vector(size_t size, const params&)
381 {
382 return std::make_shared<vector>(size);
383 }
384
386 static std::shared_ptr<direct_solver>
387 create_solver(std::shared_ptr<typename BuiltinBackend<real>::matrix> A, const params& prm)
388 {
389 return std::make_shared<direct_solver>(A, prm);
390 }
391
392 struct gather
393 {
394 thrust::device_vector<ptrdiff_t> I;
395 mutable thrust::device_vector<value_type> T;
396
397 gather(size_t src_size, const std::vector<ptrdiff_t>& I, const params&)
398 : I(I)
399 , T(I.size())
400 {}
401
402 void operator()(const vector& src, vector& dst) const
403 {
404 thrust::gather(I.begin(), I.end(), src.begin(), dst.begin());
405 }
406
407 void operator()(const vector& vec, std::vector<value_type>& vals) const
408 {
409 thrust::gather(I.begin(), I.end(), vec.begin(), T.begin());
410 thrust::copy(T.begin(), T.end(), vals.begin());
411 }
412 };
413
414 struct scatter
415 {
416 thrust::device_vector<ptrdiff_t> I;
417
418 scatter(size_t size, const std::vector<ptrdiff_t>& I, const params&)
419 : I(I)
420 {}
421
422 void operator()(const vector& src, vector& dst) const
423 {
424 thrust::scatter(src.begin(), src.end(), I.begin(), dst.begin());
425 }
426 };
427};
428
429//---------------------------------------------------------------------------
430// Backend interface implementation
431//---------------------------------------------------------------------------
432template <typename V>
433struct bytes_impl<thrust::device_vector<V>>
434{
435 static size_t get(const thrust::device_vector<V>& v)
436 {
437 return v.size() * sizeof(V);
438 }
439};
440
441template <typename Alpha, typename Beta, typename V>
442struct spmv_impl<Alpha, cuda_matrix<V>, thrust::device_vector<V>,
443 Beta, thrust::device_vector<V>>
444{
445 typedef cuda_matrix<V> matrix;
446 typedef thrust::device_vector<V> vector;
447
448 static void apply(Alpha alpha, const matrix& A, const vector& x,
449 Beta beta, vector& y)
450 {
451 A.spmv(alpha, x, beta, y);
452 }
453};
454
455template <typename V>
457 thrust::device_vector<V>,
458 thrust::device_vector<V>,
459 thrust::device_vector<V>>
460{
461 typedef cuda_matrix<V> matrix;
462 typedef thrust::device_vector<V> vector;
463
464 static void apply(const vector& rhs, const matrix& A, const vector& x,
465 vector& r)
466 {
467 thrust::copy(rhs.begin(), rhs.end(), r.begin());
468 A.spmv(-1, x, 1, r);
469 }
470};
471
472template <typename V>
473struct clear_impl<thrust::device_vector<V>>
474{
475 typedef thrust::device_vector<V> vector;
476
477 static void apply(vector& x)
478 {
479 thrust::fill(x.begin(), x.end(), V());
480 }
481};
482
483template <class V, class T>
484struct copy_impl<V, thrust::device_vector<T>>
485{
486 static void apply(const V& x, thrust::device_vector<T>& y)
487 {
488 thrust::copy(x.begin(), x.end(), y.begin());
489 }
490};
491
492template <class T, class V>
493struct copy_impl<thrust::device_vector<T>, V>
494{
495 static void apply(const thrust::device_vector<T>& x, V& y)
496 {
497 thrust::copy(x.begin(), x.end(), y.begin());
498 }
499};
500
501template <class T1, class T2>
502struct copy_impl<thrust::device_vector<T1>, thrust::device_vector<T2>>
503{
504 static void apply(const thrust::device_vector<T1>& x, thrust::device_vector<T2>& y)
505 {
506 thrust::copy(x.begin(), x.end(), y.begin());
507 }
508};
509
510template <typename V>
511struct inner_product_impl<thrust::device_vector<V>,
512 thrust::device_vector<V>>
513{
514 typedef thrust::device_vector<V> vector;
515
516 static V get(const vector& x, const vector& y)
517 {
518 return thrust::inner_product(x.begin(), x.end(), y.begin(), V());
519 }
520};
521
522template <typename A, typename B, typename V>
523struct axpby_impl<A, thrust::device_vector<V>,
524 B, thrust::device_vector<V>>
525{
526 typedef thrust::device_vector<V> vector;
527
528 struct functor
529 {
530 A a;
531 B b;
532 functor(A a, B b)
533 : a(a)
534 , b(b)
535 {}
536
537 template <class Tuple>
538 __host__ __device__ void operator()(Tuple t) const
539 {
540 using thrust::get;
541
542 if (b)
543 get<1>(t) = a * get<0>(t) + b * get<1>(t);
544 else
545 get<1>(t) = a * get<0>(t);
546 }
547 };
548
549 static void apply(A a, const vector& x, B b, vector& y)
550 {
551 thrust::for_each(thrust::make_zip_iterator(
552 thrust::make_tuple(x.begin(), y.begin())),
553 thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
554 functor(a, b));
555 }
556};
557
558template <typename A, typename B, typename C, typename V>
559struct axpbypcz_impl<A, thrust::device_vector<V>,
560 B, thrust::device_vector<V>,
561 C, thrust::device_vector<V>>
562{
563 typedef thrust::device_vector<V> vector;
564
565 struct functor
566 {
567 A a;
568 B b;
569 C c;
570
571 functor(A a, B b, C c)
572 : a(a)
573 , b(b)
574 , c(c)
575 {}
576
577 template <class Tuple>
578 __host__ __device__ void operator()(Tuple t) const
579 {
580 using thrust::get;
581
582 if (c)
583 get<2>(t) = a * get<0>(t) + b * get<1>(t) + c * get<2>(t);
584 else
585 get<2>(t) = a * get<0>(t) + b * get<1>(t);
586 }
587 };
588
589 static void apply(A a, const vector& x,
590 B b, const vector& y,
591 C c, vector& z)
592 {
593 thrust::for_each(
594 thrust::make_zip_iterator(
595 thrust::make_tuple(
596 x.begin(), y.begin(), z.begin())),
597 thrust::make_zip_iterator(
598 thrust::make_tuple(
599 x.end(), y.end(), z.end())),
600 functor(a, b, c));
601 }
602};
603
604template <typename A, typename B, typename V>
605struct vmul_impl<A, thrust::device_vector<V>, thrust::device_vector<V>,
606 B, thrust::device_vector<V>>
607{
608 typedef thrust::device_vector<V> vector;
609
610 struct functor
611 {
612 A a;
613 B b;
614 functor(A a, B b)
615 : a(a)
616 , b(b)
617 {}
618
619 template <class Tuple>
620 __host__ __device__ void operator()(Tuple t) const
621 {
622 using thrust::get;
623
624 if (b)
625 get<2>(t) = a * get<0>(t) * get<1>(t) + b * get<2>(t);
626 else
627 get<2>(t) = a * get<0>(t) * get<1>(t);
628 }
629 };
630
631 static void apply(A a, const vector& x, const vector& y, B b, vector& z)
632 {
633 thrust::for_each(thrust::make_zip_iterator(
634 thrust::make_tuple(
635 x.begin(), y.begin(), z.begin())),
636 thrust::make_zip_iterator(
637 thrust::make_tuple(
638 x.end(), y.end(), z.end())),
639 functor(a, b));
640 }
641};
642
643class cuda_event
644{
645 public:
646
647 cuda_event()
648 : e(create_event(), backend::detail::cuda_deleter())
649 {}
650
651 float operator-(cuda_event tic) const
652 {
653 float delta;
654 cudaEventSynchronize(e.get());
655 cudaEventElapsedTime(&delta, tic.e.get(), e.get());
656 return delta / 1000.0f;
657 }
658
659 private:
660
661 std::shared_ptr<std::remove_pointer<cudaEvent_t>::type> e;
662
663 static cudaEvent_t create_event()
664 {
665 cudaEvent_t e;
666 cudaEventCreate(&e);
667 cudaEventRecord(e, 0);
668 return e;
669 }
670};
671
673{
674 typedef cuda_event value_type;
675
676 static const char* units() { return "s"; }
677
678 cuda_event current() const
679 {
680 return cuda_event();
681 }
682};
683
684} // namespace Arcane::Alina::backend
685
686#endif
CUSPARSE matrix in CSR format.
Matrix class, to be used by user.
Implementation for linear combination of two vectors.
Implementation for linear combination of three vectors.
Implementation for function returning number of bytes allocated for a matrix/vector.
Implementation for zeroing out a vector.
Implementation for vector copy.
cusparseHandle_t cusparse_handle
CUSPARSE handle.
static std::shared_ptr< vector > create_vector(size_t size, const params &)
Create vector of the specified size.
static std::shared_ptr< vector > copy_vector(std::shared_ptr< typename BuiltinBackend< real >::vector > x, const params &prm)
Copy vector from builtin backend.
static std::shared_ptr< direct_solver > create_solver(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Create direct solver for coarse level.
static std::shared_ptr< matrix > copy_matrix(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Copy matrix from builtin backend.
static std::shared_ptr< vector > copy_vector(typename BuiltinBackend< real >::vector const &x, const params &)
Copy vector from builtin backend.
Implementation for inner product.
Implementation for residual error compuatation.
Implementation for matrix-vector product.
Implementation for element-wize vector product.