12#ifndef ARCCORE_ALINA_CUDABACKEND_H
13#define ARCCORE_ALINA_CUDABACKEND_H
29#include "arccore/alina/BuiltinBackend.h"
30#include "arccore/alina/SkylineLUSolver.h"
31#include "arccore/alina/AlinaUtils.h"
33#include <thrust/device_vector.h>
34#include <thrust/fill.h>
35#include <thrust/copy.h>
36#include <thrust/gather.h>
37#include <thrust/scatter.h>
38#include <thrust/for_each.h>
39#include <thrust/inner_product.h>
40#include <cusparse_v2.h>
45namespace Arcane::Alina::solver
52struct cuda_skyline_lu : SkylineLUSolver<T>
54 typedef SkylineLUSolver<T> Base;
56 mutable std::vector<T> _rhs, _x;
58 template <
class Matrix,
class Params>
59 cuda_skyline_lu(
const Matrix& A,
const Params&)
61 , _rhs(backend::nbRow(*A))
62 , _x(backend::nbRow(*A))
65 template <
class Vec1,
class Vec2>
66 void operator()(
const Vec1& rhs, Vec2& x)
const
68 thrust::copy(rhs.begin(), rhs.end(), _rhs.begin());
69 static_cast<const Base*
>(
this)->
operator()(_rhs, _x);
70 thrust::copy(_x.begin(), _x.end(), x.begin());
75 return backend::bytes(*
static_cast<const Base*
>(
this)) +
76 backend::bytes(_rhs) +
83namespace Arcane::Alina::backend::detail
87cuda_check(cusparseStatus_t rc,
const char* file,
int line)
89 if (rc != CUSPARSE_STATUS_SUCCESS) {
90 std::ostringstream msg;
91 msg <<
"CUDA error " << rc <<
" at \"" << file <<
":" << line;
92 precondition(
false, msg.str());
97cuda_check(cudaError_t rc,
const char* file,
int line)
99 if (rc != cudaSuccess) {
100 std::ostringstream msg;
101 msg <<
"CUDA error " << rc <<
" at \"" << file <<
":" << line;
102 precondition(
false, msg.str());
106#define ARCCORE_ALINA_CALL_CUDA(rc) \
107 Arcane::Alina::backend::detail::cuda_check(rc, __FILE__, __LINE__)
111 void operator()(cusparseMatDescr_t handle)
113 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyMatDescr(handle));
116 void operator()(cusparseSpMatDescr_t handle)
118 ARCCORE_ALINA_CALL_CUDA(cusparseDestroySpMat(handle));
121 void operator()(cusparseDnVecDescr_t handle)
123 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyDnVec(handle));
126 void operator()(cudaEvent_t handle)
128 ARCCORE_ALINA_CALL_CUDA(cudaEventDestroy(handle));
131 void operator()(csrilu02Info_t handle)
133 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyCsrilu02Info(handle));
136 void operator()(cusparseSpSVDescr_t handle)
138 ARCCORE_ALINA_CALL_CUDA(cusparseSpSV_destroyDescr(handle));
142template <
typename real>
143cudaDataType cuda_datatype()
145 if (
sizeof(real) ==
sizeof(
float))
151template <
typename real>
152cusparseDnVecDescr_t cuda_vector_description(thrust::device_vector<real>& x)
154 cusparseDnVecDescr_t desc;
155 ARCCORE_ALINA_CALL_CUDA(cusparseCreateDnVec(&desc,
157 thrust::raw_pointer_cast(&x[0]),
158 cuda_datatype<real>()));
162template <
typename real> cusparseDnVecDescr_t
163cuda_vector_description(
const thrust::device_vector<real>&& x)
165 cusparseDnVecDescr_t desc;
166 ARCCORE_ALINA_CALL_CUDA(
167 cusparseCreateDnVec(&desc,
169 thrust::raw_pointer_cast(&x[0]),
170 cuda_datatype<real>()));
174template <
typename real> cusparseSpMatDescr_t
175cuda_matrix_description(
size_t nrows,
178 thrust::device_vector<int>& ptr,
179 thrust::device_vector<int>& col,
180 thrust::device_vector<real>& val)
182 cusparseSpMatDescr_t desc;
183 ARCCORE_ALINA_CALL_CUDA(
184 cusparseCreateCsr(&desc,
188 thrust::raw_pointer_cast(&ptr[0]),
189 thrust::raw_pointer_cast(&col[0]),
190 thrust::raw_pointer_cast(&val[0]),
193 CUSPARSE_INDEX_BASE_ZERO,
194 detail::cuda_datatype<real>()));
202namespace Arcane::Alina::backend
209template <
typename real>
214 typedef real value_type;
216 cuda_matrix(
size_t n,
size_t m,
217 const ptrdiff_t* p_ptr,
218 const ptrdiff_t* p_col,
220 cusparseHandle_t handle)
225 , ptr(p_ptr, p_ptr + n + 1)
226 , col(p_col, p_col + nnz)
227 , val(p_val, p_val + nnz)
229 desc.reset(detail::cuda_matrix_description(nrows, ncols, nnz, ptr, col, val),
233 void spmv(real alpha, thrust::device_vector<real>
const& x,
234 real beta, thrust::device_vector<real>& y)
const
236 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> xdesc(
237 detail::cuda_vector_description(
const_cast<thrust::device_vector<real>&
>(x)),
239 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> ydesc(
240 detail::cuda_vector_description(y),
244 ARCCORE_ALINA_CALL_CUDA(
245 cusparseSpMV_bufferSize(handle,
246 CUSPARSE_OPERATION_NON_TRANSPOSE,
252 detail::cuda_datatype<real>(),
253 CUSPARSE_SPMV_CSR_ALG1,
256 if (buf.size() < buf_size)
257 buf.resize(buf_size);
259 ARCCORE_ALINA_CALL_CUDA(
261 CUSPARSE_OPERATION_NON_TRANSPOSE,
267 detail::cuda_datatype<real>(),
268 CUSPARSE_SPMV_CSR_ALG1,
269 thrust::raw_pointer_cast(&buf[0])));
272 size_t rows()
const {
return nrows; }
273 size_t cols()
const {
return ncols; }
274 size_t nonzeros()
const {
return nnz; }
277 return sizeof(int) * (nrows + 1) +
284 size_t nrows, ncols, nnz;
286 cusparseHandle_t handle;
288 std::shared_ptr<std::remove_pointer<cusparseSpMatDescr_t>::type> desc;
290 thrust::device_vector<int> ptr;
291 thrust::device_vector<int> col;
292 thrust::device_vector<real> val;
294 mutable thrust::device_vector<char> buf;
310template <
typename real,
class DirectSolver = solver::cuda_skyline_lu<real>>
313 static_assert(std::is_same<real, float>::value ||
314 std::is_same<real, double>::value,
315 "Unsupported value type for cuda backend");
317 typedef real value_type;
318 typedef ptrdiff_t col_type;
319 typedef ptrdiff_t ptr_type;
321 typedef thrust::device_vector<real> vector;
322 typedef thrust::device_vector<real> matrix_diagonal;
323 typedef DirectSolver direct_solver;
334 params(cusparseHandle_t handle =
nullptr)
344 void get(
PropertyTree& p,
const std::string& path)
const
350 static std::string name() {
return "cuda"; }
353 static std::shared_ptr<matrix>
356 return std::make_shared<matrix>(backend::nbRow(*A), backend::nbColumn(*A),
361 static std::shared_ptr<vector>
364 return std::make_shared<vector>(x.data(), x.data() + x.size());
368 static std::shared_ptr<vector>
375 static std::shared_ptr<vector>
378 return std::make_shared<vector>(size);
382 static std::shared_ptr<direct_solver>
385 return std::make_shared<direct_solver>(A, prm);
390 thrust::device_vector<ptrdiff_t> I;
391 mutable thrust::device_vector<value_type> T;
393 gather(
size_t src_size,
const std::vector<ptrdiff_t>& I,
const params&)
398 void operator()(
const vector& src, vector& dst)
const
400 thrust::gather(I.begin(), I.end(), src.begin(), dst.begin());
403 void operator()(
const vector& vec, std::vector<value_type>& vals)
const
405 thrust::gather(I.begin(), I.end(), vec.begin(), T.begin());
406 thrust::copy(T.begin(), T.end(), vals.begin());
412 thrust::device_vector<ptrdiff_t> I;
414 scatter(
size_t size,
const std::vector<ptrdiff_t>& I,
const params&)
418 void operator()(
const vector& src, vector& dst)
const
420 thrust::scatter(src.begin(), src.end(), I.begin(), dst.begin());
431 static size_t get(
const thrust::device_vector<V>& v)
433 return v.size() *
sizeof(V);
437template <
typename Alpha,
typename Beta,
typename V>
439 Beta, thrust::device_vector<V>>
442 typedef thrust::device_vector<V> vector;
444 static void apply(Alpha alpha,
const matrix& A,
const vector& x,
445 Beta beta, vector& y)
447 A.spmv(alpha, x, beta, y);
453 thrust::device_vector<V>,
454 thrust::device_vector<V>,
455 thrust::device_vector<V>>
458 typedef thrust::device_vector<V> vector;
460 static void apply(
const vector& rhs,
const matrix& A,
const vector& x,
463 thrust::copy(rhs.begin(), rhs.end(), r.begin());
471 typedef thrust::device_vector<V> vector;
473 static void apply(vector& x)
475 thrust::fill(x.begin(), x.end(), V());
479template <
class V,
class T>
482 static void apply(
const V& x, thrust::device_vector<T>& y)
484 thrust::copy(x.begin(), x.end(), y.begin());
488template <
class T,
class V>
491 static void apply(
const thrust::device_vector<T>& x, V& y)
493 thrust::copy(x.begin(), x.end(), y.begin());
497template <
class T1,
class T2>
498struct copy_impl<thrust::device_vector<T1>, thrust::device_vector<T2>>
500 static void apply(
const thrust::device_vector<T1>& x, thrust::device_vector<T2>& y)
502 thrust::copy(x.begin(), x.end(), y.begin());
508 thrust::device_vector<V>>
510 typedef thrust::device_vector<V> vector;
512 static V get(
const vector& x,
const vector& y)
514 return thrust::inner_product(x.begin(), x.end(), y.begin(), V());
518template <
typename A,
typename B,
typename V>
520 B, thrust::device_vector<V>>
522 typedef thrust::device_vector<V> vector;
533 template <
class Tuple>
534 __host__ __device__
void operator()(Tuple t)
const
539 get<1>(t) = a * get<0>(t) + b * get<1>(t);
541 get<1>(t) = a * get<0>(t);
545 static void apply(A a,
const vector& x, B b, vector& y)
547 thrust::for_each(thrust::make_zip_iterator(
548 thrust::make_tuple(x.begin(), y.begin())),
549 thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
554template <
typename A,
typename B,
typename C,
typename V>
556 B, thrust::device_vector<V>,
557 C, thrust::device_vector<V>>
559 typedef thrust::device_vector<V> vector;
567 functor(A a, B b, C c)
573 template <
class Tuple>
574 __host__ __device__
void operator()(Tuple t)
const
579 get<2>(t) = a * get<0>(t) + b * get<1>(t) + c * get<2>(t);
581 get<2>(t) = a * get<0>(t) + b * get<1>(t);
585 static void apply(A a,
const vector& x,
586 B b,
const vector& y,
590 thrust::make_zip_iterator(
592 x.begin(), y.begin(), z.begin())),
593 thrust::make_zip_iterator(
595 x.end(), y.end(), z.end())),
600template <
typename A,
typename B,
typename V>
601struct vmul_impl<A, thrust::device_vector<V>, thrust::device_vector<V>,
602 B, thrust::device_vector<V>>
604 typedef thrust::device_vector<V> vector;
615 template <
class Tuple>
616 __host__ __device__
void operator()(Tuple t)
const
621 get<2>(t) = a * get<0>(t) * get<1>(t) + b * get<2>(t);
623 get<2>(t) = a * get<0>(t) * get<1>(t);
627 static void apply(A a,
const vector& x,
const vector& y, B b, vector& z)
629 thrust::for_each(thrust::make_zip_iterator(
631 x.begin(), y.begin(), z.begin())),
632 thrust::make_zip_iterator(
634 x.end(), y.end(), z.end())),
647 float operator-(cuda_event tic)
const
650 cudaEventSynchronize(e.get());
651 cudaEventElapsedTime(&delta, tic.e.get(), e.get());
652 return delta / 1000.0f;
657 std::shared_ptr<std::remove_pointer<cudaEvent_t>::type> e;
659 static cudaEvent_t create_event()
663 cudaEventRecord(e, 0);
672 static const char* units() {
return "s"; }
CUSPARSE matrix in CSR format.
Matrix class, to be used by user.
Implementation for linear combination of two vectors.
Implementation for linear combination of three vectors.
Implementation for function returning number of bytes allocated for a matrix/vector.
Implementation for zeroing out a vector.
Implementation for vector copy.
cusparseHandle_t cusparse_handle
CUSPARSE handle.
static std::shared_ptr< vector > create_vector(size_t size, const params &)
Create vector of the specified size.
static std::shared_ptr< vector > copy_vector(std::shared_ptr< typename BuiltinBackend< real >::vector > x, const params &prm)
Copy vector from builtin backend.
static std::shared_ptr< direct_solver > create_solver(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Create direct solver for coarse level.
static std::shared_ptr< matrix > copy_matrix(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Copy matrix from builtin backend.
static std::shared_ptr< vector > copy_vector(typename BuiltinBackend< real >::vector const &x, const params &)
Copy vector from builtin backend.
Implementation for inner product.
Implementation for residual error compuatation.
Implementation for matrix-vector product.
Implementation for element-wize vector product.