12#ifndef ARCCORE_ALINA_CUDABACKEND_H
13#define ARCCORE_ALINA_CUDABACKEND_H
29#include "arccore/alina/BuiltinBackend.h"
30#include "arccore/alina/SkylineLUSolver.h"
31#include "arccore/alina/AlinaUtils.h"
33#include <thrust/device_vector.h>
34#include <thrust/fill.h>
35#include <thrust/copy.h>
36#include <thrust/gather.h>
37#include <thrust/scatter.h>
38#include <thrust/for_each.h>
39#include <thrust/inner_product.h>
40#include <cusparse_v2.h>
45namespace Arcane::Alina::solver
52struct cuda_skyline_lu : SkylineLUSolver<T>
54 typedef SkylineLUSolver<T> Base;
56 mutable std::vector<T> _rhs, _x;
58 template <
class Matrix,
class Params>
59 cuda_skyline_lu(
const Matrix& A,
const Params&)
61 , _rhs(backend::nbRow(*A))
62 , _x(backend::nbRow(*A))
65 template <
class Vec1,
class Vec2>
66 void operator()(
const Vec1& rhs, Vec2& x)
const
68 thrust::copy(rhs.begin(), rhs.end(), _rhs.begin());
69 static_cast<const Base*
>(
this)->
operator()(_rhs, _x);
70 thrust::copy(_x.begin(), _x.end(), x.begin());
75 return backend::bytes(*
static_cast<const Base*
>(
this)) +
76 backend::bytes(_rhs) +
83namespace Arcane::Alina::backend::detail
87cuda_check(cusparseStatus_t rc,
const char* file,
int line)
89 if (rc != CUSPARSE_STATUS_SUCCESS) {
90 std::ostringstream msg;
91 msg <<
"CUDA error " << rc <<
" at \"" << file <<
":" << line;
92 precondition(
false, msg.str());
97cuda_check(cudaError_t rc,
const char* file,
int line)
99 if (rc != cudaSuccess) {
100 std::ostringstream msg;
101 msg <<
"CUDA error " << rc <<
" at \"" << file <<
":" << line;
102 precondition(
false, msg.str());
106#define ARCCORE_ALINA_CALL_CUDA(rc) \
107 Arcane::Alina::backend::detail::cuda_check(rc, __FILE__, __LINE__)
111 void operator()(cusparseMatDescr_t handle)
113 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyMatDescr(handle));
116 void operator()(cusparseSpMatDescr_t handle)
118 ARCCORE_ALINA_CALL_CUDA(cusparseDestroySpMat(handle));
121 void operator()(cusparseDnVecDescr_t handle)
123 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyDnVec(handle));
126 void operator()(cudaEvent_t handle)
128 ARCCORE_ALINA_CALL_CUDA(cudaEventDestroy(handle));
131 void operator()(csrilu02Info_t handle)
133 ARCCORE_ALINA_CALL_CUDA(cusparseDestroyCsrilu02Info(handle));
136 void operator()(cusparseSpSVDescr_t handle)
138 ARCCORE_ALINA_CALL_CUDA(cusparseSpSV_destroyDescr(handle));
142template <
typename real>
143cudaDataType cuda_datatype()
145 if (
sizeof(real) ==
sizeof(
float))
151template <
typename real>
152cusparseDnVecDescr_t cuda_vector_description(thrust::device_vector<real>& x)
154 cusparseDnVecDescr_t desc;
155 ARCCORE_ALINA_CALL_CUDA(cusparseCreateDnVec(&desc,
157 thrust::raw_pointer_cast(&x[0]),
158 cuda_datatype<real>()));
162template <
typename real> cusparseDnVecDescr_t
163cuda_vector_description(
const thrust::device_vector<real>&& x)
165 cusparseDnVecDescr_t desc;
166 ARCCORE_ALINA_CALL_CUDA(
167 cusparseCreateDnVec(&desc,
169 thrust::raw_pointer_cast(&x[0]),
170 cuda_datatype<real>()));
174template <
typename real> cusparseSpMatDescr_t
175cuda_matrix_description(
size_t nrows,
178 thrust::device_vector<int>& ptr,
179 thrust::device_vector<int>& col,
180 thrust::device_vector<real>& val)
182 cusparseSpMatDescr_t desc;
183 ARCCORE_ALINA_CALL_CUDA(
184 cusparseCreateCsr(&desc,
188 thrust::raw_pointer_cast(&ptr[0]),
189 thrust::raw_pointer_cast(&col[0]),
190 thrust::raw_pointer_cast(&val[0]),
193 CUSPARSE_INDEX_BASE_ZERO,
194 detail::cuda_datatype<real>()));
202namespace Arcane::Alina::backend
209template <
typename real>
214 typedef real value_type;
216 cuda_matrix(
size_t n,
size_t m,
217 const ptrdiff_t* p_ptr,
218 const ptrdiff_t* p_col,
220 cusparseHandle_t handle)
225 , ptr(p_ptr, p_ptr + n + 1)
226 , col(p_col, p_col + nnz)
227 , val(p_val, p_val + nnz)
229 desc.reset(detail::cuda_matrix_description(nrows, ncols, nnz, ptr, col, val),
233 void spmv(real alpha, thrust::device_vector<real>
const& x,
234 real beta, thrust::device_vector<real>& y)
const
236 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> xdesc(
237 detail::cuda_vector_description(
const_cast<thrust::device_vector<real>&
>(x)),
239 std::shared_ptr<std::remove_pointer<cusparseDnVecDescr_t>::type> ydesc(
240 detail::cuda_vector_description(y),
244 ARCCORE_ALINA_CALL_CUDA(
245 cusparseSpMV_bufferSize(handle,
246 CUSPARSE_OPERATION_NON_TRANSPOSE,
252 detail::cuda_datatype<real>(),
253 CUSPARSE_SPMV_CSR_ALG1,
256 if (buf.size() < buf_size)
257 buf.resize(buf_size);
259 ARCCORE_ALINA_CALL_CUDA(
261 CUSPARSE_OPERATION_NON_TRANSPOSE,
267 detail::cuda_datatype<real>(),
268 CUSPARSE_SPMV_CSR_ALG1,
269 thrust::raw_pointer_cast(&buf[0])));
272 size_t nbRow()
const {
return nrows; }
273 size_t nbColumn()
const {
return ncols; }
274 size_t nbNonZero()
const {
return nnz; }
276 size_t rows()
const {
return nrows; }
277 size_t cols()
const {
return ncols; }
278 size_t nonzeros()
const {
return nnz; }
281 return sizeof(int) * (nrows + 1) +
288 size_t nrows, ncols, nnz;
290 cusparseHandle_t handle;
292 std::shared_ptr<std::remove_pointer<cusparseSpMatDescr_t>::type> desc;
294 thrust::device_vector<int> ptr;
295 thrust::device_vector<int> col;
296 thrust::device_vector<real> val;
298 mutable thrust::device_vector<char> buf;
314template <
typename real,
class DirectSolver = solver::cuda_skyline_lu<real>>
317 static_assert(std::is_same<real, float>::value ||
318 std::is_same<real, double>::value,
319 "Unsupported value type for cuda backend");
321 typedef real value_type;
322 typedef ptrdiff_t col_type;
323 typedef ptrdiff_t ptr_type;
325 typedef thrust::device_vector<real> vector;
326 typedef thrust::device_vector<real> matrix_diagonal;
327 typedef DirectSolver direct_solver;
338 params(cusparseHandle_t handle =
nullptr)
348 void get(
PropertyTree& p,
const std::string& path)
const
354 static std::string name() {
return "cuda"; }
357 static std::shared_ptr<matrix>
360 return std::make_shared<matrix>(backend::nbRow(*A), backend::nbColumn(*A),
365 static std::shared_ptr<vector>
368 return std::make_shared<vector>(x.data(), x.data() + x.size());
372 static std::shared_ptr<vector>
379 static std::shared_ptr<vector>
382 return std::make_shared<vector>(size);
386 static std::shared_ptr<direct_solver>
389 return std::make_shared<direct_solver>(A, prm);
394 thrust::device_vector<ptrdiff_t> I;
395 mutable thrust::device_vector<value_type> T;
397 gather(
size_t src_size,
const std::vector<ptrdiff_t>& I,
const params&)
402 void operator()(
const vector& src, vector& dst)
const
404 thrust::gather(I.begin(), I.end(), src.begin(), dst.begin());
407 void operator()(
const vector& vec, std::vector<value_type>& vals)
const
409 thrust::gather(I.begin(), I.end(), vec.begin(), T.begin());
410 thrust::copy(T.begin(), T.end(), vals.begin());
416 thrust::device_vector<ptrdiff_t> I;
418 scatter(
size_t size,
const std::vector<ptrdiff_t>& I,
const params&)
422 void operator()(
const vector& src, vector& dst)
const
424 thrust::scatter(src.begin(), src.end(), I.begin(), dst.begin());
435 static size_t get(
const thrust::device_vector<V>& v)
437 return v.size() *
sizeof(V);
441template <
typename Alpha,
typename Beta,
typename V>
443 Beta, thrust::device_vector<V>>
446 typedef thrust::device_vector<V> vector;
448 static void apply(Alpha alpha,
const matrix& A,
const vector& x,
449 Beta beta, vector& y)
451 A.spmv(alpha, x, beta, y);
457 thrust::device_vector<V>,
458 thrust::device_vector<V>,
459 thrust::device_vector<V>>
462 typedef thrust::device_vector<V> vector;
464 static void apply(
const vector& rhs,
const matrix& A,
const vector& x,
467 thrust::copy(rhs.begin(), rhs.end(), r.begin());
475 typedef thrust::device_vector<V> vector;
477 static void apply(vector& x)
479 thrust::fill(x.begin(), x.end(), V());
483template <
class V,
class T>
486 static void apply(
const V& x, thrust::device_vector<T>& y)
488 thrust::copy(x.begin(), x.end(), y.begin());
492template <
class T,
class V>
495 static void apply(
const thrust::device_vector<T>& x, V& y)
497 thrust::copy(x.begin(), x.end(), y.begin());
501template <
class T1,
class T2>
502struct copy_impl<thrust::device_vector<T1>, thrust::device_vector<T2>>
504 static void apply(
const thrust::device_vector<T1>& x, thrust::device_vector<T2>& y)
506 thrust::copy(x.begin(), x.end(), y.begin());
512 thrust::device_vector<V>>
514 typedef thrust::device_vector<V> vector;
516 static V get(
const vector& x,
const vector& y)
518 return thrust::inner_product(x.begin(), x.end(), y.begin(), V());
522template <
typename A,
typename B,
typename V>
524 B, thrust::device_vector<V>>
526 typedef thrust::device_vector<V> vector;
537 template <
class Tuple>
538 __host__ __device__
void operator()(Tuple t)
const
543 get<1>(t) = a * get<0>(t) + b * get<1>(t);
545 get<1>(t) = a * get<0>(t);
549 static void apply(A a,
const vector& x, B b, vector& y)
551 thrust::for_each(thrust::make_zip_iterator(
552 thrust::make_tuple(x.begin(), y.begin())),
553 thrust::make_zip_iterator(thrust::make_tuple(x.end(), y.end())),
558template <
typename A,
typename B,
typename C,
typename V>
560 B, thrust::device_vector<V>,
561 C, thrust::device_vector<V>>
563 typedef thrust::device_vector<V> vector;
571 functor(A a, B b, C c)
577 template <
class Tuple>
578 __host__ __device__
void operator()(Tuple t)
const
583 get<2>(t) = a * get<0>(t) + b * get<1>(t) + c * get<2>(t);
585 get<2>(t) = a * get<0>(t) + b * get<1>(t);
589 static void apply(A a,
const vector& x,
590 B b,
const vector& y,
594 thrust::make_zip_iterator(
596 x.begin(), y.begin(), z.begin())),
597 thrust::make_zip_iterator(
599 x.end(), y.end(), z.end())),
604template <
typename A,
typename B,
typename V>
605struct vmul_impl<A, thrust::device_vector<V>, thrust::device_vector<V>,
606 B, thrust::device_vector<V>>
608 typedef thrust::device_vector<V> vector;
619 template <
class Tuple>
620 __host__ __device__
void operator()(Tuple t)
const
625 get<2>(t) = a * get<0>(t) * get<1>(t) + b * get<2>(t);
627 get<2>(t) = a * get<0>(t) * get<1>(t);
631 static void apply(A a,
const vector& x,
const vector& y, B b, vector& z)
633 thrust::for_each(thrust::make_zip_iterator(
635 x.begin(), y.begin(), z.begin())),
636 thrust::make_zip_iterator(
638 x.end(), y.end(), z.end())),
651 float operator-(cuda_event tic)
const
654 cudaEventSynchronize(e.get());
655 cudaEventElapsedTime(&delta, tic.e.get(), e.get());
656 return delta / 1000.0f;
661 std::shared_ptr<std::remove_pointer<cudaEvent_t>::type> e;
663 static cudaEvent_t create_event()
667 cudaEventRecord(e, 0);
676 static const char* units() {
return "s"; }
CUSPARSE matrix in CSR format.
Matrix class, to be used by user.
Implementation for linear combination of two vectors.
Implementation for linear combination of three vectors.
Implementation for function returning number of bytes allocated for a matrix/vector.
Implementation for zeroing out a vector.
Implementation for vector copy.
cusparseHandle_t cusparse_handle
CUSPARSE handle.
static std::shared_ptr< vector > create_vector(size_t size, const params &)
Create vector of the specified size.
static std::shared_ptr< vector > copy_vector(std::shared_ptr< typename BuiltinBackend< real >::vector > x, const params &prm)
Copy vector from builtin backend.
static std::shared_ptr< direct_solver > create_solver(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Create direct solver for coarse level.
static std::shared_ptr< matrix > copy_matrix(std::shared_ptr< typename BuiltinBackend< real >::matrix > A, const params &prm)
Copy matrix from builtin backend.
static std::shared_ptr< vector > copy_vector(typename BuiltinBackend< real >::vector const &x, const params &)
Copy vector from builtin backend.
Implementation for inner product.
Implementation for residual error compuatation.
Implementation for matrix-vector product.
Implementation for element-wize vector product.