Arcane  4.1.12.0
Developer documentation
Loading...
Searching...
No Matches
AlephCudaMatrixCrs.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*
8 * CNC: Concurrent Number Cruncher
9 * Copyright (C) 2008 GOCAD/ASGA, INRIA/ALICE
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * If you modify this software, you should include a notice giving the
26 * name of the person performing the modification, the date of modification,
27 * and the reason for such modification.
28 *
29 * Contact: Luc Buatois
30 *
31 * buatois@gocad.org
32 *
33 * ASGA-INPL Bt. G
34 * Rue du Doyen Marcel Roubault - BP 40
35 * 54501 VANDOEUVRE LES NANCY
36 * FRANCE
37 *
38 * Note that the GNU General Public License does not permit incorporating
39 * the Software into proprietary programs.
40 */
41
42#ifndef CNC_SPARSE_MATRIX_CRS_H
43#define CNC_SPARSE_MATRIX_CRS_H
44
45namespace Arcane
46{
47//---------------------------------------------------------------------------//
48// External CUDA Kernel Functions, forward declarations //
49//---------------------------------------------------------------------------//
50
51extern "C" void cnc_cuda_mat1x1vecmult(double* matrix, unsigned int size_matrix,
52 uint2* rowptr, unsigned int size_rowptr,
53 unsigned int* colind, unsigned int size_colind,
54 double* x, double* b, unsigned int size_vec);
55
56// -------------------------------------------------------------------------- //
57// CRS Sparse matrix data structure with CPU and GPU storage //
58// -------------------------------------------------------------------------- //
59
63class CNC_MatrixPatternCRS
64{
65 public:
66
67 typedef CNC_Vector<long> IndexArray;
68 enum
69 {
70 NO_INDEX = ~0
71 };
72 CNC_MatrixPatternCRS()
73 {
74 N = 0;
75 symmetric_storage = false;
76 }
77 void clear()
78 {
79 N = 0;
80 symmetric_storage = false;
81 colind.clear();
82 rowptr.clear();
83 }
85 long m() const { return rowptr.size() - 1; }
87 long n() const { return N; }
89 long nnz() const { return colind.size(); }
91 long row_nnz(long i) const
92 {
93 return rowptr[i + 1] - rowptr[i];
94 }
95 long mem_usage() const
96 {
97 return colind.mem_usage() +
98 rowptr.mem_usage() +
99 sizeof(long) +
100 sizeof(bool);
101 }
102
103 public:
104
105 IndexArray colind;
106 IndexArray rowptr;
107 long N;
108 bool symmetric_storage;
109};
110
111//---------------------------------------------------------------------------//
112
113template <class T> class CNC_MatrixCRS : public CNC_MatrixPatternCRS
114{
115 public:
116
117 typedef CNC_Vector<T> CoeffArray;
118 inline CNC_MatrixCRS()
119 : separate_diag(true)
120 {
121 // printf("\n[CNC_MatrixCRS] NEW") ;
122 }
123 inline void mult(const T* x, T* y) const
124 {
125 // printf("\n[CNC_MatrixCRS] mult") ;
126 if (separate_diag) {
127 if (symmetric_storage) {
128 // Symmetric storage and separate diagonal
129 for (long i = 0; i < diag.size(); i++) {
130 y[i] = x[i] * diag[i];
131 }
132 for (long i = diag.size(); i < m(); i++) {
133 y[i] = 0.0;
134 }
135 for (long i = 0; i < m(); i++) {
136 for (long j = rowptr[i]; j < rowptr[i + 1]; j++) {
137 y[i] += a[j] * x[colind[j]];
138 y[colind[j]] += a[j] * x[i];
139 }
140 }
141 }
142 else {
143 // No symmetric storage and separate diagonal
144 for (long i = 0; i < m(); i++) {
145 T sum = 0.0;
146 for (long j = rowptr[i]; j < rowptr[i + 1]; j++) {
147 sum += a[j] * x[colind[j]];
148 }
149 y[i] = sum;
150 }
151 for (long i = 0; i < diag.size(); i++) {
152 y[i] += x[i] * diag[i];
153 }
154 }
155 }
156 else {
157 if (symmetric_storage) {
158 // Symmetric storage, no separate diagonal
159 memset(y, 0, sizeof(T));
160 for (long i = 0; i < m(); i++) {
161 for (long j = rowptr[i]; j < rowptr[i + 1]; j++) {
162 y[i] += a[j] * x[colind[j]];
163 if (colind[j] != i) {
164 y[colind[j]] += a[j] * x[i];
165 }
166 }
167 }
168 }
169 else {
170 // No symmetric storage, no separate diagonal
171 for (long i = 0; i < m(); i++) {
172 T sum = 0.0;
173 for (long j = rowptr[i]; j < rowptr[i + 1]; j++) {
174 sum += a[j] * x[colind[j]];
175 }
176 y[i] = sum;
177 }
178 }
179 }
180 }
181
182 inline void mult(const CNC_Vector<T>& x, CNC_Vector<T>& y) const
183 {
184 // printf("\n[CNC_MatrixCRS] mult") ;
185 mult(x.data(), y.data());
186 }
187 long mem_usage() const
188 {
189 // printf("\n[CNC_MatrixCRS] mem_usage") ;
190 return CNC_MatrixPatternCRS::mem_usage() +
191 a.mem_usage() +
192 diag.mem_usage() +
193 sizeof(bool);
194 }
195
196 inline void gpu_allocate()
197 {
198 // printf("\n[CNC_MatrixCRS] gpu_ALLOCATION") ;
199 // allocate and upload non-zero matrix coefficients
200 cublasAlloc(a.size() + 16, sizeof(double), &(gpu_mat_));
201 // cublasSetVector(a.size(), sizeof(double), a.data(), 1, gpu_mat_, 1) ;
202 // allocate and upload rowptr and colind
203 cublasAlloc(colind.size() + 16, 4, &(gpu_ci_));
204 cublasAlloc(rowptr.size() - 1 + 16, sizeof(uint2), &(gpu_redundant_rp_));
205 unsigned int* ci = CNCallocate_and_copy<long, unsigned int>(colind.data(), colind.size());
206 uint2* cpu_redundant_rp = CNCallocate<uint2>(rowptr.size() - 1);
207 {
208 for (unsigned int i = 0; i < rowptr.size() - 1; i++) {
209 cpu_redundant_rp[i].x = rowptr.data()[i];
210 cpu_redundant_rp[i].y = rowptr.data()[i + 1];
211 }
212 }
213 cublasSetVector(rowptr.size() - 1, sizeof(uint2), cpu_redundant_rp, 1, gpu_redundant_rp_, 1);
214 cublasSetVector(colind.size(), sizeof(unsigned int), ci, 1, gpu_ci_, 1);
215 CNCdeallocate<uint2>(cpu_redundant_rp);
216 CNCdeallocate<unsigned int>(ci);
217 }
218
219 inline void gpu_upload()
220 {
221 // printf("\n[CNC_MatrixCRS] gpu_UPLOAD") ;
222 // allocate and upload non-zero matrix coefficients
223 // cublasAlloc(a.size()+16, sizeof(double), &(gpu_mat_) ) ;
224 cublasSetVector(a.size(), sizeof(double), a.data(), 1, gpu_mat_, 1);
225 // allocate and upload rowptr and colind
226 // cublasAlloc(colind.size()+16, 4, &(gpu_ci_) ) ;
227 // cublasAlloc(rowptr.size()-1+16, sizeof(uint2), &(gpu_redundant_rp_)) ;
228 /* unsigned int * ci = CNCallocate_and_copy<long, unsigned int> ( colind.data(), colind.size() ) ;
229 uint2 * cpu_redundant_rp = CNCallocate<uint2> ( rowptr.size()-1 ) ;
230 {
231 for (unsigned int i=0; i<rowptr.size()-1; i++) {
232 cpu_redundant_rp[i].x = rowptr.data()[i ] ;
233 cpu_redundant_rp[i].y = rowptr.data()[i+1] ;
234 }
235 }
236 cublasSetVector(rowptr.size()-1, sizeof(uint2),cpu_redundant_rp, 1,gpu_redundant_rp_, 1 ) ;
237 cublasSetVector(colind.size(), sizeof(unsigned int),ci, 1, gpu_ci_, 1 ) ;
238 CNCdeallocate<uint2> ( cpu_redundant_rp ) ;
239 CNCdeallocate<unsigned int> ( ci ) ;*/
240 }
241
242 inline void gpu_deallocate()
243 {
244 // printf("\n[CNC_MatrixCRS] gpu_DEALLOCATION") ;
245 cublasFree(gpu_mat_);
246 cublasFree(gpu_ci_);
247 cublasFree(gpu_redundant_rp_);
248 }
249 inline void gpu_mult(const void* x, void* y, unsigned int vec_size)
250 {
251 // printf("\n[CNC_MatrixCRS] gpu_mult") ;
252 cnc_cuda_mat1x1vecmult((double*)gpu_mat_, a.size(),
253 (uint2*)gpu_redundant_rp_, rowptr.size(),
254 (unsigned int*)gpu_ci_, colind.size(),
255 (double*)x, (double*)y, vec_size);
256 }
257 inline void print(void)
258 {
259 printf("\n[CNC_MatrixCRS] print\n");
260 for (long i = 0; i < m(); i++) {
261 for (long jj = rowptr[i]; jj < rowptr[i + 1]; jj++) {
262 if (a[jj] != 0.0) {
263 long j = colind[jj];
264 printf("\t%d %d %f\n", i, j, a[jj]);
265 if (symmetric_storage && j != i) {
266 printf("\t%d %d %f\n", j, i, a[jj]);
267 }
268 }
269 }
270 }
271 if (separate_diag) {
272 for (long i = 0; i < diag.size(); i++) {
273 if (diag[i] != 0.0) {
274 printf("\t%d %d %f\n", i, i, diag[i]);
275 }
276 }
277 }
278 }
279
280 public:
281
282 CoeffArray a;
283 CoeffArray diag;
284 bool separate_diag;
285 // GPU data
286 void* gpu_redundant_rp_;
287 void* gpu_ci_;
288 void* gpu_mat_;
289};
290} // namespace Arcane
291
292#endif
const T * data() const
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --