Arcane  v3.14.10.0
Documentation développeur
Chargement...
Recherche...
Aucune correspondance
AlephCudaMatrixCrs.h
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2022 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*
8 * CNC: Concurrent Number Cruncher
9 * Copyright (C) 2008 GOCAD/ASGA, INRIA/ALICE
10 *
11 * This program is free software; you can redistribute it and/or modify
12 * it under the terms of the GNU General Public License as published by
13 * the Free Software Foundation; either version 2 of the License, or
14 * (at your option) any later version.
15 *
16 * This program is distributed in the hope that it will be useful,
17 * but WITHOUT ANY WARRANTY; without even the implied warranty of
18 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19 * GNU General Public License for more details.
20 *
21 * You should have received a copy of the GNU General Public License
22 * along with this program; if not, write to the Free Software
23 * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24 *
25 * If you modify this software, you should include a notice giving the
26 * name of the person performing the modification, the date of modification,
27 * and the reason for such modification.
28 *
29 * Contact: Luc Buatois
30 *
31 * buatois@gocad.org
32 *
33 * ASGA-INPL Bt. G
34 * Rue du Doyen Marcel Roubault - BP 40
35 * 54501 VANDOEUVRE LES NANCY
36 * FRANCE
37 *
38 * Note that the GNU General Public License does not permit incorporating
39 * the Software into proprietary programs.
40 */
41
42#ifndef CNC_SPARSE_MATRIX_CRS_H
43#define CNC_SPARSE_MATRIX_CRS_H
44
45ARCANE_BEGIN_NAMESPACE
46
47//---------------------------------------------------------------------------//
48// External CUDA Kernel Functions, forward declarations //
49//---------------------------------------------------------------------------//
50
51extern "C" void cnc_cuda_mat1x1vecmult (double * matrix, unsigned int size_matrix,
52 uint2 * rowptr, unsigned int size_rowptr,
53 unsigned int * colind, unsigned int size_colind,
54 double * x, double * b, unsigned int size_vec ) ;
55
56
57// -------------------------------------------------------------------------- //
58// CRS Sparse matrix data structure with CPU and GPU storage //
59// -------------------------------------------------------------------------- //
60
65public:
67 enum { NO_INDEX = ~0 } ;
69 N = 0 ;
70 symmetric_storage = false ;
71 }
72 void clear() {
73 N = 0 ;
74 symmetric_storage = false ;
75 colind.clear() ;
76 rowptr.clear() ;
77 }
79 long m() const { return rowptr.size() - 1 ; }
81 long n() const { return N ; }
83 long nnz() const { return colind.size() ; }
85 long row_nnz(long i) const {
86 return rowptr[i+1] - rowptr[i] ;
87 }
88 long mem_usage() const {
89 return colind.mem_usage() +
90 rowptr.mem_usage() +
91 sizeof(long) +
92 sizeof(bool) ;
93 }
94public:
95 IndexArray colind ;
96 IndexArray rowptr ;
97 long N ;
98 bool symmetric_storage ;
99} ;
100
101
102//---------------------------------------------------------------------------//
103
104template<class T> class CNC_MatrixCRS : public CNC_MatrixPatternCRS {
105public:
106
107 typedef CNC_Vector<T> CoeffArray ;
108 inline CNC_MatrixCRS() : separate_diag(true) {
109// printf("\n[CNC_MatrixCRS] NEW") ;
110 }
111 inline void mult(const T* x, T* y) const{
112// printf("\n[CNC_MatrixCRS] mult") ;
113 if(separate_diag) {
114 if(symmetric_storage) {
115 // Symmetric storage and separate diagonal
116 for(long i=0; i<diag.size(); i++) {
117 y[i] = x[i] * diag[i] ;
118 }
119 for(long i=diag.size(); i<m(); i++) {
120 y[i] = 0.0 ;
121 }
122 for(long i=0; i<m(); i++) {
123 for(long j=rowptr[i] ; j<rowptr[i+1]; j++) {
124 y[i] += a[j] * x[colind[j]] ;
125 y[colind[j]] += a[j] * x[i] ;
126 }
127 }
128 } else {
129 // No symmetric storage and separate diagonal
130 for(long i=0; i<m(); i++) {
131 T sum = 0.0 ;
132 for(long j=rowptr[i] ; j<rowptr[i+1]; j++) {
133 sum += a[j] * x[colind[j]] ;
134 }
135 y[i] = sum ;
136 }
137 for(long i=0; i<diag.size(); i++) {
138 y[i] += x[i] * diag[i] ;
139 }
140 }
141 } else {
142 if(symmetric_storage) {
143 // Symmetric storage, no separate diagonal
144 memset ( y, 0, sizeof(T) ) ;
145 for(long i=0; i<m(); i++) {
146 for(long j=rowptr[i] ; j<rowptr[i+1]; j++) {
147 y[i] += a[j] * x[colind[j]] ;
148 if(colind[j] != i) {
149 y[colind[j]] += a[j] * x[i] ;
150 }
151 }
152 }
153 } else {
154 // No symmetric storage, no separate diagonal
155 for(long i=0; i<m(); i++) {
156 T sum = 0.0 ;
157 for(long j=rowptr[i] ; j<rowptr[i+1]; j++) {
158 sum += a[j] * x[colind[j]] ;
159 }
160 y[i] = sum ;
161 }
162 }
163 }
164 }
165
166 inline void mult(const CNC_Vector<T>& x, CNC_Vector<T>& y) const {
167// printf("\n[CNC_MatrixCRS] mult") ;
168 mult(x.data(), y.data()) ;
169 }
170 long mem_usage() const {
171// printf("\n[CNC_MatrixCRS] mem_usage") ;
172 return CNC_MatrixPatternCRS::mem_usage() +
173 a.mem_usage() +
174 diag.mem_usage() +
175 sizeof(bool) ;
176 }
177
178 inline void gpu_allocate(){
179// printf("\n[CNC_MatrixCRS] gpu_ALLOCATION") ;
180 // allocate and upload non-zero matrix coefficients
181 cublasAlloc(a.size()+16, sizeof(double), &(gpu_mat_) ) ;
182// cublasSetVector(a.size(), sizeof(double), a.data(), 1, gpu_mat_, 1) ;
183 // allocate and upload rowptr and colind
184 cublasAlloc(colind.size()+16, 4, &(gpu_ci_) ) ;
185 cublasAlloc(rowptr.size()-1+16, sizeof(uint2), &(gpu_redundant_rp_)) ;
186 unsigned int * ci = CNCallocate_and_copy<long, unsigned int> ( colind.data(), colind.size() ) ;
187 uint2 * cpu_redundant_rp = CNCallocate<uint2> ( rowptr.size()-1 ) ;
188 {
189 for (unsigned int i=0; i<rowptr.size()-1; i++) {
190 cpu_redundant_rp[i].x = rowptr.data()[i ] ;
191 cpu_redundant_rp[i].y = rowptr.data()[i+1] ;
192 }
193 }
194 cublasSetVector(rowptr.size()-1, sizeof(uint2),cpu_redundant_rp, 1,gpu_redundant_rp_, 1 ) ;
195 cublasSetVector(colind.size(), sizeof(unsigned int),ci, 1, gpu_ci_, 1 ) ;
198 }
199
200 inline void gpu_upload(){
201// printf("\n[CNC_MatrixCRS] gpu_UPLOAD") ;
202 // allocate and upload non-zero matrix coefficients
203// cublasAlloc(a.size()+16, sizeof(double), &(gpu_mat_) ) ;
204 cublasSetVector(a.size(), sizeof(double), a.data(), 1, gpu_mat_, 1) ;
205 // allocate and upload rowptr and colind
206// cublasAlloc(colind.size()+16, 4, &(gpu_ci_) ) ;
207// cublasAlloc(rowptr.size()-1+16, sizeof(uint2), &(gpu_redundant_rp_)) ;
208/* unsigned int * ci = CNCallocate_and_copy<long, unsigned int> ( colind.data(), colind.size() ) ;
209 uint2 * cpu_redundant_rp = CNCallocate<uint2> ( rowptr.size()-1 ) ;
210 {
211 for (unsigned int i=0; i<rowptr.size()-1; i++) {
212 cpu_redundant_rp[i].x = rowptr.data()[i ] ;
213 cpu_redundant_rp[i].y = rowptr.data()[i+1] ;
214 }
215 }
216 cublasSetVector(rowptr.size()-1, sizeof(uint2),cpu_redundant_rp, 1,gpu_redundant_rp_, 1 ) ;
217 cublasSetVector(colind.size(), sizeof(unsigned int),ci, 1, gpu_ci_, 1 ) ;
218 CNCdeallocate<uint2> ( cpu_redundant_rp ) ;
219 CNCdeallocate<unsigned int> ( ci ) ;*/
220 }
221
222
223 inline void gpu_deallocate () {
224// printf("\n[CNC_MatrixCRS] gpu_DEALLOCATION") ;
225 cublasFree(gpu_mat_) ;
226 cublasFree(gpu_ci_);
227 cublasFree(gpu_redundant_rp_);
228 }
229 inline void gpu_mult(const void *x, void *y, unsigned int vec_size){
230// printf("\n[CNC_MatrixCRS] gpu_mult") ;
231 cnc_cuda_mat1x1vecmult((double*)gpu_mat_, a.size(),
232 (uint2*)gpu_redundant_rp_, rowptr.size(),
233 (unsigned int*)gpu_ci_, colind.size(),
234 (double*)x, (double*)y, vec_size ) ;
235
236 }
237 inline void print(void) {
238 printf("\n[CNC_MatrixCRS] print\n") ;
239 for(long i=0; i<m(); i++) {
240 for(long jj=rowptr[i]; jj<rowptr[i+1]; jj++) {
241 if(a[jj] != 0.0) {
242 long j = colind[jj] ;
243 printf ( "\t%d %d %f\n", i, j, a[jj] ) ;
244 if(symmetric_storage && j != i) {
245 printf ( "\t%d %d %f\n", j, i, a[jj] ) ;
246 }
247 }
248 }
249 }
250 if(separate_diag) {
251 for(long i=0; i<diag.size(); i++) {
252 if(diag[i] != 0.0) {
253 printf ( "\t%d %d %f\n", i, i, diag[i] ) ;
254 }
255 }
256 }
257 }
258public:
259 CoeffArray a ;
260 CoeffArray diag ;
261 bool separate_diag ;
262 // GPU data
263 void * gpu_redundant_rp_ ;
264 void * gpu_ci_ ;
265 void * gpu_mat_ ;
266} ;
267
268
269
270ARCANE_END_NAMESPACE
271
272#endif
Lecteur des fichiers de maillage via la bibliothèque LIMA.
Definition Lima.cc:120