113template <
class T>
class CNC_MatrixCRS :
public CNC_MatrixPatternCRS
118 inline CNC_MatrixCRS()
119 : separate_diag(
true)
123 inline void mult(
const T* x, T* y)
const
127 if (symmetric_storage) {
129 for (
long i = 0; i < diag.size(); i++) {
130 y[i] = x[i] * diag[i];
132 for (
long i = diag.size(); i <
m(); i++) {
135 for (
long i = 0; i <
m(); i++) {
136 for (
long j = rowptr[i]; j < rowptr[i + 1]; j++) {
137 y[i] += a[j] * x[colind[j]];
138 y[colind[j]] += a[j] * x[i];
144 for (
long i = 0; i <
m(); i++) {
146 for (
long j = rowptr[i]; j < rowptr[i + 1]; j++) {
147 sum += a[j] * x[colind[j]];
151 for (
long i = 0; i < diag.size(); i++) {
152 y[i] += x[i] * diag[i];
157 if (symmetric_storage) {
159 memset(y, 0,
sizeof(T));
160 for (
long i = 0; i <
m(); i++) {
161 for (
long j = rowptr[i]; j < rowptr[i + 1]; j++) {
162 y[i] += a[j] * x[colind[j]];
163 if (colind[j] != i) {
164 y[colind[j]] += a[j] * x[i];
171 for (
long i = 0; i <
m(); i++) {
173 for (
long j = rowptr[i]; j < rowptr[i + 1]; j++) {
174 sum += a[j] * x[colind[j]];
185 mult(x.data(), y.
data());
187 long mem_usage()
const
190 return CNC_MatrixPatternCRS::mem_usage() +
196 inline void gpu_allocate()
200 cublasAlloc(a.size() + 16,
sizeof(
double), &(gpu_mat_));
203 cublasAlloc(colind.size() + 16, 4, &(gpu_ci_));
204 cublasAlloc(rowptr.size() - 1 + 16,
sizeof(uint2), &(gpu_redundant_rp_));
205 unsigned int* ci = CNCallocate_and_copy<long, unsigned int>(colind.data(), colind.size());
206 uint2* cpu_redundant_rp = CNCallocate<uint2>(rowptr.size() - 1);
208 for (
unsigned int i = 0; i < rowptr.size() - 1; i++) {
209 cpu_redundant_rp[i].x = rowptr.data()[i];
210 cpu_redundant_rp[i].y = rowptr.data()[i + 1];
213 cublasSetVector(rowptr.size() - 1,
sizeof(uint2), cpu_redundant_rp, 1, gpu_redundant_rp_, 1);
214 cublasSetVector(colind.size(),
sizeof(
unsigned int), ci, 1, gpu_ci_, 1);
215 CNCdeallocate<uint2>(cpu_redundant_rp);
216 CNCdeallocate<unsigned int>(ci);
219 inline void gpu_upload()
224 cublasSetVector(a.size(),
sizeof(
double), a.data(), 1, gpu_mat_, 1);
242 inline void gpu_deallocate()
245 cublasFree(gpu_mat_);
247 cublasFree(gpu_redundant_rp_);
249 inline void gpu_mult(
const void* x,
void* y,
unsigned int vec_size)
252 cnc_cuda_mat1x1vecmult((
double*)gpu_mat_, a.size(),
253 (uint2*)gpu_redundant_rp_, rowptr.size(),
254 (
unsigned int*)gpu_ci_, colind.size(),
255 (
double*)x, (
double*)y, vec_size);
257 inline void print(
void)
259 printf(
"\n[CNC_MatrixCRS] print\n");
260 for (
long i = 0; i <
m(); i++) {
261 for (
long jj = rowptr[i]; jj < rowptr[i + 1]; jj++) {
264 printf(
"\t%d %d %f\n", i, j, a[jj]);
265 if (symmetric_storage && j != i) {
266 printf(
"\t%d %d %f\n", j, i, a[jj]);
272 for (
long i = 0; i < diag.size(); i++) {
273 if (diag[i] != 0.0) {
274 printf(
"\t%d %d %f\n", i, i, diag[i]);
286 void* gpu_redundant_rp_;