Alien  1.3.0
Developer documentation
Loading...
Searching...
No Matches
SYCLBEllPackMatrix.cc
1// -*- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature -*-
2//-----------------------------------------------------------------------------
3// Copyright 2000-2026 CEA (www.cea.fr) IFPEN (www.ifpenergiesnouvelles.com)
4// See the top-level COPYRIGHT file for details.
5// SPDX-License-Identifier: Apache-2.0
6//-----------------------------------------------------------------------------
7/*
8 * Copyright 2020 IFPEN-CEA
9 *
10 * Licensed under the Apache License, Version 2.0 (the "License");
11 * you may not use this file except in compliance with the License.
12 * You may obtain a copy of the License at
13 *
14 * http://www.apache.org/licenses/LICENSE-2.0
15 *
16 * Unless required by applicable law or agreed to in writing, software
17 * distributed under the License is distributed on an "AS IS" BASIS,
18 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 * See the License for the specific language governing permissions and
20 * limitations under the License.
21 *
22 * SPDX-License-Identifier: Apache-2.0
23 */
24
25#include <cassert>
26
27#include <alien/kernels/sycl/data/SYCLEnvInternal.h>
28#include <alien/kernels/sycl/data/SYCLEnv.h>
29
30#include <alien/kernels/sycl/data/SYCLVectorInternal.h>
31#include <alien/kernels/sycl/data/SYCLVector.h>
32
33#include <alien/kernels/sycl/data/SYCLBEllPackInternal.h>
34#include <alien/kernels/sycl/data/SYCLBEllPackMatrix.h>
35
36/*---------------------------------------------------------------------------*/
37
38namespace Alien
39{
40
41SYCLEnv::SYCLEnv()
42{
43 SYCLInternal::EnvInternal::printPlatformInfo() ;
44 m_internal.reset(new SYCLInternal::EnvInternal{});
45}
46
47SYCLEnv::SYCLEnv(int device_id)
48{
49 SYCLInternal::EnvInternal::printPlatformInfo() ;
50 auto gpu_devices = sycl::device::get_devices(sycl::info::device_type::gpu);
51 if(device_id < gpu_devices.size())
52 {
53 sycl::device selected_device = gpu_devices[device_id];
54 m_internal.reset(new SYCLInternal::EnvInternal{selected_device,device_id});
55 }
56 else
57 m_internal.reset(new SYCLInternal::EnvInternal{});
58}
59
60
61SYCLEnv::~SYCLEnv()
62{
63}
64
65SYCLEnv* SYCLEnv::m_instance = nullptr;
66
67SYCLEnv* SYCLEnv::instance()
68{
69 if (!m_instance)
70 m_instance = new SYCLEnv{};
71 return m_instance;
72}
73
74SYCLEnv* SYCLEnv::instance(int device_id)
75{
76 if (!m_instance)
77 m_instance = new SYCLEnv{device_id};
78 if (m_instance->deviceId()!= device_id)
79 {
80 delete m_instance ;
81 m_instance = new SYCLEnv{device_id};
82 }
83 return m_instance;
84}
85
86int SYCLEnv::deviceId()
87{
88 return m_internal->deviceId() ;
89}
90
91std::size_t SYCLEnv::maxNumGroups()
92{
93 return m_internal->maxNumGroups();
94}
95
96std::size_t SYCLEnv::maxWorkGroupSize()
97{
98 return m_internal->maxWorkGroupSize();
99}
100
101std::size_t SYCLEnv::maxNumThreads()
102{
103 return m_internal->maxNumThreads();
104}
105
106template <int EllpackSize, typename IndexT>
107void BEllPackStructInfo<EllpackSize, IndexT>::computeBlockRowOffset(std::vector<int>& block_row_offset,
108 std::size_t nrows,
109 int const* kcol)
110{
111 std::size_t block_nrows = BEllPackStructInfo<EllpackSize, IndexT>::nbBlocks(nrows);
112 block_row_offset.resize(block_nrows + 1);
113 int offset = 0;
114 for (std::size_t ib = 0; ib < block_nrows; ++ib) {
115 block_row_offset[ib] = offset;
116 int max_row_size = 0;
117 for (int i = 0; i < std::min(EllpackSize, int(nrows - ib * EllpackSize)); ++i) {
118 auto irow = ib * EllpackSize + i;
119 int row_size = kcol[irow + 1] - kcol[irow];
120 max_row_size = std::max(max_row_size, row_size);
121 }
122 offset += max_row_size;
123 }
124 block_row_offset[block_nrows] = offset;
125}
126
127template <int EllpackSize, typename IndexT>
129StructInfoInternal(std::size_t nrows,
130 std::size_t nnz,
131 std::size_t block_nrows,
132 std::size_t block_nnz,
133 int const* h_kcol,
134 int const* h_cols,
135 int const* h_block_row_offset,
136 int const* h_local_row_size)
137: m_nrows(nrows)
138, m_nnz(nnz)
139, m_block_nrows(block_nrows)
140, m_block_nnz(block_nnz)
141, m_h_kcol(h_kcol, h_kcol + nrows + 1)
142, m_h_cols(h_cols, h_cols + nnz)
143, m_h_block_cols(block_nnz * ellpack_size)
144, m_block_row_offset(h_block_row_offset, sycl::range<1>(m_block_nrows + 1))
145, m_block_cols(m_h_block_cols.data(), sycl::range<1>(m_block_nnz * ellpack_size))
146, m_kcol(m_h_kcol.data(), sycl::range<1>(m_nrows + 1))
147{
148 auto env = SYCLEnv::instance();
149
150 auto& queue = env->internal()->queue();
151
152 auto num_groups = env->internal()->maxNumGroups();
153
154 auto max_work_group_size = env->internal()->maxWorkGroupSize();
155
156 // building the best number of global thread
157 auto total_threads = num_groups * ellpack_size;
158
159 // clang-format off
160 if(h_local_row_size==nullptr)
161 {
162 IndexBufferType cols_buffer(m_h_cols.data(), sycl::range<1>(m_nnz));
163
164 queue.submit([&](sycl::handler& cgh)
165 {
166 auto access_kcol_buffer = m_kcol.template get_access<sycl::access::mode::read>(cgh);
167 auto access_block_row_offset = m_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
168
169 auto access_cols_buffer = cols_buffer.template get_access<sycl::access::mode::read>(cgh);
170 auto access_block_cols = m_block_cols.template get_access<sycl::access::mode::discard_write>(cgh);
171
172 cgh.parallel_for<class struct_info_sycl>(sycl::range<1>{total_threads},
173 [=] (sycl::item<1> itemId)
174 {
175 auto id = itemId.get_id(0);
176
177 for (auto i = id; i < nrows; i += itemId.get_range()[0])
178 {
179 auto block_id = i/ellpack_size ;
180 auto local_id = i%ellpack_size ;
181
182 int begin = access_kcol_buffer[i] ;
183 int end = access_kcol_buffer[i+1] ;
184 int row_size = end - begin ;
185
186 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
187 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
188
189 //access_rx[i] = row_size;
190 /*
191 access_brs[i] = block_row_size ;
192 access_begin[i] = begin ;
193 access_end[i] = end ;
194 access_first_cols[i] = access_cols_buffer[begin] ;
195 */
196 for(int k=begin;k<end;++k)
197 {
198 access_block_cols[block_row_offset+(k-begin)*ellpack_size+local_id] = access_cols_buffer[k] ;
199 }
200 for(int k=row_size;k<block_row_size;++k)
201 {
202 access_block_cols[block_row_offset+k*ellpack_size+local_id] = -1 ;
203 }
204
205 }
206 });
207 });
208 }
209 else
210 {
211 IndexBufferType cols_buffer(m_h_cols.data(), sycl::range<1>(m_nnz));
212 IndexBufferType lrowsize_buffer(h_local_row_size, sycl::range<1>(nrows));
213 // clang-format off
214 queue.submit([&](sycl::handler& cgh)
215 {
216 auto access_kcol_buffer = m_kcol.template get_access<sycl::access::mode::read>(cgh);
217 auto access_block_row_offset = m_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
218 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
219
220 auto access_cols_buffer = cols_buffer.template get_access<sycl::access::mode::read>(cgh);
221 auto access_block_cols = m_block_cols.template get_access<sycl::access::mode::discard_write>(cgh);
222
223 cgh.parallel_for<class struct_info_sycl2>(sycl::range<1>{total_threads},
224 [=] (sycl::item<1> itemId)
225 {
226 auto id = itemId.get_id(0);
227
228 for (auto i = id; i < nrows; i += itemId.get_range()[0])
229 {
230 auto block_id = i/ellpack_size ;
231 auto local_id = i%ellpack_size ;
232
233 int begin = access_kcol_buffer[i] ;
234 int lrow_size = access_lrowsize_buffer[i] ;
235
236 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
237 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
238
239 for(int k=begin;k<begin+lrow_size;++k)
240 {
241 access_block_cols[block_row_offset+(k-begin)*ellpack_size+local_id] = access_cols_buffer[k] ;
242 }
243 for(int k=lrow_size;k<block_row_size;++k)
244 {
245 access_block_cols[block_row_offset+k*ellpack_size+local_id] = -1 ;
246 }
247 }
248 });
249 });
250 }
251 // clang-format on
252}
253
254template <int EllpackSize, typename IndexT>
255void SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::getUpperDiagOffset() const
256{
257 if (m_h_dcol.size() == 0) {
258 m_h_dcol.resize(m_nrows);
259 for (std::size_t irow = 0; irow < m_nrows; ++irow) {
260 int index = m_h_kcol[irow];
261 for (int k = m_h_kcol[irow]; k < m_h_kcol[irow + 1]; ++k) {
262 if ((std::size_t)m_h_cols[k] < irow)
263 ++index;
264 else
265 break;
266 }
267 m_h_dcol[irow] = index;
268 }
269 }
270}
271
272template <int EllpackSize, typename IndexT>
273void SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::computeLowerUpperMask() const
274{
275
276 if (not m_lower_upper_mask_ready) {
277 m_lower_mask.reset(new MaskBufferType(sycl::range<1>(m_block_nnz * ellpack_size)));
278 m_upper_mask.reset(new MaskBufferType(sycl::range<1>(m_block_nnz * ellpack_size)));
279
280 auto env = SYCLEnv::instance();
281
282 auto& queue = env->internal()->queue();
283
284 auto num_groups = env->internal()->maxNumGroups();
285
286 auto max_work_group_size = env->internal()->maxWorkGroupSize();
287
288 // building the best number of global thread
289 auto total_threads = num_groups * ellpack_size;
290
291 {
292 IndexBufferType dcol_buffer(m_h_dcol.data(), sycl::range<1>(m_nrows));
293
294 auto nrows = m_nrows;
295
296 // clang-format off
297 queue.submit([&](sycl::handler& cgh)
298 {
299 auto access_dcol_buffer = dcol_buffer.template get_access<sycl::access::mode::read>(cgh);
300
301 auto access_kcol_buffer = m_kcol.template get_access<sycl::access::mode::read>(cgh);
302 auto access_block_row_offset = m_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
303
304 auto access_lower_mask = sycl::accessor { *m_lower_mask, cgh, sycl::write_only, sycl::property::no_init{}};
305 auto access_upper_mask = sycl::accessor { *m_upper_mask, cgh, sycl::write_only, sycl::property::no_init{}};
306
307
308 cgh.parallel_for<class lower_upper_mask>(sycl::range<1>{total_threads},
309 [=] (sycl::item<1> itemId)
310 {
311 auto id = itemId.get_id(0);
312
313 for (auto i = id; i < nrows; i += itemId.get_range()[0])
314 {
315 auto block_id = i/ellpack_size ;
316 auto local_id = i%ellpack_size ;
317
318 int begin = access_kcol_buffer[i] ;
319 int end = access_kcol_buffer[i+1] ;
320 int diag = access_dcol_buffer[i] ;
321 int row_size = end - begin ;
322
323 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
324 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
325
326 access_lower_mask[block_row_offset+(diag-begin)*ellpack_size+local_id] = 0 ;
327 access_upper_mask[block_row_offset+(diag-begin)*ellpack_size+local_id] = 0 ;
328
329 for(int k=begin;k<diag;++k)
330 {
331 access_lower_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 1 ;
332 access_upper_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 0 ;
333 }
334 for(int k=diag+1;k<end;++k)
335 {
336 access_lower_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 0 ;
337 access_upper_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 1 ;
338 }
339 for(int k=row_size;k<block_row_size;++k)
340 {
341 access_lower_mask[block_row_offset+k*ellpack_size+local_id] = 0 ;
342 access_upper_mask[block_row_offset+k*ellpack_size+local_id] = 0 ;
343 }
344
345 }
346 });
347 });
348 // clang-format on
349 }
350 m_lower_upper_mask_ready = true;
351 }
352}
353
354template <int EllpackSize, typename IndexT>
355typename SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::MaskBufferType&
356SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::getLowerMask() const
357{
358 computeLowerUpperMask();
359 return *m_lower_mask;
360}
361
362template <int EllpackSize, typename IndexT>
363typename SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::MaskBufferType&
364SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::getUpperMask() const
365{
366 computeLowerUpperMask();
367 return *m_upper_mask;
368}
369
370template <int EllpackSize, typename IndexT>
371BEllPackStructInfo<EllpackSize, IndexT>::BEllPackStructInfo(std::size_t nrows,
372 int const* kcol,
373 int const* cols,
374 int const* h_block_row_offset,
375 int const* h_local_row_size)
376
377: BaseBEllPackStructInfo(nrows, kcol[nrows])
378, m_block_nrows(BEllPackStructInfo<EllpackSize, IndexT>::nbBlocks(nrows))
379, m_block_nnz(h_block_row_offset[m_block_nrows])
380, m_h_local_row_size(h_local_row_size)
381{
382 // clang-format off
383 alien_debug([&] {
384 cout() << "COMPUTE BELLPACK PROFILE";
385 cout() << " NROWS : "<<this->m_nrows;
386 cout() << " NNZ : "<<this->m_nnz;
387 cout() << " BLOCK NROWS : "<<m_block_nrows;
388 cout() <<" BLOCK NNZ : "<<m_block_nnz;
389 });
390 // clang-format on
391
392 //m_h_block_cols.assign(m_block_nnz*ellpack_size,-1) ;
393 m_internal = new InternalType{ this->m_nrows,
394 this->m_nnz,
395 m_block_nrows,
396 m_block_nnz,
397 kcol,
398 cols,
399 h_block_row_offset,
400 h_local_row_size };
401}
402
403template <int EllpackSize, typename IndexT>
404BEllPackStructInfo<EllpackSize, IndexT>::~BEllPackStructInfo()
405{
406 delete m_internal ;
407}
408
409template <int EllpackSize, typename IndexT>
410typename BEllPackStructInfo<EllpackSize, IndexT>::IndexType const*
411BEllPackStructInfo<EllpackSize, IndexT>::kcol() const
412{
413 return m_internal->kcol();
414}
415
416template <int EllpackSize, typename IndexT>
417typename BEllPackStructInfo<EllpackSize, IndexT>::IndexType const*
418BEllPackStructInfo<EllpackSize, IndexT>::cols() const
419{
420 return m_internal->cols();
421}
422
423template <int EllpackSize, typename IndexT>
424typename BEllPackStructInfo<EllpackSize, IndexT>::IndexType const*
425BEllPackStructInfo<EllpackSize, IndexT>::dcol() const
426{
427 return m_internal->dcol();
428}
429
430namespace SYCLInternal
431{
432 template <typename ValueT, int EllpackSize>
433 MatrixInternal<ValueT, EllpackSize>::MatrixInternal(ProfileType const* profile, int N)
434 : m_profile(profile)
435 , m_N(N)
436 , m_NxN(N*N)
437 , m_h_values(profile->getBlockNnz() * ellpack_size * N*N)
438 , m_values(m_h_values.data(), sycl::range<1>(profile->getBlockNnz() * ellpack_size * N*N))
439 {
440 //m_values.set_final_data(nullptr);
441 //alien_debug([&] { cout() << "SYCL InternalMATRIX" << profile->getBlockNnz() * ellpack_size<< "N="<<m_N; });
442 }
443
444 template <typename ValueT, int EllpackSize>
445 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValuesFromHost()
446 {
447 //alien_debug([&] { cout() << "SYCLMatrix setMatrixValuesFromHost "; });
448 auto env = SYCLEnv::instance();
449 auto& queue = env->internal()->queue();
450 auto num_groups = env->internal()->maxNumGroups();
451 auto max_work_group_size = env->internal()->maxWorkGroupSize();
452 auto total_threads = num_groups * ellpack_size;
453
454 int N = this->m_N;
455 int NxN = this->m_NxN;
456 int pack_size = ellpack_size ;
457 auto nrows = m_profile->getNRows();
458 auto nnz = m_profile->getNnz();
459 auto block_nnz = m_profile->getBlockNnz();
460
461 auto internal_profile = m_profile->internal();
462 auto& kcol = internal_profile->getKCol();
463 auto& block_row_offset = internal_profile->getBlockRowOffset();
464 auto& block_cols = internal_profile->getBlockCols();
465 auto local_row_size = m_profile->localRowSize();
466 if(N==1)
467 {
468 if (local_row_size == nullptr)
469 {
470 assert(m_h_csr_values.size()>=nnz) ;
471 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
472 // COMPUTE COLS
473 // clang-format off
474 queue.submit([&](sycl::handler& cgh)
475 {
476 auto access_kcol_buffer = kcol.template get_access<sycl::access::mode::read>(cgh);
477 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
478 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
479 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
480
481 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
482 cgh.parallel_for<class set_matrix_values>(
483 sycl::range<1>{total_threads},
484 [=] (sycl::item<1> item_id)
485 {
486 auto id = item_id.get_id(0);
487 //auto local_id = item_id.get_local_id(0);
488 //auto block_id = item_id.get_group(0) ;
489 //auto global_id = item_id.get_global_id(0);
490
491 for (auto i = id; i < nrows; i += item_id.get_range()[0])
492 {
493 auto block_id = i/ellpack_size ;
494 auto local_id = i%ellpack_size ;
495
496 auto begin = access_kcol_buffer[i] ;
497 auto end = access_kcol_buffer[i+1] ;
498 auto row_size = end - begin ;
499
500 int block_row_offset = access_block_row_offset[block_id];
501 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
502
503 for(int k=0;k<row_size;++k)
504 {
505 access_block_values[(block_row_offset+k)*ellpack_size+local_id] = access_values_buffer[begin+k] ;
506 }
507 for(int k=row_size;k<block_row_size;++k)
508 {
509 access_block_values[(block_row_offset+k)*ellpack_size+local_id] = 0 ;
510 }
511 }
512 });
513 }) ;
514 // clang-format on
515 m_values_is_update = true;
516 }
517 else
518 {
519 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
520 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
521 // COMPUTE COLS
522 // clang-format off
523 queue.submit([&](sycl::handler& cgh)
524 {
525 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
526 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
527
528 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
529 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
530 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
531
532 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
533 cgh.parallel_for<class set_matrix_values2>(
534 sycl::range<1>{total_threads},
535 [=] (sycl::item<1> item_id)
536 {
537 auto id = item_id.get_id(0);
538 //auto local_id = item_id.get_local_id(0);
539 //auto block_id = item_id.get_group(0) ;
540 //auto global_id = item_id.get_global_id(0);
541
542 for (auto i = id; i < nrows; i += item_id.get_range()[0])
543 {
544 auto block_id = i/pack_size ;
545 auto local_id = i%pack_size ;
546
547 auto begin = access_kcol_buffer[i] ;
548 auto lrow_size = access_lrowsize_buffer[i] ;
549 auto end = begin + lrow_size ;
550
551 int block_row_offset = access_block_row_offset[block_id];
552 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
553
554 for(int k=0;k<lrow_size;++k)
555 {
556 access_block_values[(block_row_offset+k)*pack_size+local_id] = access_values_buffer[begin+k] ;
557 }
558 for(int k=lrow_size;k<block_row_size;++k)
559 {
560 access_block_values[(block_row_offset+k)*pack_size+local_id] = 0. ;
561 }
562 }
563 });
564 }) ;
565
566 auto interface_nrows = m_ext_profile->getNRows();
567 auto ext_nnz = m_ext_profile->getNnz();
568 auto ext_block_nnz = m_ext_profile->getBlockNnz();
569
570 auto ext_internal_profile = m_ext_profile->internal();
571 auto& ext_kcol = ext_internal_profile->getKCol();
572 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
573 //m_h_ext_values.resize(ext_block_nnz) ;
574 m_ext_values.reset(new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
575 {
576 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz));
577 queue.submit([&](sycl::handler& cgh)
578 {
579 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
580 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
581 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
582 //auto access_block_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
583 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
584
585 cgh.parallel_for<class set_matrix_values3>(
586 sycl::range<1>{total_threads},
587 [=] (sycl::item<1> item_id)
588 {
589 auto id = item_id.get_id(0);
590
591 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
592 {
593 auto block_id = i/ellpack_size ;
594 auto local_id = i%ellpack_size ;
595 auto begin = access_kcol_buffer[i] ;
596 auto end = access_kcol_buffer[i+1] ;
597 auto row_size = end - begin ;
598
599 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
600 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
601
602 for(int k=begin*NxN;k<end*NxN;++k)
603 {
604 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
605 }
606 for(int k=row_size*NxN;k<block_row_size*NxN;++k)
607 {
608 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
609 }
610 }
611 });
612 }).wait() ;
613 // clang-format on
614 }
615 m_values_is_update = true;
616 }
617 }
618 else
619 {
620 if (local_row_size == nullptr)
621 {
622 assert(m_h_csr_values.size()>=nnz*NxN) ;
623 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz*NxN));
624 // COMPUTE COLS
625 // clang-format off
626 queue.submit([&](sycl::handler& cgh)
627 {
628 auto access_kcol_buffer = kcol.template get_access<sycl::access::mode::read>(cgh);
629 auto access_cols_buffer = block_cols.template get_access<sycl::access::mode::read>(cgh);
630 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
631 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
632 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
633
634 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
635 cgh.parallel_for<class set_block_matrix_values>(
636 sycl::range<1>{total_threads},
637 [=] (sycl::item<1> item_id)
638 {
639 auto id = item_id.get_id(0);
640 //auto local_id = item_id.get_local_id(0);
641 //auto block_id = item_id.get_group(0) ;
642 //auto global_id = item_id.get_global_id(0);
643
644 for (std::size_t i = id; i < nrows; i += item_id.get_range()[0])
645 {
646 auto block_id = i/pack_size ;
647 auto local_id = i%pack_size ;
648
649 int begin = access_kcol_buffer[i] ;
650 int end = access_kcol_buffer[i+1] ;
651 int row_size = end - begin ;
652
653 int block_row_offset = access_block_row_offset[block_id] ;
654 int block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
655
656 for(int k=0;k<row_size;++k)
657 {
658 for(int i=0;i<N;++i)
659 for(int j=0;j<N;++j)
660 {
661 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = access_values_buffer[(begin+k)*NxN + i*N+j] ;
662 }
663 /*
664 printf("MATRIX[%zu,%d] (%d,%d) = (%f,%f,%f,%f)\n",i,k,block_row_offset,begin, access_values_buffer[(begin+k)*NxN], access_values_buffer[(begin+k)*NxN+1], access_values_buffer[(begin+k)*NxN+2], access_values_buffer[(begin+k)*NxN +3]);*/
665
666 }
667 for(int k=row_size;k<block_row_size;++k)
668 {
669 for(int i=0;i<N;++i)
670 for(int j=0;j<N;++j)
671 {
672 access_block_values[((block_row_offset+k)*NxN + i*N+j)*pack_size+local_id] = 0. ;
673 }
674 }
675 }
676 });
677 }).wait() ;
678 // clang-format on
679 m_values_is_update = true;
680 /*
681 {
682 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(m_values);
683 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
684 for(int irow=0;irow<nrows;++irow)
685 {
686 auto ib=irow/ellpack_size ;
687 auto il=irow%ellpack_size ;
688 std::cout<<"MAT["<<irow<<","<<ib<<","<<il<<"]:";
689 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
690 {
691 for(int i=0;i<N;++i)
692 for(int j=0;j<N;++j)
693 std::cout<<h_acc[(k*NxN+i*N+j)*ellpack_size+il]<<",";
694 std::cout<<std::endl;
695 }
696 }
697 }*/
698 }
699 else
700 {
701 /*
702 alien_info([&] {
703 auto h_kcol = internal_profile->kcol() ;
704 auto cols = internal_profile->cols() ;
705 for(int irow=0;irow<nrows;++irow)
706 {
707 cout()<<"ROW["<<irow<<"]:";
708 for(int k=h_kcol[irow];k<h_kcol[irow]+local_row_size[irow];++k)
709 {
710 cout()<<"\t COL "<<cols[k];
711 for(int i=0;i<N;++i)
712 for(int j=0;j<N;++j)
713 cout()<<"\t\t MAT["<<i<<" "<<j<<"] "<<m_h_csr_values[k*NxN+i*N+j];
714 }
715 }
716 });*/
717 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz*NxN));
718 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
719 // COMPUTE COLS
720 // clang-format off
721 queue.submit([&](sycl::handler& cgh)
722 {
723 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
724 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
725
726 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
727 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
728 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
729
730 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
731 cgh.parallel_for<class set_block_matrix_values2>(
732 sycl::range<1>{total_threads},
733 [=] (sycl::item<1> item_id)
734 {
735 auto id = item_id.get_id(0);
736 //auto local_id = item_id.get_local_id(0);
737 //auto block_id = item_id.get_group(0) ;
738 //auto global_id = item_id.get_global_id(0);
739
740 for (auto i = id; i < nrows; i += item_id.get_range()[0])
741 {
742 auto block_id = i/pack_size ;
743 auto local_id = i%pack_size ;
744
745 auto begin = access_kcol_buffer[i] ;
746 auto lrow_size = access_lrowsize_buffer[i] ;
747 auto end = begin + lrow_size ;
748
749 auto block_row_offset = access_block_row_offset[block_id];
750 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
751
752 for(int k=0;k<lrow_size;++k)
753 {
754 for(int i=0;i<N;++i)
755 for(int j=0;j<N;++j)
756 {
757 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = access_values_buffer[(begin+k)*NxN+i*N+j] ;
758 }
759 }
760 for(int k=lrow_size;k<block_row_size;++k)
761 {
762 for(int i=0;i<N;++i)
763 for(int j=0;j<N;++j)
764 {
765 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = 0. ;
766 }
767 }
768 }
769 });
770 });
771
772
773 auto interface_nrows = m_ext_profile->getNRows();
774 auto ext_nnz = m_ext_profile->getNnz();
775 auto ext_block_nnz = m_ext_profile->getBlockNnz();
776
777 auto ext_internal_profile = m_ext_profile->internal();
778 auto& ext_kcol = ext_internal_profile->getKCol();
779 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
780 //m_h_ext_values.resize(ext_block_nnz) ;
781 m_ext_values.reset(new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
782 {
783 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz));
784 queue.submit([&](sycl::handler& cgh)
785 {
786 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
787 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
788 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
789 //auto access_block_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
790 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
791
792 cgh.parallel_for<class set_matrix_values3>(
793 sycl::range<1>{total_threads},
794 [=] (sycl::item<1> item_id)
795 {
796 auto id = item_id.get_id(0);
797
798 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
799 {
800 auto block_id = i/pack_size ;
801 auto local_id = i%pack_size ;
802 auto begin = access_kcol_buffer[i] ;
803 auto end = access_kcol_buffer[i+1] ;
804 auto row_size = end - begin ;
805
806 int block_row_offset = access_block_row_offset[block_id] ;
807 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
808
809 for(int k=0;k<row_size;++k)
810 {
811 for(int i=0;i<N;++i)
812 for(int j=0;j<N;++j)
813 {
814 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = access_values_buffer[(begin+k)*NxN+i*N+j] ;
815 }
816 }
817 for(int k=row_size;k<block_row_size;++k)
818 {
819 for(int i=0;i<N;++i)
820 for(int j=0;j<N;++j)
821 {
822 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = 0 ;
823 }
824 }
825 }
826 });
827 }).wait() ;
828 // clang-format on
829 }
830 /*
831 {
832 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(m_values);
833 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
834 sycl::host_accessor<int, 1, sycl::access::mode::read> cols_acc(block_cols);
835 for(int irow=0;irow<nrows;++irow)
836 {
837 auto ib=irow/pack_size ;
838 auto il=irow%pack_size ;
839 alien_info([&] {
840 cout()<<"MAT["<<irow<<","<<ib<<","<<il<<"]:";
841 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
842 {
843 cout()<<"\t K"<<k<<" "<<cols_acc[k*pack_size+il];
844 for(int i=0;i<N;++i)
845 for(int j=0;j<N;++j)
846 cout()<<"\t\t"<<h_acc[(k*NxN+i*N+j)*pack_size+il]<<",";
847 //std::cout<<std::endl;
848 }
849 });
850 }
851 }*/
852 m_values_is_update = true;
853 }
854 }
855 return true;
856 }
857
858 template <typename ValueT, int EllpackSize>
859 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValues(ValueBufferType& values_buffer)
860 {
861 //alien_debug([&] { cout() << "SYCLMatrix setMatrixValues "; });
862 auto env = SYCLEnv::instance();
863 auto& queue = env->internal()->queue();
864 auto num_groups = env->internal()->maxNumGroups();
865 auto max_work_group_size = env->internal()->maxWorkGroupSize();
866 auto total_threads = num_groups * ellpack_size;
867
868 int NxN = this->m_NxN;
869
870 auto nrows = m_profile->getNRows();
871 auto nnz = m_profile->getNnz();
872 auto block_nnz = m_profile->getBlockNnz();
873
874 auto internal_profile = m_profile->internal();
875 auto& kcol = internal_profile->getKCol();
876 auto& block_row_offset = internal_profile->getBlockRowOffset();
877
878 auto local_row_size = m_profile->localRowSize();
879 if (local_row_size == nullptr)
880 {
881 //ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
882 // COMPUTE COLS
883 // clang-format off
884 queue.submit([&](sycl::handler& cgh)
885 {
886 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
887 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
888 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
889 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
890
891 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
892 cgh.parallel_for<class set_matrix_values4>(sycl::range<1>{total_threads},
893 [=] (sycl::item<1> item_id)
894 {
895 auto id = item_id.get_id(0);
896 //auto local_id = item_id.get_local_id(0);
897 //auto block_id = item_id.get_group(0) ;
898 //auto global_id = item_id.get_global_id(0);
899
900 for (auto i = id; i < nrows; i += item_id.get_range()[0])
901 {
902 auto block_id = i/ellpack_size ;
903 auto local_id = i%ellpack_size ;
904
905 auto begin = access_kcol_buffer[i] ;
906 auto end = access_kcol_buffer[i+1] ;
907 auto row_size = end - begin ;
908
909 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
910 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
911
912 for(int k=begin*NxN;k<end*NxN;++k)
913 {
914 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
915 }
916 for(int k=row_size*NxN;k<block_row_size*NxN;++k)
917 {
918 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
919 }
920 }
921 });
922 }) ;
923 // clang-format on
924 m_values_is_update = true;
925 }
926 else
927 {
928 //ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
929 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
930
931 // COMPUTE COLS
932 // clang-format off
933 queue.submit([&](sycl::handler& cgh)
934 {
935 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
936 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
937
938 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
939 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
940 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
941
942 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
943 cgh.parallel_for<class set_matrix_values5>(sycl::range<1>{total_threads},
944 [=] (sycl::item<1> item_id)
945 {
946 auto id = item_id.get_id(0);
947 //auto local_id = item_id.get_local_id(0);
948 //auto block_id = item_id.get_group(0) ;
949 //auto global_id = item_id.get_global_id(0);
950
951 for (auto i = id; i < nrows; i += item_id.get_range()[0])
952 {
953 auto block_id = i/ellpack_size ;
954 auto local_id = i%ellpack_size ;
955
956 auto begin = access_kcol_buffer[i] ;
957 auto lrow_size = access_lrowsize_buffer[i] ;
958 auto end = begin + lrow_size ;
959
960 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
961 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
962
963 for(int k=begin*NxN;k<end*NxN;++k)
964 {
965 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
966 }
967 for(int k=lrow_size*NxN;k<block_row_size*NxN;++k)
968 {
969 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
970 }
971 }
972 });
973 }) ;
974 auto kcol = m_profile->kcol() ;
975 auto local_row_size = m_profile->localRowSize() ;
976
977 auto interface_nrows = m_ext_profile->getNRows();
978 auto ext_nnz = m_ext_profile->getNnz();
979 auto ext_block_nnz = m_ext_profile->getBlockNnz();
980
981 auto ext_internal_profile = m_ext_profile->internal();
982 auto& ext_kcol = ext_internal_profile->getKCol();
983 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
984 //m_h_ext_values.resize(ext_block_nnz) ;
985 m_ext_values.reset(new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
986
987 // EXTRACT EXTERNAL PROFILE
988 {
989 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz * NxN));
990
991 std::vector<Integer> h_local_row_offset(interface_nrows+1) ;
992 {
993 Integer offset = 0 ;
994 for(std::size_t i=0;i<interface_nrows;++i)
995 {
996 h_local_row_offset[i] = offset ;
997 offset += local_row_size[i] ;
998 }
999 h_local_row_offset[interface_nrows] = offset ;
1000 }
1001 IndexBufferType local_row_offset(h_local_row_offset.data(),sycl::range<1>(interface_nrows+1)) ;
1002
1003 queue.submit([&](sycl::handler& cgh)
1004 {
1005 auto access_kcol = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1006 auto access_interface_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
1007 auto access_local_row_size = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1008 auto access_local_row_offset = local_row_offset.template get_access<sycl::access::mode::read>(cgh);
1009 auto access_csr_values = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1010 auto access_ext_csr_values = ext_csr_values_buffer.template get_access<sycl::access::mode::read_write>(cgh);
1011 cgh.parallel_for<class set_matrix_values6>(sycl::range<1>{total_threads},
1012 [=] (sycl::item<1> item_id)
1013 {
1014 auto id = item_id.get_id(0);
1015 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
1016 {
1017 Integer jcol = access_local_row_offset[i] ;
1018 for (int k = (access_kcol[i] + access_local_row_size[i])*NxN; k < access_kcol[i + 1] * NxN; ++k)
1019 access_ext_csr_values[jcol++] = access_csr_values[k];
1020 }
1021 });
1022 }).wait() ;
1023
1024 queue.submit([&](sycl::handler& cgh)
1025 {
1026 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
1027 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1028 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
1029 //auto access_block_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
1030 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
1031
1032 cgh.parallel_for<class set_matrix_values7>(sycl::range<1>{total_threads},
1033 [=] (sycl::item<1> item_id)
1034 {
1035 auto id = item_id.get_id(0);
1036
1037 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
1038 {
1039 auto block_id = i/ellpack_size ;
1040 auto local_id = i%ellpack_size ;
1041 auto begin = access_kcol_buffer[i] ;
1042 auto end = access_kcol_buffer[i+1] ;
1043 auto row_size = end - begin ;
1044
1045 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1046 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1047
1048 for(int k=begin*NxN;k<end*NxN;++k)
1049 {
1050 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1051 }
1052 for(int k=row_size*NxN;k<block_row_size*NxN;++k)
1053 {
1054 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1055 }
1056 }
1057 });
1058 }).wait() ;
1059 // clang-format on
1060 }
1061 m_values_is_update = true;
1062 }
1063 return true;
1064 }
1065
1066 template <typename ValueT, int EllpackSize>
1067 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValues(ValueBufferType& values_buffer,
1068 ValueBufferType& ext_values_buffer)
1069 {
1070 //alien_debug([&] { cout() << "SYCLMatrix setMatrixValues "; });
1071 auto env = SYCLEnv::instance();
1072 auto& queue = env->internal()->queue();
1073 auto num_groups = env->internal()->maxNumGroups();
1074 auto max_work_group_size = env->internal()->maxWorkGroupSize();
1075 auto total_threads = num_groups * ellpack_size;
1076
1077 int NxN = this->m_NxN;
1078
1079 auto nrows = m_profile->getNRows();
1080 auto nnz = m_profile->getNnz();
1081 auto block_nnz = m_profile->getBlockNnz();
1082
1083 auto internal_profile = m_profile->internal();
1084 auto& kcol = internal_profile->getKCol();
1085 auto& block_row_offset = internal_profile->getBlockRowOffset();
1086
1087 auto local_row_size = m_profile->localRowSize();
1088 if (local_row_size == nullptr)
1089 {
1090 //ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
1091 // COMPUTE COLS
1092 // clang-format off
1093 queue.submit([&](sycl::handler& cgh)
1094 {
1095 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1096 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1097 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1098 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1099
1100 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
1101 cgh.parallel_for<class set_matrix_values4>(sycl::range<1>{total_threads},
1102 [=] (sycl::item<1> item_id)
1103 {
1104 auto id = item_id.get_id(0);
1105 //auto local_id = item_id.get_local_id(0);
1106 //auto block_id = item_id.get_group(0) ;
1107 //auto global_id = item_id.get_global_id(0);
1108
1109 for (auto i = id; i < nrows; i += item_id.get_range()[0])
1110 {
1111 auto block_id = i/ellpack_size ;
1112 auto local_id = i%ellpack_size ;
1113
1114 auto begin = access_kcol_buffer[i] ;
1115 auto end = access_kcol_buffer[i+1] ;
1116 auto row_size = end - begin ;
1117
1118 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1119 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1120
1121 for(int k=begin*NxN;k<end*NxN;++k)
1122 {
1123 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1124 }
1125 for(int k=row_size*NxN;k<block_row_size*NxN;++k)
1126 {
1127 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1128 }
1129 }
1130 });
1131 }) ;
1132 // clang-format on
1133 m_values_is_update = true;
1134 }
1135 else
1136 {
1137 //ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
1138 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
1139
1140 // COMPUTE COLS
1141 // clang-format off
1142 queue.submit([&](sycl::handler& cgh)
1143 {
1144 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1145 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1146
1147 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1148 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1149 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1150
1151 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
1152 cgh.parallel_for<class set_matrix_values5>(sycl::range<1>{total_threads},
1153 [=] (sycl::item<1> item_id)
1154 {
1155 auto id = item_id.get_id(0);
1156 //auto local_id = item_id.get_local_id(0);
1157 //auto block_id = item_id.get_group(0) ;
1158 //auto global_id = item_id.get_global_id(0);
1159
1160 for (auto i = id; i < nrows; i += item_id.get_range()[0])
1161 {
1162 auto block_id = i/ellpack_size ;
1163 auto local_id = i%ellpack_size ;
1164
1165 auto begin = access_kcol_buffer[i] ;
1166 auto lrow_size = access_lrowsize_buffer[i] ;
1167 auto end = begin + lrow_size ;
1168
1169 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1170 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1171
1172 for(int k=begin*NxN;k<end*NxN;++k)
1173 {
1174 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1175 }
1176 for(int k=lrow_size*NxN;k<block_row_size*NxN;++k)
1177 {
1178 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1179 }
1180 }
1181 });
1182 }) ;
1183 auto kcol = m_profile->kcol() ;
1184 auto local_row_size = m_profile->localRowSize() ;
1185
1186 auto interface_nrows = m_ext_profile->getNRows();
1187 auto ext_nnz = m_ext_profile->getNnz();
1188 auto ext_block_nnz = m_ext_profile->getBlockNnz();
1189
1190 auto ext_internal_profile = m_ext_profile->internal();
1191 auto& ext_kcol = ext_internal_profile->getKCol();
1192 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
1193 //m_h_ext_values.resize(ext_block_nnz) ;
1194 m_ext_values.reset(new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
1195
1196 // EXTRACT EXTERNAL PROFILE
1197 {
1198 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz * NxN));
1199
1200 std::vector<Integer> h_local_row_offset(interface_nrows+1) ;
1201 {
1202 Integer offset = 0 ;
1203 for(std::size_t i=0;i<interface_nrows;++i)
1204 {
1205 h_local_row_offset[i] = offset ;
1206 offset += local_row_size[i] ;
1207 }
1208 h_local_row_offset[interface_nrows] = offset ;
1209 }
1210 IndexBufferType local_row_offset(h_local_row_offset.data(),sycl::range<1>(interface_nrows+1)) ;
1211
1212 queue.submit([&](sycl::handler& cgh)
1213 {
1214 auto access_kcol = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1215 auto access_interface_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
1216 auto access_local_row_size = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1217 auto access_local_row_offset = local_row_offset.template get_access<sycl::access::mode::read>(cgh);
1218 auto access_csr_values = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1219 auto access_ext_csr_values = ext_csr_values_buffer.template get_access<sycl::access::mode::read_write>(cgh);
1220 cgh.parallel_for<class set_matrix_values6>(sycl::range<1>{total_threads},
1221 [=] (sycl::item<1> item_id)
1222 {
1223 auto id = item_id.get_id(0);
1224 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
1225 {
1226 Integer jcol = access_local_row_offset[i] ;
1227 for (int k = (access_kcol[i] + access_local_row_size[i])*NxN; k < access_kcol[i + 1] * NxN; ++k)
1228 access_ext_csr_values[jcol++] = access_csr_values[k];
1229 }
1230 });
1231 }).wait() ;
1232
1233 queue.submit([&](sycl::handler& cgh)
1234 {
1235 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
1236 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1237 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
1238 //auto access_block_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
1239 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
1240
1241 cgh.parallel_for<class set_matrix_values7>(sycl::range<1>{total_threads},
1242 [=] (sycl::item<1> item_id)
1243 {
1244 auto id = item_id.get_id(0);
1245
1246 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
1247 {
1248 auto block_id = i/ellpack_size ;
1249 auto local_id = i%ellpack_size ;
1250 auto begin = access_kcol_buffer[i] ;
1251 auto end = access_kcol_buffer[i+1] ;
1252 auto row_size = end - begin ;
1253
1254 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1255 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1256
1257 for(int k=begin*NxN;k<end*NxN;++k)
1258 {
1259 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1260 }
1261 for(int k=row_size*NxN;k<block_row_size*NxN;++k)
1262 {
1263 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1264 }
1265 }
1266 });
1267 }).wait() ;
1268 // clang-format on
1269 }
1270 m_values_is_update = true;
1271 }
1272 return true;
1273 }
1274
1275 template <typename ValueT, int EllpackSize>
1276 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValues(ValueT const* values, bool only_host)
1277 {
1278 //alien_debug([&] { cout() << "SYCLMatrix setMatrixValues " << only_host; });
1279
1280 int NxN = this->m_NxN;
1281
1282 auto nnz = m_profile->getNnz();
1283 m_h_csr_values.resize(nnz*NxN);
1284 std::copy(values, values + nnz*NxN, m_h_csr_values.begin());
1285 if (m_ext_profile) {
1286 // clang-format off
1287 auto kcol = m_profile->kcol() ;
1288 auto local_row_size = m_profile->localRowSize() ;
1289
1290 auto interface_nrows = m_ext_profile->getNRows() ;
1291 auto ext_nnz = m_ext_profile->getNnz();
1292 // clang-format on
1293
1294 m_h_csr_ext_values.resize(ext_nnz*NxN);
1295
1296 // EXTRACT EXTERNAL PROFILE
1297 {
1298 int jcol = 0;
1299 for (std::size_t i = 0; i < interface_nrows; ++i) {
1300 auto id = m_h_interface_row_ids[i];
1301 for (int k = (kcol[id] + local_row_size[id])*NxN; k < kcol[id + 1]*NxN; ++k) {
1302 m_h_csr_ext_values[jcol++] = values[k];
1303 }
1304 }
1305 }
1306 }
1307 if (only_host) {
1308 m_values_is_update = false;
1309 }
1310 else {
1311 setMatrixValuesFromHost();
1312 m_values_is_update = true;
1313 }
1314 return true;
1315 }
1316
1317 template <typename ValueT, int EllpackSize>
1318 bool MatrixInternal<ValueT, EllpackSize>::copy(std::size_t nb_blocks,
1319 int block_size,
1320 ValueBufferType& values_buffer,
1321 int rhs_block_size)
1322 {
1323 auto env = SYCLEnv::instance();
1324 auto& queue = env->internal()->queue();
1325 auto num_groups = env->internal()->maxNumGroups();
1326 auto max_work_group_size = env->internal()->maxWorkGroupSize();
1327 auto total_threads = num_groups * ellpack_size;
1328
1329 int pack_size = ellpack_size;
1330 int N1 = block_size ;
1331 int N1xN1 = N1*N1 ;
1332 int N2 = rhs_block_size ;
1333 int N2xN2 = N2*N2 ;
1334 auto nrows = m_profile->getNRows();
1335 auto nnz = m_profile->getNnz();
1336 auto block_nnz = m_profile->getBlockNnz();
1337
1338 auto internal_profile = m_profile->internal();
1339 auto& kcol = internal_profile->getKCol();
1340 auto& block_row_offset = internal_profile->getBlockRowOffset();
1341
1342 auto local_row_size = m_profile->localRowSize();
1343 if (local_row_size == nullptr)
1344 {
1345 //ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
1346 // COMPUTE COLS
1347 // clang-format off
1348 queue.submit([&](sycl::handler& cgh)
1349 {
1350 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1351 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1352 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1353 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1354
1355 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
1356 cgh.parallel_for<class set_matrix_values4>(
1357 sycl::range<1>{total_threads},
1358 [=] (sycl::item<1> item_id)
1359 {
1360 auto id = item_id.get_id(0);
1361 //auto local_id = item_id.get_local_id(0);
1362 //auto block_id = item_id.get_group(0) ;
1363 //auto global_id = item_id.get_global_id(0);
1364
1365 for (auto i = id; i < nrows; i += item_id.get_range()[0])
1366 {
1367 auto block_id = i/pack_size ;
1368 auto local_id = i%pack_size ;
1369
1370 auto begin = access_block_row_offset[block_id] ;
1371 auto end = access_block_row_offset[block_id+1] ;
1372
1373 for(int k=begin;k<end;++k)
1374 {
1375 for(int i=0;i<N1;++i)
1376 for(int j=0;j<N1;++j)
1377 access_block_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = access_values_buffer[(k*N2xN2+i*N2+j)*pack_size+local_id] ;
1378 }
1379 }
1380 });
1381 }) ;
1382 // clang-format on
1383 }
1384 else
1385 {
1386 //ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
1387 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
1388
1389 // COMPUTE COLS
1390 // clang-format off
1391 queue.submit([&](sycl::handler& cgh)
1392 {
1393 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1394 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1395
1396 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1397 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1398 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1399
1400 //cgh.parallel_for<class vector_axpy>(sycl::nd_range<1>{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}},[=](sycl::nd_item<1> item_id)
1401 cgh.parallel_for<class set_matrix_values5>(
1402 sycl::range<1>{total_threads},
1403 [=] (sycl::item<1> item_id)
1404 {
1405 auto id = item_id.get_id(0);
1406 //auto local_id = item_id.get_local_id(0);
1407 //auto block_id = item_id.get_group(0) ;
1408 //auto global_id = item_id.get_global_id(0);
1409
1410 for (auto i = id; i < nrows; i += item_id.get_range()[0])
1411 {
1412 auto block_id = i/pack_size ;
1413 auto local_id = i%pack_size ;
1414
1415
1416 int block_row_offset = access_block_row_offset[block_id] ;
1417
1418 auto begin = access_block_row_offset[block_id] ;
1419 auto end = access_block_row_offset[block_id+1] ;
1420 auto lrow_size = access_lrowsize_buffer[i] ;
1421
1422 for(int k=begin;k<begin+lrow_size;++k)
1423 {
1424 for(int i=0;i<N1;++i)
1425 for(int j=0;j<N1;++j)
1426 access_block_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = access_values_buffer[(k*N2xN2+i*N2+j)*pack_size+local_id] ;
1427 }
1428 for(int k=begin+lrow_size;k<end;++k)
1429 {
1430 for(int i=0;i<N1;++i)
1431 for(int j=0;j<N1;++j)
1432 access_block_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = 0 ;
1433 }
1434 }
1435 });
1436 }) ;
1437 // clang-format on
1438 }
1439 m_values_is_update = true;
1440 return true;
1441 }
1442
1443 template <typename ValueT, int EllpackSize>
1444 bool MatrixInternal<ValueT, EllpackSize>::copy(std::size_t nb_blocks,
1445 int block_size,
1446 ValueBufferType& values_buffer,
1447 ValueBufferType& ext_values_buffer,
1448 int rhs_block_size)
1449 {
1450 copy(nb_blocks,block_size,values_buffer,rhs_block_size) ;
1451
1452 auto env = SYCLEnv::instance();
1453 auto& queue = env->internal()->queue();
1454 auto num_groups = env->internal()->maxNumGroups();
1455 auto max_work_group_size = env->internal()->maxWorkGroupSize();
1456 auto total_threads = num_groups * ellpack_size;
1457
1458 int pack_size = ellpack_size;
1459 int N1 = block_size ;
1460 int N1xN1 = N1*N1 ;
1461 int N2 = rhs_block_size ;
1462 int N2xN2 = N2*N2 ;
1463 auto nrows = m_profile->getNRows();
1464 auto nnz = m_profile->getNnz();
1465 auto block_nnz = m_profile->getBlockNnz();
1466
1467 auto internal_profile = m_profile->internal();
1468 auto& kcol = internal_profile->getKCol();
1469 auto& block_row_offset = internal_profile->getBlockRowOffset();
1470
1471 assert(m_ext_profile) ;
1472 auto interface_nrows = m_ext_profile->getNRows();
1473 auto ext_nnz = m_ext_profile->getNnz();
1474 auto ext_block_nnz = m_ext_profile->getBlockNnz();
1475 auto ext_internal_profile = m_ext_profile->internal();
1476 auto& ext_kcol = ext_internal_profile->getKCol();
1477 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
1478
1479 m_h_ext_values.resize(ext_block_nnz*N1xN1) ;
1480 m_ext_values.reset(new ValueBufferType(ext_block_nnz * N1xN1* ellpack_size)) ;
1481 {
1482 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz*N1xN1));
1483 queue.submit([&](sycl::handler& cgh)
1484 {
1485 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1486 auto access_ext_values_buffer = ext_values_buffer.template get_access<sycl::access::mode::read>(cgh);
1487 auto access_ext_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
1488
1489 cgh.parallel_for<class set_matrix_ext_values3>(
1490 sycl::range<1>{total_threads},
1491 [=] (sycl::item<1> item_id)
1492 {
1493 auto id = item_id.get_id(0);
1494 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
1495 {
1496 auto block_id = i/pack_size ;
1497 auto local_id = i%pack_size ;
1498
1499 auto begin = access_block_row_offset[block_id] ;
1500 auto end = access_block_row_offset[block_id+1] ;
1501
1502 for(int k=begin;k<begin;++k)
1503 {
1504 for(int i=0;i<N1;++i)
1505 for(int j=0;j<N1;++j)
1506 access_ext_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = access_ext_values_buffer[(k*N2xN2+i*N2+j)*pack_size+local_id] ;
1507 }
1508 }
1509 });
1510 }) ;
1511 // clang-format on
1512 }
1513 m_values_is_update = true;
1514 return true;
1515 }
1516
1517
1518 template <typename ValueT, int EllpackSize>
1519 bool MatrixInternal<ValueT, EllpackSize>::needUpdate()
1520 {
1521 return m_values_is_update != true;
1522 }
1523
1524 template <typename ValueT, int EllpackSize>
1525 void MatrixInternal<ValueT, EllpackSize>::notifyChanges()
1526 {
1527 m_values_is_update = false;
1528 }
1529
1530 template <typename ValueT, int EllpackSize>
1531 void MatrixInternal<ValueT, EllpackSize>::endUpdate()
1532 {
1533 if (not m_values_is_update) {
1534 setMatrixValuesFromHost();
1535 }
1536 }
1537
1538 /*
1539 template<int N>
1540 template <typename ValueT, int EllPackSize>
1541 void MatrixInternal<ValueT, EllPackSize>::multN(ValueBufferType& x, ValueBufferType& y, sycl::queue& queue) const
1542 {
1543 auto device = queue.get_device();
1544
1545 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1546 // getting the maximum work group size per thread
1547 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1548 // building the best number of global thread
1549
1550 // clang-format off
1551 std::size_t pack_size = ellpack_size;
1552 auto nrows = m_profile->getNRows();
1553 auto nnz = m_profile->getNnz();
1554
1555 auto internal_profile = m_profile->internal();
1556 auto& kcol = internal_profile->getKCol();
1557 auto& block_row_offset = internal_profile->getBlockRowOffset();
1558 auto& block_cols = internal_profile->getBlockCols();
1559
1560 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
1561 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
1562 auto total_threads = blocks_target * pack_size;
1563
1564 queue.submit(
1565 [&](sycl::handler& cgh)
1566 {
1567 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1568 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1569 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1570
1571 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1572 auto access_y = y.template get_access<sycl::access::mode::discard_write>(cgh);
1573
1574 auto tile = TileT<N>() ;
1575
1576 sycl::local_accessor<ValueType, 1> lds_x{pack_size*N, cgh};
1577 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
1578 cgh.parallel_for<class compute_mult>(r,
1579 [=](sycl::nd_item<1> item_id)
1580 {
1581 auto local_id = item_id.get_local_id(0);
1582 auto global_id = item_id.get_global_id(0);
1583
1584 for (auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
1585 {
1586 auto block_id = i/pack_size ;
1587
1588 int begin = access_block_row_offset[block_id] ;
1589 int end = access_block_row_offset[block_id+1] ;
1590 for(int ieq=0;ieq<N;++ieq)
1591 {
1592 ValueType value = 0. ;
1593 for(int k=begin;k<end;++k)
1594 {
1595 //auto k = block_row_offset+j*ellpack_size+local_id ;
1596 const int col = access_cols[k * pack_size + local_id];
1597 if(col>=0)
1598 for(int ju=0;ju<N;++ju)
1599 lds_x[N*local_id+ju] = access_x[col*N+ju];
1600 item_id.barrier(sycl::access::fence_space::local_space);
1601 if(col>=0)
1602 {
1603 for(int ju=0;ju<N;++ju)
1604 value += access_values[tile.ijk(k,ieq,ju) + local_id] * lds_x[local_id*N+ju] ;
1605 }
1606 item_id.barrier(sycl::access::fence_space::local_space);
1607 }
1608 access_y[i*N+ieq] = value ;
1609 }
1610 }
1611 });
1612 });
1613 }*/
1614
1615 template <typename ValueT, int EllPackSize>
1616 void MatrixInternal<ValueT, EllPackSize>::mult(ValueBufferType& x, ValueBufferType& y, sycl::queue& queue) const
1617 {
1618 // clang-format on
1619 if(this->m_N==1)
1620 {
1621 auto device = queue.get_device();
1622
1623 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1624 // getting the maximum work group size per thread
1625 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1626 // building the best number of global thread
1627 //auto total_threads = num_groups * ellpack_size;
1628
1629 // clang-format off
1630 //int NxN = this->m_NxN;
1631 std::size_t pack_size = ellpack_size;
1632 auto nrows = m_profile->getNRows();
1633 auto nnz = m_profile->getNnz();
1634
1635 auto internal_profile = m_profile->internal();
1636 auto& kcol = internal_profile->getKCol();
1637 auto& block_row_offset = internal_profile->getBlockRowOffset();
1638 auto& block_cols = internal_profile->getBlockCols();
1639
1640 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
1641 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
1642 auto total_threads = blocks_target * ellpack_size;
1643
1644 // COMPUTE VALUES
1645 // clang-format off
1646 queue.submit([&](sycl::handler& cgh)
1647 {
1648 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1649 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1650 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1651
1652
1653 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1654 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
1655#ifdef OLD
1656 //sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}};
1657 //cgh.parallel_for<class compute_mult>(r, [&](sycl::nd_item<1> item_id)
1658 cgh.parallel_for<class compute_mult>(
1659 sycl::range<1>{total_threads},
1660 [=] (sycl::item<1> item_id)
1661 {
1662 auto id = item_id.get_id(0);
1663 //auto local_id = item_id.get_local_id(0);
1664 //auto block_id = item_id.get_group(0) ;
1665 //auto global_id = item_id.get_global_id(0);
1666
1667 for (auto i = id; i < nrows; i += item_id.get_range()[0])
1668 {
1669 auto block_id = i/ellpack_size ;
1670 auto local_id = i%ellpack_size ;
1671 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
1672 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1673
1674 ValueType value = 0. ;
1675 for(int j=0;j<block_row_size;++j)
1676 {
1677 auto k = block_row_offset+j*ellpack_size+local_id ;
1678 value += access_values[k]* access_x[access_cols[k]] ;
1679 }
1680 access_y[i] = value ;
1681 }
1682 });
1683
1684#endif
1685 sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
1686 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
1687 cgh.parallel_for<class compute_mult>(r,
1688 [=](sycl::nd_item<1> item_id)
1689 {
1690 //auto id = item_id.get_id(0);
1691 auto local_id = item_id.get_local_id(0);
1692 //auto block_id = item_id.get_group(0) ;
1693 auto global_id = item_id.get_global_id(0);
1694
1695 //for (auto i = id; i < nrows; i += item_id.get_range()[0])
1696 for (auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
1697 {
1698 auto block_id = i/pack_size ;
1699 //auto local_id = i%ellpack_size ;
1700
1701 int begin = access_block_row_offset[block_id] ;
1702 int end = access_block_row_offset[block_id+1] ;
1703 ValueType value = 0. ;
1704 for(int k=begin;k<end;++k)
1705 {
1706 //auto k = block_row_offset+j*ellpack_size+local_id ;
1707 const int col = access_cols[k * pack_size + local_id];
1708 if(col>=0)
1709 lds_x[local_id] = access_x[col];
1710 item_id.barrier(sycl::access::fence_space::local_space);
1711 if(col>=0)
1712 value += access_values[k * pack_size + local_id] * lds_x[local_id] ;
1713 item_id.barrier(sycl::access::fence_space::local_space);
1714 }
1715 access_y[i] = value ;
1716 }
1717 });
1718 });
1719 // clang-format on
1720 }
1721 else
1722 {
1723 switch(this->m_N)
1724 {
1725 case 2:
1726 multN<2>(x,y,queue) ;
1727 break ;
1728 case 3:
1729 multN<3>(x,y,queue) ;
1730 break;
1731 case 4:
1732 multN<4>(x,y,queue) ;
1733 break;
1734 default:
1735 auto device = queue.get_device();
1736
1737 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1738 // getting the maximum work group size per thread
1739 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1740 // building the best number of global thread
1741 auto total_threads = num_groups * ellpack_size;
1742
1743 // clang-format off
1744 int N = this->m_N;
1745 //int NxN = this->m_NxN;
1746 std::size_t pack_size = ellpack_size;
1747 auto nrows = m_profile->getNRows();
1748 auto nnz = m_profile->getNnz();
1749
1750 auto internal_profile = m_profile->internal();
1751 auto& kcol = internal_profile->getKCol();
1752 auto& block_row_offset = internal_profile->getBlockRowOffset();
1753 auto& block_cols = internal_profile->getBlockCols();
1754
1755 queue.submit(
1756 [&](sycl::handler& cgh)
1757 {
1758 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1759 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1760 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1761
1762
1763 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1764 auto access_y = y.template get_access<sycl::access::mode::discard_write>(cgh);
1765
1766 //sycl::local_accessor<ValueT,1> local_mem(sycl::range<1>(ellpack_size*N), cgh);
1767 //sycl::nd_range<1> range{sycl::range<1>{total_threads},sycl::range<1>{ellpack_size}};
1768 sycl::range<1> range{total_threads} ;
1769 auto tile = Tile(N) ;
1770 //cgh.parallel_for<class compute_mult>(r, [&](sycl::nd_item<1> item_id)
1771 cgh.parallel_for<class compute_block_mult>(range,
1772 [=] (sycl::item<1> item_id /*sycl::nd_item<1> item_id*/)
1773 {
1774 auto id = item_id.get_id(0);
1775 //auto local_id = item_id.get_local_id(0);
1776 //auto block_id = item_id.get_group(0) ;
1777 //auto global_id = item_id.get_global_id(0);
1778
1779
1780
1781 for (auto i = id; i < nrows; i += item_id.get_range()[0] /*item_id.get_local_range(0)*/)
1782 {
1783 auto block_id = i/pack_size ;
1784 auto local_id = i%pack_size ;
1785
1786 std::size_t block_row_offset = access_block_row_offset[block_id] ;
1787 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1788
1789 for(int ieq=0;ieq<N;++ieq)
1790 {
1791 ValueType value = 0. ;
1792 for(std::size_t k=block_row_offset;k<block_row_offset+block_row_size;++k)
1793 {
1794 // value += access_values[k]* access_x[access_cols[k]] ;
1795 value += tile.mult(ieq,
1796 local_id,
1797 k,
1798 access_cols,
1799 access_values,
1800 access_x) ;
1801 }
1802 access_y[i*N+ieq] = value;
1803 }
1804 }
1805 });
1806 });
1807 }
1808 /*
1809 {
1810 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> x_acc(x);
1811 for(int il=0;il<nrows;++il)
1812 {
1813 std::cout<<"X["<<il<<"]:";
1814 for(int i=0;i<N;++i)
1815 std::cout<<x_acc[il*N+i]<<",";
1816 std::cout<<std::endl;
1817 }
1818 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> y_acc(y);
1819 for(int il=0;il<nrows;++il)
1820 {
1821 std::cout<<"Y["<<il<<"]:";
1822 for(int i=0;i<N;++i)
1823 std::cout<<y_acc[il*N+i]<<",";
1824 std::cout<<std::endl;
1825 }
1826
1827 auto tile = Tile(N) ;
1828 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> matrix_acc(m_values);
1829 sycl::host_accessor<int, 1, sycl::access::mode::read> cols_acc(block_cols);
1830 sycl::host_accessor<int, 1, sycl::access::mode::read> brow_offset_acc(block_row_offset);
1831 for(std::size_t irow=0;irow<nrows;++irow)
1832 {
1833 auto ib = irow/ellpack_size ;
1834 auto il = irow%ellpack_size ;
1835 for(int ieq=0;ieq<N;++ieq)
1836 {
1837 std::cout<<"LINE["<<il<<","<<ieq<<"] : ";
1838 ValueType value = 0. ;
1839 for(std::size_t k=brow_offset_acc[ib];k<brow_offset_acc[ib+1];++k)
1840 {
1841 // value += access_values[k]* access_x[access_cols[k]] ;
1842 value += tile.mult(ieq,
1843 il,
1844 k,
1845 cols_acc,
1846 matrix_acc,
1847 x_acc) ;
1848 }
1849 std::cout<<"\nY_CPU["<<irow<<","<<ieq<<"]:"<<value<<std::endl;
1850 }
1851 }
1852 }*/
1853 // clang-format on
1854 }
1855 }
1856
1857 template <typename ValueT, int EllPackSize>
1858 void MatrixInternal<ValueT, EllPackSize>::mult(ValueBufferType& x, ValueBufferType& y) const
1859 {
1860 this->mult(x, y, SYCLEnv::instance()->internal()->queue());
1861 }
1862
1863 template <typename ValueT, int EllPackSize>
1864 void MatrixInternal<ValueT, EllPackSize>::addExtMult(ValueBufferType& x,
1865 ValueBufferType& y,
1866 sycl::queue& queue) const
1867 {
1868 auto device = queue.get_device();
1869
1870 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1871 // getting the maximum work group size per thread
1872 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1873 // building the best number of global thread
1874 auto total_threads = num_groups * ellpack_size;
1875
1876 // clang-format off
1877 auto nrows = m_profile->getNRows();
1878 auto nnz = m_profile->getNnz();
1879
1880 auto interface_nrow = m_ext_profile->getNRows();
1881 auto ext_profile = m_ext_profile->internal();
1882 auto& kcol = ext_profile->getKCol();
1883 auto& block_row_offset = ext_profile->getBlockRowOffset();
1884 auto& block_cols = ext_profile->getBlockCols();
1885 // clang-format on
1886
1887 {
1888 // clang-format off
1889 queue.submit([&](sycl::handler& cgh)
1890 {
1891 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1892 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1893 auto access_values = m_ext_values->template get_access<sycl::access::mode::read>(cgh);
1894
1895 auto access_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
1896
1897 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1898 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
1899
1900 cgh.parallel_for<class compute_ext_mult>(
1901 sycl::range<1>{total_threads},
1902 [=] (sycl::item<1> item_id)
1903 {
1904 auto id = item_id.get_id(0);
1905
1906 for (auto i = id; i < interface_nrow; i += item_id.get_range()[0])
1907 {
1908 auto block_id = i/ellpack_size ;
1909 auto local_id = i%ellpack_size ;
1910
1911 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
1912 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1913
1914 ValueType value = 0. ;
1915 for(int j=0;j<block_row_size;++j)
1916 {
1917 auto k = block_row_offset+j*ellpack_size+local_id ;
1918 value += access_values[k]* access_x[access_cols[k]] ;
1919 }
1920 access_y[access_row_ids[i]] += value ;
1921 }
1922 });
1923 });
1924 // clang-format on
1925 }
1926 }
1927 template <typename ValueT, int EllPackSize>
1928 void MatrixInternal<ValueT, EllPackSize>::addExtMult(ValueBufferType& x,
1929 ValueBufferType& y) const
1930 {
1931 this->addExtMult(x, y, SYCLEnv::instance()->internal()->queue());
1932 }
1933
1934 template <typename ValueT, int EllPackSize>
1935 void MatrixInternal<ValueT, EllPackSize>::addLMult(ValueType alpha,
1936 ValueBufferType& x,
1937 ValueBufferType& y,
1938 sycl::queue& queue) const
1939 {
1940
1941 if(this->m_N==1)
1942 {
1943 auto device = queue.get_device();
1944
1945 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1946 // getting the maximum work group size per thread
1947 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1948
1949 std::size_t pack_size = ellpack_size;
1950 auto nrows = m_profile->getNRows();
1951 auto nnz = m_profile->getNnz();
1952
1953 auto internal_profile = m_profile->internal();
1954 auto& kcol = internal_profile->getKCol();
1955 auto& block_row_offset = internal_profile->getBlockRowOffset();
1956 auto& block_cols = internal_profile->getBlockCols();
1957
1958 auto& mask = internal_profile->getLowerMask();
1959
1960 // clang-format off
1961 queue.submit(
1962 [&](sycl::handler& cgh)
1963 {
1964 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1965 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1966 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
1967 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1968
1969
1970 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1971 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
1972
1973 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
1974 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
1975 auto total_threads = blocks_target * ellpack_size;
1976
1977 sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
1978 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
1979 //cgh.parallel_for<class compute_lmult>(sycl::range<1>{total_threads},[=] (sycl::item<1> item_id)
1980 cgh.parallel_for<class compute_lmult>(r,
1981 [=](sycl::nd_item<1> item_id)
1982 {
1983 //auto id = item_id.get_id(0);
1984 auto local_id = item_id.get_local_id(0);
1985 //auto block_id = item_id.get_group(0) ;
1986 auto global_id = item_id.get_global_id(0);
1987
1988 //for (auto i = id; i < nrows; i += item_id.get_range()[0])
1989 for (auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
1990 {
1991 auto block_id = i/pack_size ;
1992 //auto local_id = i%ellpack_size ;
1993
1994 int begin = access_block_row_offset[block_id] ;
1995 int end = access_block_row_offset[block_id+1] ;
1996
1997 ValueType value = access_y[i] ;
1998 for(int k=begin;k<end;++k)
1999 {
2000 //auto k = block_row_offset+j*ellpack_size+local_id ;
2001 const int col = access_cols[k * pack_size + local_id];
2002 if(col>=0)
2003 lds_x[local_id] = access_x[col];
2004 item_id.barrier(sycl::access::fence_space::local_space);
2005 if(access_mask[k * pack_size + local_id])
2006 value += alpha * access_values[k * pack_size + local_id] * lds_x[local_id] ;
2007 item_id.barrier(sycl::access::fence_space::local_space);
2008 }
2009 access_y[i] = value ;
2010 }
2011 });
2012 });
2013 }
2014 else
2015 {
2016 switch(this->m_N)
2017 {
2018 case 2:
2019 addLMultN<2>(alpha,x,y,queue);
2020 break;
2021 case 3:
2022 addLMultN<3>(alpha,x,y,queue);
2023 break;
2024 case 4:
2025 addLMultN<4>(alpha,x,y,queue);
2026 break;
2027 default:
2028 {
2029 auto device = queue.get_device();
2030
2031 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2032 // getting the maximum work group size per thread
2033 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2034 auto total_threads = num_groups * ellpack_size;
2035
2036 int N = this->m_N;
2037 int NxN = this->m_NxN;
2038 std::size_t pack_size = ellpack_size;
2039 auto nrows = m_profile->getNRows();
2040 auto nnz = m_profile->getNnz();
2041
2042 auto internal_profile = m_profile->internal();
2043 auto& kcol = internal_profile->getKCol();
2044 auto& block_row_offset = internal_profile->getBlockRowOffset();
2045 auto& block_cols = internal_profile->getBlockCols();
2046
2047 auto& mask = internal_profile->getLowerMask();
2048
2049 // clang-format off
2050 queue.submit(
2051 [&](sycl::handler& cgh)
2052 {
2053 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2054 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2055 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
2056 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2057
2058 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
2059 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2060 sycl::range<1> range{total_threads} ;
2061 auto tile = Tile(N) ;
2062 cgh.parallel_for<class compute_block_lmult>(
2063 sycl::range<1>{total_threads},
2064 [=] (sycl::item<1> item_id)
2065 {
2066 auto id = item_id.get_id(0);
2067 //auto local_id = item_id.get_local_id(0);
2068 //auto block_id = item_id.get_group(0) ;
2069 //auto global_id = item_id.get_global_id(0);
2070
2071 for (auto i = id; i < nrows; i += item_id.get_range()[0])
2072 {
2073 auto block_id = i/ellpack_size ;
2074 auto local_id = i%ellpack_size ;
2075
2076 int block_row_offset = access_block_row_offset[block_id] ;
2077 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2078
2079 for(int ieq=0;ieq<N;++ieq)
2080 {
2081 ValueType value = 0. ;
2082 for(std::size_t k=block_row_offset;k<block_row_offset+block_row_size;++k)
2083 {
2084 // value += access_values[k]* access_x[access_cols[k]] ;
2085 value += tile.mult(ieq,
2086 local_id,
2087 k,
2088 access_cols,
2089 access_mask,
2090 access_values,
2091 access_x) ;
2092 }
2093 access_y[i*N+ieq] += alpha*value;
2094 }
2095 }
2096 });
2097 });
2098 }
2099 break;
2100 }
2101 }
2102
2103#ifdef PRINT_DEBUG_INFO
2104 {
2105 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(m_values);
2106 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2107 sycl::host_accessor<uint8_t, 1, sycl::access::mode::read> mask_acc(mask) ;
2108 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> x_acc(x);
2109 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> y_acc(y);
2110 for(int irow=0;irow<nrows;++irow)
2111 {
2112 std::cout<<"XK["<<irow<<"]=\n";
2113 for(int i=0;i<N;++i)
2114 std::cout<<x_acc[irow*N+i]<<"\n";
2115 }
2116 for(int irow=0;irow<nrows;++irow)
2117 {
2118 auto ib=irow/ellpack_size ;
2119 auto il=irow%ellpack_size ;
2120 std::cout<<"LSOLVE MAT["<<irow<<"]:"<<std::endl;
2121 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2122 {
2123 if(mask_acc[k*ellpack_size+il])
2124 {
2125 for(int i=0;i<N;++i)
2126 {
2127 for(int j=0;j<N;++j)
2128 std::cout<<h_acc[(k*NxN+i*N+j)*ellpack_size+il]<<" ";
2129 std::cout<<std::endl;
2130 }
2131 }
2132 }
2133 std::cout<<"Y ="<<std::endl;
2134 for(int i=0;i<N;++i)
2135 std::cout<<y_acc[irow*N+i]<<"\n";
2136 }
2137 }
2138#endif
2139 }
2140
2141 template <typename ValueT, int EllPackSize>
2142 void MatrixInternal<ValueT, EllPackSize>::addLMult(ValueType alpha, ValueBufferType& x, ValueBufferType& y) const
2143 {
2144 this->addLMult(alpha, x, y, SYCLEnv::instance()->internal()->queue());
2145 }
2146
2147 template <typename ValueT, int EllPackSize>
2148 void MatrixInternal<ValueT, EllPackSize>::addUMult(ValueType alpha,
2149 ValueBufferType& x,
2150 ValueBufferType& y,
2151 sycl::queue& queue) const
2152 {
2153 if(this->m_N==1)
2154 {
2155 auto device = queue.get_device();
2156
2157 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2158 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2159
2160 // clang-format off
2161 int N = this->m_N;
2162 int NxN = this->m_NxN;
2163 std::size_t pack_size = ellpack_size;
2164 auto nrows = m_profile->getNRows() ;
2165 auto nnz = m_profile->getNnz() ;
2166
2167 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
2168 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
2169 auto total_threads = blocks_target * ellpack_size;
2170
2171 auto internal_profile = m_profile->internal() ;
2172 auto& kcol = internal_profile->getKCol() ;
2173 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2174 auto& block_cols = internal_profile->getBlockCols() ;
2175 auto& mask = internal_profile->getUpperMask() ;
2176
2177 // COMPUTE VALUES
2178 queue.submit(
2179 [&](sycl::handler& cgh)
2180 {
2181 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2182 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2183 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
2184 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2185
2186
2187 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
2188 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2189
2190 sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
2191 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
2192 //cgh.parallel_for<class compute_umult>(sycl::range<1>{total_threads},[=] (sycl::item<1> item_id)
2193 cgh.parallel_for<class compute_umult>(r,
2194 [=](sycl::nd_item<1> item_id)
2195 {
2196 //auto id = item_id.get_id(0);
2197 auto local_id = item_id.get_local_id(0);
2198 //auto block_id = item_id.get_group(0) ;
2199 auto global_id = item_id.get_global_id(0);
2200
2201 for (auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
2202 {
2203 auto block_id = i/pack_size ;
2204 //auto local_id = i%ellpack_size ;
2205
2206 auto begin = access_block_row_offset[block_id] ;
2207 auto end = access_block_row_offset[block_id+1] ;
2208 //auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2209
2210 ValueType value = access_y[i] ;
2211 for(int k=begin;k<end;++k)
2212 {
2213 //auto k = block_row_offset+j*ellpack_size+local_id ;
2214 //value += alpha * access_mask[k] * access_values[k]* access_x[access_cols[k]] ;
2215 const int col = access_cols[k * pack_size + local_id];
2216 if(col>=0)
2217 lds_x[local_id] = access_x[col];
2218 item_id.barrier(sycl::access::fence_space::local_space);
2219 if(access_mask[k * pack_size + local_id])
2220 value += alpha * access_values[k * pack_size + local_id] * lds_x[local_id] ;
2221 item_id.barrier(sycl::access::fence_space::local_space);
2222 }
2223 access_y[i] = value ;
2224 }
2225 });
2226 });
2227 }
2228 else
2229 {
2230 switch(this->m_N)
2231 {
2232 case 2:
2233 addUMultN<2>(alpha,x,y,queue);
2234 break;
2235 case 3:
2236 addUMultN<3>(alpha,x,y,queue);
2237 break;
2238 case 4:
2239 addUMultN<4>(alpha,x,y,queue);
2240 break;
2241 default:
2242 {
2243 auto device = queue.get_device();
2244
2245 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2246 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2247 auto total_threads = num_groups * ellpack_size;
2248
2249 // clang-format off
2250 int N = this->m_N;
2251 int NxN = this->m_NxN;
2252 std::size_t pack_size = ellpack_size;
2253 auto nrows = m_profile->getNRows() ;
2254 auto nnz = m_profile->getNnz() ;
2255
2256 auto internal_profile = m_profile->internal() ;
2257 auto& kcol = internal_profile->getKCol() ;
2258 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2259 auto& block_cols = internal_profile->getBlockCols() ;
2260 auto& mask = internal_profile->getUpperMask() ;
2261
2262 // COMPUTE VALUES
2263 queue.submit(
2264 [&](sycl::handler& cgh)
2265 {
2266 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2267 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2268 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
2269 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2270
2271
2272 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
2273 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2274
2275 auto tile = Tile(N) ;
2276 cgh.parallel_for<class compute_block_umult>(
2277 sycl::range<1>{total_threads},
2278 [=] (sycl::item<1> item_id)
2279 {
2280 auto id = item_id.get_id(0);
2281 //auto local_id = item_id.get_local_id(0);
2282 //auto block_id = item_id.get_group(0) ;
2283 //auto global_id = item_id.get_global_id(0);
2284
2285 for (auto i = id; i < nrows; i += item_id.get_range()[0])
2286 {
2287 auto block_id = i/ellpack_size ;
2288 auto local_id = i%ellpack_size ;
2289
2290 int block_row_offset = access_block_row_offset[block_id] ;
2291 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2292 for(int ieq=0;ieq<N;++ieq)
2293 {
2294 ValueType value = 0. ;
2295 for(std::size_t k=block_row_offset;k<block_row_offset+block_row_size;++k)
2296 {
2297 // value += access_values[k]* access_x[access_cols[k]] ;
2298 value += tile.mult(ieq,
2299 local_id,
2300 k,
2301 access_cols,
2302 access_mask,
2303 access_values,
2304 access_x) ;
2305 }
2306 access_y[i*N+ieq] += alpha*value;
2307 }
2308 }
2309 });
2310 });
2311 }
2312 break;
2313 }
2314 }
2315 // clang-format on
2316
2317#ifdef PRINT_DEBUG_INFO
2318 {
2319 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(m_values);
2320 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2321 sycl::host_accessor<uint8_t, 1, sycl::access::mode::read> mask_acc(mask) ;
2322 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> x_acc(x);
2323 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> y_acc(y);
2324 for(int irow=0;irow<nrows;++irow)
2325 {
2326 std::cout<<"XK["<<irow<<"]=\n";
2327 for(int i=0;i<N;++i)
2328 std::cout<<x_acc[irow*N+i]<<"\n";
2329 }
2330 for(int irow=0;irow<nrows;++irow)
2331 {
2332 auto ib=irow/ellpack_size ;
2333 auto il=irow%ellpack_size ;
2334 std::cout<<"USOLVE MAT["<<irow<<"]:"<<std::endl;
2335 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2336 {
2337 if(mask_acc[k*ellpack_size+il])
2338 {
2339 for(int i=0;i<N;++i)
2340 {
2341 for(int j=0;j<N;++j)
2342 std::cout<<h_acc[(k*NxN+i*N+j)*ellpack_size+il]<<" ";
2343 std::cout<<std::endl;
2344 }
2345 }
2346 }
2347 std::cout<<"Y ="<<std::endl;
2348 for(int i=0;i<N;++i)
2349 std::cout<<y_acc[irow*N+i]<<"\n";
2350 }
2351 }
2352#endif
2353 }
2354
2355 template <typename ValueT, int EllPackSize>
2356 void MatrixInternal<ValueT, EllPackSize>::addUMult(ValueType alpha, ValueBufferType& x, ValueBufferType& y) const
2357 {
2358 this->addUMult(alpha, x, y, SYCLEnv::instance()->internal()->queue());
2359 }
2360
2361 template <typename ValueT, int EllPackSize>
2362 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& x, ValueBufferType& y, sycl::queue& queue) const
2363 {
2364 }
2365
2366 template <typename ValueT, int EllPackSize>
2367 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& x, ValueBufferType& y) const
2368 {
2369 this->multDiag(x, y, SYCLEnv::instance()->internal()->queue());
2370 }
2371
2372 template <typename ValueT, int EllPackSize>
2373 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& y, sycl::queue& queue) const
2374 {
2375 throw NotImplementedException(
2376 A_FUNCINFO, "MatrixInternal<ValueT, EllPackSize>::multDiag not implemented");
2377 }
2378
2379 template <typename ValueT, int EllPackSize>
2380 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& y) const
2381 {
2382 this->multDiag(y, SYCLEnv::instance()->internal()->queue());
2383 }
2384
2385 template <typename ValueT, int EllPackSize>
2386 void MatrixInternal<ValueT, EllPackSize>::computeDiag(ValueBufferType& y, sycl::queue& queue) const
2387 {
2388
2389 auto device = queue.get_device();
2390
2391 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2392 // getting the maximum work group size per thread
2393 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2394 // building the best number of global thread
2395 auto total_threads = num_groups * ellpack_size;
2396
2397 // clang-format off
2398 int N = this->m_N;
2399 int NxN = this->m_NxN;
2400 int pack_size = ellpack_size ;
2401 auto nrows = m_profile->getNRows() ;
2402 auto nnz = m_profile->getNnz() ;
2403
2404 auto internal_profile = m_profile->internal() ;
2405 auto& kcol = internal_profile->getKCol() ;
2406 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2407 auto& block_cols = internal_profile->getBlockCols() ;
2408
2409 // COMPUTE VALUES
2410 queue.submit([&](sycl::handler& cgh)
2411 {
2412 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2413 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2414 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2415 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2416 if(N==1)
2417 {
2418 cgh.parallel_for<class compute_diag>(
2419 sycl::range<1>{total_threads},
2420 [=] (sycl::item<1> item_id)
2421 {
2422 auto id = item_id.get_id(0);
2423 //auto local_id = item_id.get_local_id(0);
2424 //auto block_id = item_id.get_group(0) ;
2425 //auto global_id = item_id.get_global_id(0);
2426
2427 for (auto i = id; i < nrows; i += item_id.get_range()[0])
2428 {
2429 auto block_id = i/pack_size ;
2430 auto local_id = i%pack_size ;
2431
2432 auto begin = access_block_row_offset[block_id] ;
2433 auto end = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2434 for(int k=begin;k<end;++k)
2435 {
2436 auto kcol = k*pack_size+local_id ;
2437 if((access_cols[kcol])==int(i))
2438 access_y[i] = access_values[kcol] ;
2439 }
2440 }
2441 });
2442 }
2443 else
2444 {
2445 cgh.parallel_for<class compute_diagN>(
2446 sycl::range<1>{total_threads},
2447 [=] (sycl::item<1> item_id)
2448 {
2449 auto id = item_id.get_id(0);
2450 //auto local_id = item_id.get_local_id(0);
2451 //auto block_id = item_id.get_group(0) ;
2452 //auto global_id = item_id.get_global_id(0);
2453
2454 for (std::size_t i = id; i < nrows; i += item_id.get_range()[0])
2455 {
2456 auto block_id = i/pack_size ;
2457 auto local_id = i%pack_size ;
2458
2459 int block_row_offset = access_block_row_offset[block_id] ;
2460 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2461 for(int k=0;k<block_row_size;++k)
2462 {
2463 auto kcol = (block_row_offset+k)*pack_size+local_id ;
2464 std::size_t col = access_cols[kcol] ;
2465 if(col==i)
2466 {
2467 for(int ieq=0;ieq<N;++ieq)
2468 {
2469 auto kval = (block_row_offset+k)*NxN + ieq*N + ieq ;
2470 access_y[i*N+ieq] = access_values[kval*pack_size+local_id] ;
2471 }
2472 }
2473 }
2474 }
2475 });
2476 }
2477 });
2478 /*
2479 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(y);
2480 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> mat_acc(m_values);
2481 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2482 sycl::host_accessor<int, 1, sycl::access::mode::read> cols_acc(block_cols);
2483 for(int irow=0;irow<nrows;++irow)
2484 {
2485 auto ib=irow/ellpack_size ;
2486 auto il=irow%ellpack_size ;
2487 std::cout<<"MAT["<<irow<<","<<ib<<"]:";
2488 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2489 {
2490 std::cout<<"("<<cols_acc[k*ellpack_size+il]<<",";
2491 for(int i=0;i<N;++i)
2492 for(int j=0;j<N;++j)
2493 std::cout<<mat_acc[(k*NxN+i*N+j)*ellpack_size+il]<<",";
2494 std::cout<<")"<<std::endl;
2495 }
2496 }
2497 for(int irow=0;irow<nrows;++irow)
2498 {
2499 auto ib=irow/ellpack_size ;
2500 auto il=irow%ellpack_size ;
2501 std::cout<<"DIAG["<<irow<<","<<il<<"]:";
2502 for(int i=0;i<N;++i)
2503 std::cout<<h_acc[il*N+i]<<",";
2504 std::cout<<std::endl;
2505 }*/
2506 // clang-format on
2507 }
2508
2509 template <typename ValueT, int EllPackSize>
2510 void MatrixInternal<ValueT, EllPackSize>::computeBlockDiag(ValueBufferType& y, sycl::queue& queue) const
2511 {
2512
2513 auto device = queue.get_device();
2514
2515 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2516 // getting the maximum work group size per thread
2517 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2518 // building the best number of global thread
2519 auto total_threads = num_groups * ellpack_size;
2520
2521 // clang-format off
2522 int N = this->m_N;
2523 int NxN = this->m_NxN;
2524 int pack_size = ellpack_size ;
2525 auto nrows = m_profile->getNRows() ;
2526 auto nnz = m_profile->getNnz() ;
2527
2528 assert(y.size()>=NxN*nrows) ;
2529
2530 auto internal_profile = m_profile->internal() ;
2531 auto& kcol = internal_profile->getKCol() ;
2532 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2533 auto& block_cols = internal_profile->getBlockCols() ;
2534 {
2535 ValueBufferType buffer (sycl::range<1>(nrows*NxN)) ;
2536 // COMPUTE VALUES
2537 queue.submit([&](sycl::handler& cgh)
2538 {
2539 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2540 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2541 auto matrix = m_values.template get_access<sycl::access::mode::read>(cgh);
2542 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2543 auto A = buffer.template get_access<sycl::access::mode::read_write>(cgh);
2544
2545 auto tile = Tile(N) ;
2546 cgh.parallel_for<class compute_block_diag>(
2547 sycl::range<1>{total_threads},
2548 [=] (sycl::item<1> item_id)
2549 {
2550 auto id = item_id.get_id(0);
2551 //auto local_id = item_id.get_local_id(0);
2552 //auto block_id = item_id.get_group(0) ;
2553 //auto global_id = item_id.get_global_id(0);
2554
2555 for (std::size_t global_id = id; global_id < nrows; global_id += item_id.get_range()[0])
2556 {
2557 auto block_id = global_id/pack_size ;
2558 auto local_id = global_id%pack_size ;
2559
2560 int block_row_offset = access_block_row_offset[block_id] ;
2561 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2562 for(int k=0;k<block_row_size;++k)
2563 {
2564 auto kcol = (block_row_offset+k)*ellpack_size+local_id ;
2565 std::size_t col = access_cols[kcol] ;
2566 if(col==global_id)
2567 {
2568 // Copy Diag Matrix in A
2569 for(int i=0;i<N;++i)
2570 for(int j=0;j<N;++j)
2571 A[tile._ij(local_id,i,j)] = matrix[tile._ijk(k,i,j)+local_id] ;
2572 }
2573 }
2574 }
2575 });
2576 });
2577 }
2578 // clang-format on
2579 }
2580
2581 template <typename ValueT, int EllPackSize>
2582 void MatrixInternal<ValueT, EllPackSize>::computeDiag(ValueBufferType& y) const
2583 {
2584 this->computeDiag(y, SYCLEnv::instance()->internal()->queue());
2585 }
2586
2587 template <typename ValueT, int EllPackSize>
2588 void MatrixInternal<ValueT, EllPackSize>::computeBlockDiag(ValueBufferType& y) const
2589 {
2590 this->computeBlockDiag(y, SYCLEnv::instance()->internal()->queue());
2591 }
2592
2593
2594 template <typename ValueT, int EllPackSize>
2595 void MatrixInternal<ValueT, EllPackSize>::multInvDiag(ValueBufferType& y, sycl::queue& queue) const
2596 {
2597 throw NotImplementedException(
2598 A_FUNCINFO, "MatrixInternal<ValueT, EllPackSize>::multInvDiag not implemented");
2599 }
2600
2601 template <typename ValueT, int EllPackSize>
2602 void MatrixInternal<ValueT, EllPackSize>::multInvDiag(ValueBufferType& y) const
2603 {
2604 this->multInvDiag(y, SYCLEnv::instance()->internal()->queue());
2605 }
2606
2607 template <typename ValueT, int EllPackSize>
2608 void MatrixInternal<ValueT, EllPackSize>::computeInvDiag(ValueBufferType& y, sycl::queue& queue) const
2609 {
2610
2611 auto device = queue.get_device();
2612
2613 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2614 // getting the maximum work group size per thread
2615 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2616
2617 // clang-format off
2618 int N = this->m_N;
2619 int NxN = this->m_NxN;
2620 std::size_t pack_size = ellpack_size ;
2621 auto nrows = m_profile->getNRows() ;
2622 auto nnz = m_profile->getNnz() ;
2623
2624 auto internal_profile = m_profile->internal() ;
2625 auto& kcol = internal_profile->getKCol() ;
2626 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2627 auto& block_cols = internal_profile->getBlockCols() ;
2628
2629 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
2630 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
2631 auto total_threads = blocks_target * ellpack_size;
2632
2633 // COMPUTE VALUES
2634 queue.submit([&](sycl::handler& cgh)
2635 {
2636 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2637 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2638 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2639 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2640 if(N==1)
2641 {
2642 //sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
2643 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
2644
2645 cgh.parallel_for<class compute_inv_diag>(r,
2646 [=](sycl::nd_item<1> item_id)
2647 {
2648 auto local_id = item_id.get_local_id(0);
2649 auto global_id = item_id.get_global_id(0);
2650 for (auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
2651 {
2652 auto block_id = i/pack_size ;
2653 //auto local_id = i%ellpack_size ;
2654
2655 int begin = access_block_row_offset[block_id];
2656 int end = access_block_row_offset[block_id+1];
2657 for(int k=begin;k<end;++k)
2658 {
2659 auto kcol = k*pack_size+local_id ;
2660 if((access_cols[kcol])==int(i) && (access_values[kcol]!=0) )
2661 access_y[i] = 1./access_values[kcol] ;
2662 }
2663 }
2664 });
2665
2666 /*
2667 cgh.parallel_for<class compute_inv_diag>(
2668 sycl::range<1>{total_threads},
2669 [=] (sycl::item<1> item_id)
2670 {
2671 auto id = item_id.get_id(0);
2672 //auto local_id = item_id.get_local_id(0);
2673 //auto block_id = item_id.get_group(0) ;
2674 //auto global_id = item_id.get_global_id(0);
2675
2676 for (auto i = id; i < nrows; i += item_id.get_range()[0])
2677 {
2678 auto block_id = i/ellpack_size ;
2679 auto local_id = i%ellpack_size ;*/
2680
2681 }
2682 else
2683 {
2684 //sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
2685 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
2686 cgh.parallel_for<class compute_inv_diagN>(r,
2687 [=](sycl::nd_item<1> item_id)
2688 {
2689 auto local_id = item_id.get_local_id(0);
2690 auto global_id = item_id.get_global_id(0);
2691 for (auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
2692 {
2693 auto block_id = i/pack_size ;
2694
2695 int begin = access_block_row_offset[block_id];
2696 int end = access_block_row_offset[block_id+1];
2697
2698 for(int k=begin;k<end;++k)
2699 {
2700 auto kcol = k*pack_size+local_id ;
2701 std::size_t col = access_cols[kcol] ;
2702 if(col==i)
2703 {
2704 for(int ieq=0;ieq<N;++ieq)
2705 {
2706 auto kval = k*NxN + ieq*N + ieq ;
2707 auto diag = access_values[kval*pack_size+local_id] ;
2708 if(diag!=0)
2709 access_y[i*N+ieq] = 1./diag;
2710 else
2711 access_y[i*N+ieq] = 1.;
2712 }
2713 }
2714 }
2715 }
2716 });
2717
2718 /*
2719 cgh.parallel_for<class compute_inv_diagN>(
2720 sycl::range<1>{total_threads},
2721 [=] (sycl::item<1> item_id)
2722 {
2723 auto id = item_id.get_id(0);
2724 //auto local_id = item_id.get_local_id(0);
2725 //auto block_id = item_id.get_group(0) ;
2726 //auto global_id = item_id.get_global_id(0);
2727
2728 for (std::size_t i = id; i < nrows; i += item_id.get_range()[0])
2729 {
2730 auto block_id = i/pack_size ;
2731 auto local_id = i%pack_size ;
2732*/
2733 }
2734 });
2735 /*
2736 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(y);
2737 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> mat_acc(m_values);
2738 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2739 sycl::host_accessor<int, 1, sycl::access::mode::read> cols_acc(block_cols);
2740 for(int irow=0;irow<nrows;++irow)
2741 {
2742 auto ib=irow/ellpack_size ;
2743 auto il=irow%ellpack_size ;
2744 std::cout<<"MAT["<<irow<<","<<ib<<"]:";
2745 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2746 {
2747 std::cout<<"("<<cols_acc[k*ellpack_size+il]<<",";
2748 for(int i=0;i<N;++i)
2749 for(int j=0;j<N;++j)
2750 std::cout<<mat_acc[(k*NxN+i*N+j)*ellpack_size+il]<<",";
2751 std::cout<<")"<<std::endl;
2752 }
2753 }
2754 for(int irow=0;irow<nrows;++irow)
2755 {
2756 auto ib=irow/ellpack_size ;
2757 auto il=irow%ellpack_size ;
2758 std::cout<<"DIAG["<<irow<<","<<il<<"]:";
2759 for(int i=0;i<N;++i)
2760 std::cout<<h_acc[il*N+i]<<",";
2761 std::cout<<std::endl;
2762 }*/
2763 // clang-format on
2764 }
2765
2766 template <typename ValueT, int EllPackSize>
2767 void MatrixInternal<ValueT, EllPackSize>::computeInvBlockDiag(ValueBufferType& y, sycl::queue& queue) const
2768 {
2769
2770 auto device = queue.get_device();
2771
2772 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2773 // getting the maximum work group size per thread
2774 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2775 // building the best number of global thread
2776 auto total_threads = num_groups * ellpack_size;
2777
2778 // clang-format off
2779 int N = this->m_N;
2780 int NxN = this->m_NxN;
2781 int pack_size = ellpack_size ;
2782 auto nrows = m_profile->getNRows() ;
2783 auto nnz = m_profile->getNnz() ;
2784
2785 assert(y.size()>=NxN*nrows) ;
2786
2787 auto internal_profile = m_profile->internal() ;
2788 auto& kcol = internal_profile->getKCol() ;
2789 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2790 auto& block_cols = internal_profile->getBlockCols() ;
2791 {
2792 ValueBufferType buffer (sycl::range<1>(internal_profile->m_block_nrows*NxN*pack_size)) ;
2793 // COMPUTE VALUES
2794 queue.submit([&](sycl::handler& cgh)
2795 {
2796 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2797 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2798 auto matrix = m_values.template get_access<sycl::access::mode::read>(cgh);
2799 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2800 auto A = buffer.template get_access<sycl::access::mode::read_write>(cgh);
2801
2802 auto lu = LU<decltype(matrix),decltype(access_y),decltype(A)>{N,matrix} ;
2803 //sycl::local_accessor<ValueT,1> local_mem(sycl::range<1>(pack_size*NxN), cgh);
2804
2805 cgh.parallel_for<class compute_inv_block_diag>(
2806 sycl::range<1>{total_threads},
2807 [=] (sycl::item<1> item_id)
2808 {
2809 auto id = item_id.get_id(0);
2810 //auto local_id = item_id.get_local_id(0);
2811 //auto block_id = item_id.get_group(0) ;
2812 //auto global_id = item_id.get_global_id(0);
2813
2814 for (std::size_t global_id = id; global_id < nrows; global_id += item_id.get_range()[0])
2815 {
2816 auto block_id = global_id/pack_size ;
2817 auto local_id = global_id%pack_size ;
2818
2819 int begin = access_block_row_offset[block_id];
2820 int end = access_block_row_offset[block_id+1];
2821 for(int k=begin;k<end;++k)
2822 {
2823 auto kcol = k*pack_size+local_id ;
2824 std::size_t col = access_cols[kcol] ;
2825 if(col==global_id)
2826 {
2827 lu.factorize(global_id,
2828 local_id,
2829 block_id,
2830 k,
2831 A);
2832
2833 lu.inverse(global_id,
2834 local_id,
2835 block_id,
2836 A,
2837 access_y);
2838 /*
2839 {
2840 // Copy Diag Matrix in A
2841 for(int i=0;i<N;++i)
2842 for(int j=0;j<N;++j)
2843 A[tile._ij(local_id,i,j)] = matrix[tile._ijk(k,i,j)+local_id] ;
2844
2845 //Factorize A = LU
2846 for (int k = 0; k < N; ++k)
2847 {
2848 assert(A[tile._ij(local_id,k,k)] != 0);
2849 A[tile._ij(global_id,k,k)] = 1 / A[tile._ij(global_id,k,k)];
2850 for (int i = k + 1; i < N; ++i) {
2851 A[tile._ij(global_id,i,k)] *= A[tile._ij(global_id,k,k)];
2852 }
2853 for (int i = k + 1; i < N; ++i) {
2854 for (int j = k + 1; j < N; ++j) {
2855 A[tile._ij(global_id,i,j)] -= A[tile._ij(global_id,i,k)] * A[tile._ij(global_id,k,j)];
2856 }
2857 }
2858 }
2859
2860 // SET Y to Id
2861 for(int i=0;i<N;++i)
2862 for(int j=0;j<N;++j)
2863 access_y[tile._ij(global_id,i,j)] = 0. ;
2864 for(int i=0;i<m_N;++i)
2865 access_y[tile._ij(global_id,i,i)] = 1. ;
2866
2867 // L solve
2868 for (int i = 1; i < N; ++i)
2869 {
2870 for (int j = 0; j < i; ++j)
2871 {
2872 for(int k=0;k<N;++k)
2873 access_y[tile._ij(global_id,i,k)] -= A[tile._ij(global_id,i,j)] * access_y[tile._ij(global_id,j,k)];
2874 }
2875 }
2876
2877 // U solve
2878 for (int i = N - 1; i >= 0; --i)
2879 {
2880 for (int j = N - 1; j > i; --j)
2881 {
2882 for(int k=0;k<N;++k)
2883 access_y[tile._ij(global_id,i,k)] -= A[tile._ij(global_id,i,j)] * access_y[tile._ij(global_id,j,k)];
2884 }
2885 for(int k=0;k<N;++k)
2886 access_y[tile._ij(global_id,i,k)] *= A[tile._ij(global_id,i,i)];
2887 }
2888 }*/
2889 }
2890 }
2891 }
2892 });
2893 });
2894 }
2895#ifdef PRINT_DEBUG_INFO
2896 {
2897 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(y);
2898 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> mat_acc(m_values);
2899 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2900 sycl::host_accessor<int, 1, sycl::access::mode::read> cols_acc(block_cols);
2901 std::vector<ValueT> test_y(nrows*NxN) ;
2902 std::vector<ValueT> test_A(internal_profile->m_block_nrows*NxN*pack_size) ;
2903 auto lu = LU<decltype(mat_acc),decltype(test_y.data()),decltype(test_A.data())>{N,mat_acc} ;
2904 for(int irow=0;irow<nrows;++irow)
2905 {
2906 auto ib=irow/ellpack_size ;
2907 auto il=irow%ellpack_size ;
2908 alien_info([&] {
2909 cout() <<"MAT["<<irow<<","<<ib<<"]:";
2910 });
2911 for(int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2912 {
2913 alien_info([&] {
2914 cout() <<"\tCOLS["<<k<<"] = "<<cols_acc[k*pack_size+il];
2915 });
2916 /*
2917 for(int i=0;i<N;++i)
2918 {
2919 for(int j=0;j<N;++j)
2920 std::cout<<mat_acc[(k*NxN+i*N+j)*pack_size+il]<<" ";
2921 std::cout<<std::endl;
2922 }*/
2923 if(cols_acc[k*pack_size+il]==irow)
2924 {
2925
2926 alien_info([&] {
2927 cout() <<"\tCOMPUTE INVERSE["<<cols_acc[k*pack_size+il]<<"]" ;
2928 for(int i=0;i<N;++i)
2929 {
2930 for(int j=0;j<N;++j)
2931 cout()<<mat_acc[(k*NxN+i*N+j)*pack_size+il]<<" ";
2932 }
2933 });
2934 lu.factorize(irow,
2935 il,
2936 ib,
2937 k,
2938 test_A.data());
2939
2940 lu.inverse(irow,
2941 il,
2942 ib,
2943 test_A.data(),
2944 test_y.data());
2945
2946 alien_info([&] {
2947 for(int i=0;i<N;++i)
2948 {
2949 for(int j=0;j<N;++j)
2950 std::cout<<test_y[irow*NxN+i*N+j]<<",";
2951 std::cout<<std::endl;
2952 }
2953 });
2954 }
2955 }
2956 }
2957 for(int irow=0;irow<nrows;++irow)
2958 {
2959 auto ib=irow/ellpack_size ;
2960 auto il=irow%ellpack_size ;
2961 alien_info([&] {
2962 cout()<<"INV BLOCK DIAG["<<irow<<","<<il<<"]:\n";
2963 for(int i=0;i<N;++i)
2964 {
2965 for(int j=0;j<N;++j)
2966 cout()<<h_acc[irow*NxN+i*N+j]<<",";
2967 //std::cout<<std::endl;
2968 }
2969 });
2970 }
2971 }
2972#endif
2973 // clang-format on
2974 }
2975
2976 template <typename ValueT, int EllPackSize>
2977 void MatrixInternal<ValueT, EllPackSize>::computeInvDiag(ValueBufferType& y) const
2978 {
2979 this->computeInvDiag(y, SYCLEnv::instance()->internal()->queue());
2980 }
2981
2982 template <typename ValueT, int EllPackSize>
2983 void MatrixInternal<ValueT, EllPackSize>::computeInvBlockDiag(ValueBufferType& y) const
2984 {
2985 this->computeInvBlockDiag(y, SYCLEnv::instance()->internal()->queue());
2986 }
2987
2988
2989 template <typename ValueT, int EllPackSize>
2990 void MatrixInternal<ValueT, EllPackSize>::scal(ValueBufferType& y, sycl::queue& queue)
2991 {
2992
2993 auto device = queue.get_device();
2994
2995 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2996 // getting the maximum work group size per thread
2997 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2998 // building the best number of global thread
2999 auto total_threads = num_groups * ellpack_size;
3000
3001 // clang-format off
3002 auto nrows = m_profile->getNRows() ;
3003 auto nnz = m_profile->getNnz() ;
3004
3005 auto internal_profile = m_profile->internal() ;
3006 auto& kcol = internal_profile->getKCol() ;
3007 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
3008 auto& block_cols = internal_profile->getBlockCols() ;
3009 {
3010 // COMPUTE VALUES
3011 queue.submit([&](sycl::handler& cgh)
3012 {
3013 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3014 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
3015 auto access_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
3016 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
3017
3018
3019 cgh.parallel_for<class scal_matrix>(sycl::range<1>{total_threads},
3020 [=] (sycl::item<1> item_id)
3021 {
3022 auto id = item_id.get_id(0);
3023 //auto local_id = item_id.get_local_id(0);
3024 //auto block_id = item_id.get_group(0) ;
3025 //auto global_id = item_id.get_global_id(0);
3026
3027 for (auto i = id; i < nrows; i += item_id.get_range()[0])
3028 {
3029 auto block_id = i/ellpack_size ;
3030 auto local_id = i%ellpack_size ;
3031
3032 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
3033 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
3034 for(int j=0;j<block_row_size;++j)
3035 {
3036 auto k = block_row_offset+j*ellpack_size+local_id ;
3037 access_values[k] *= access_y[i] ;
3038 }
3039 }
3040 });
3041 });
3042 }
3043
3044 if(m_ext_profile)
3045 {
3046
3047 auto interface_nrows = m_ext_profile->getNRows();
3048 auto ext_nnz = m_ext_profile->getNnz();
3049 auto ext_block_nnz = m_ext_profile->getBlockNnz();
3050
3051 auto ext_internal_profile = m_ext_profile->internal();
3052 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
3053
3054 {
3055 queue.submit([&](sycl::handler& cgh)
3056 {
3057 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3058 auto access_block_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
3059 auto access_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
3060 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
3061
3062 cgh.parallel_for<class set_matrix_values7>(sycl::range<1>{total_threads},
3063 [=] (sycl::item<1> item_id)
3064 {
3065 auto id = item_id.get_id(0);
3066
3067 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
3068 {
3069 auto block_id = i/ellpack_size ;
3070 auto local_id = i%ellpack_size ;
3071 auto begin = access_block_row_offset[block_id] ;
3072 auto end = access_block_row_offset[block_id+1] ;
3073
3074 for(int k=begin;k<end;++k)
3075 {
3076 access_block_values[k*ellpack_size+local_id] *= access_y[access_row_ids[i]] ;
3077 }
3078 }
3079 });
3080 });
3081 }
3082 }
3083 }
3084
3085 template <typename ValueT, int EllPackSize>
3086 void MatrixInternal<ValueT, EllPackSize>::scal(ValueBufferType& y)
3087 {
3088 this->scal(y, SYCLEnv::instance()->internal()->queue());
3089 }
3090
3091 template <typename ValueT, int EllPackSize>
3092 void MatrixInternal<ValueT, EllPackSize>::
3093 copyDevicePointers(int local_offset,
3094 std::size_t nrows,
3095 std::size_t nnz,
3096 int* rows,
3097 int* ncols,
3098 int* cols,
3099 ValueT* values) const
3100 {
3101#ifdef PRINT_DEBUG_INFO
3102 alien_info([&] {
3103 cout()<<"SYCLMATRIX COPY DEVICE POINTERS : "<<nrows<<" "<<nnz;
3104 });
3105#endif
3106
3107 auto env = SYCLEnv::instance() ;
3108 auto max_num_threads = env->maxNumThreads() ;
3109 auto& queue = env->internal()->queue() ;
3110 auto pack_size = ellpack_size;
3111
3112 auto internal_profile = m_profile->internal() ;
3113 auto& kcol = internal_profile->getKCol() ;
3114 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
3115 auto& block_cols = internal_profile->getBlockCols() ;
3116
3117 auto local_row_size = m_profile->localRowSize();
3118 /*
3119 {
3120 std::cout<<"CHECK KCOL : "<<std::endl ;
3121 sycl::host_accessor<int, 1, sycl::access::mode::read> h_kcol(kcol);
3122 sycl::host_accessor<int, 1, sycl::access::mode::read> h_bkcol(block_row_offset);
3123 sycl::host_accessor<int, 1, sycl::access::mode::read> h_bcols(block_cols);
3124 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_values(m_values);
3125 for(int irow=0;irow<nrows;++irow)
3126 {
3127 auto block_id = irow/pack_size ;
3128 auto local_id = irow%pack_size ;
3129 std::cout<<"KCOL["<<irow<<"] = "<<h_kcol[irow]<<" ROW SIZE = "<<h_kcol[irow+1]-h_kcol[irow]<<std::endl ;
3130
3131 int begin = h_bkcol[block_id] ;
3132 int end = h_bkcol[block_id+1] ;
3133 int row_size = h_kcol[irow+1] - h_kcol[irow];
3134
3135 std::cout<<"ELLPACK MAT["<<irow<<","<<block_id<<","<<local_id<<"]:";
3136 for(int k=begin;k<end;++k)
3137 {
3138 std::cout<<h_bcols[k*pack_size+local_id]<<" "<<h_values[k*pack_size+local_id]<<" ";
3139 }
3140 std::cout<<std::endl ;
3141 }
3142 }*/
3143 if(local_row_size == nullptr)
3144 {
3145
3146 queue.submit( [&](sycl::handler& cgh)
3147 {
3148 auto acc_val = m_values.template get_access<sycl::access::mode::read>(cgh);
3149 auto acc_bkcol = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3150 auto acc_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
3151 auto acc_kcol = kcol.template get_access<sycl::access::mode::read>(cgh);
3152 cgh.parallel_for<class copy_device_ptr>(
3153 sycl::range<1>{max_num_threads},
3154 [=] (sycl::item<1> itemId)
3155 {
3156 auto id = itemId.get_id(0);
3157 for (auto i = id; i < nrows; i += itemId.get_range()[0])
3158 {
3159 auto block_id = i/pack_size ;
3160 auto local_id = i%pack_size ;
3161
3162 int begin = acc_bkcol[block_id] ;
3163 int end = acc_bkcol[block_id+1] ;
3164 int row_size = acc_kcol[i+1] - acc_kcol[i];
3165
3166 rows[i] = local_offset + i ;
3167 ncols[i] = row_size ;
3168
3169 auto offset = acc_kcol[i];
3170
3171 for(int k=begin;k<end;++k)
3172 {
3173 auto j = k-begin ;
3174 if(j<row_size)
3175 {
3176 values[offset+j] = acc_val[k*pack_size+local_id] ;
3177 cols[offset+j] = acc_cols[k*pack_size+local_id] ;
3178 }
3179 }
3180 }
3181 });
3182 }).wait();
3183#ifdef PRINT_DEBUG_INFO
3184 {
3185 std::vector<ValueT> h_data(nnz) ;
3186 queue.memcpy(h_data.data(), values, nnz * sizeof(ValueT)).wait();
3187 std::vector<int> h_cols(nnz) ;
3188 queue.memcpy(h_cols.data(), cols, nnz * sizeof(int)).wait();
3189 sycl::host_accessor<int, 1, sycl::access::mode::read> h_kcol(kcol);
3190 alien_info([&]{
3191 cout()<<"SYCL DEVICE COPY : "<<nrows<<" "<<nnz;
3192 for(int irow=0;irow<nrows;++irow)
3193 {
3194 cout()<<"COPY MAT["<<irow<<"]:"<<h_kcol[irow+1]-h_kcol[irow];
3195 for(int k=h_kcol[irow];k<h_kcol[irow+1];++k)
3196 {
3197 std::cout<<h_cols[k]<<" "<<h_data[k]<<" ";
3198 }
3199 std::cout<<std::endl ;
3200 }
3201 });
3202 }
3203#endif
3204 }
3205 else
3206 {
3207
3208 IndexBufferType lrowsize(local_row_size, sycl::range<1>(nrows));
3209 queue.submit( [&](sycl::handler& cgh)
3210 {
3211 auto acc_val = m_values.template get_access<sycl::access::mode::read>(cgh);
3212 auto acc_bkcol = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3213 auto acc_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
3214 auto acc_kcol = kcol.template get_access<sycl::access::mode::read>(cgh);
3215 auto acc_lrowsize = lrowsize.template get_access<sycl::access::mode::read>(cgh);
3216 cgh.parallel_for<class copy_device_ptr>(
3217 sycl::range<1>{max_num_threads},
3218 [=] (sycl::item<1> itemId)
3219 {
3220 auto id = itemId.get_id(0);
3221 for (auto i = id; i < nrows; i += itemId.get_range()[0])
3222 {
3223 auto block_id = i/pack_size ;
3224 auto local_id = i%pack_size ;
3225
3226 int begin = acc_bkcol[block_id] ;
3227 int end = acc_bkcol[block_id+1] ;
3228 int row_size = acc_kcol[i+1] - acc_kcol[i];
3229 int lrow_size = acc_lrowsize[i] ;
3230 rows[i] = local_offset + i ;
3231 ncols[i] = row_size ;
3232
3233 auto offset = acc_kcol[i];
3234
3235 for(int k=begin;k<end;++k)
3236 {
3237 auto j = k-begin ;
3238 if(j<lrow_size)
3239 {
3240 values[offset+j] = acc_val[k*pack_size+local_id];
3241 cols[offset+j] = local_offset + acc_cols[k*pack_size+local_id];
3242 }
3243 }
3244 }
3245 });
3246 });
3247
3248 assert(m_ext_profile) ;
3249
3250 auto interface_nrows = m_ext_profile->getNRows();
3251 auto ext_nnz = m_ext_profile->getNnz();
3252 auto ext_block_nnz = m_ext_profile->getBlockNnz();
3253
3254 auto ext_internal_profile = m_ext_profile->internal();
3255 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
3256 auto& ext_block_cols = ext_internal_profile->getBlockCols() ;
3257
3258 queue.submit([&](sycl::handler& cgh)
3259 {
3260 auto acc_kcol = kcol.template get_access<sycl::access::mode::read>(cgh);
3261 auto acc_ext_bkcol = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3262 auto acc_ext_cols = ext_block_cols.template get_access<sycl::access::mode::read>(cgh);
3263 auto acc_ext_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
3264 auto acc_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
3265 auto acc_lrowsize = lrowsize.template get_access<sycl::access::mode::read>(cgh);
3266 auto acc_recv_uids = m_recv_uids->template get_access<sycl::access::mode::read>(cgh);
3267
3268 cgh.parallel_for<class ext_copy_device_ptr>(
3269 sycl::range<1>{max_num_threads},
3270 [=] (sycl::item<1> item_id)
3271 {
3272 auto id = item_id.get_id(0);
3273
3274 for (auto i = id; i < interface_nrows; i += item_id.get_range()[0])
3275 {
3276 auto block_id = i/pack_size ;
3277 auto local_id = i%pack_size ;
3278 auto begin = acc_ext_bkcol[block_id] ;
3279 auto end = acc_ext_bkcol[block_id+1] ;
3280
3281 auto row_id = acc_row_ids[i];
3282 int row_size = acc_kcol[row_id+1] - acc_kcol[row_id];
3283 int lrow_size = acc_lrowsize[row_id] ;
3284 auto offset = acc_kcol[row_id] + lrow_size;
3285
3286 for(int k=begin;k<end;++k)
3287 {
3288 auto j = k-begin ;
3289 if(j<row_size-lrow_size)
3290 {
3291 values[offset+j] = acc_ext_values[k*pack_size+local_id] ;
3292 cols[offset+j] = acc_recv_uids[acc_ext_cols[k*pack_size+local_id]] ;
3293 }
3294 }
3295 }
3296 });
3297 });
3298#ifdef PRINT_DEBUG_INFO
3299 {
3300 std::vector<ValueT> h_data(nnz) ;
3301 queue.memcpy(h_data.data(), values, nnz * sizeof(ValueT)).wait();
3302 std::vector<int> h_cols(nnz) ;
3303 queue.memcpy(h_cols.data(), cols, nnz * sizeof(int)).wait();
3304 sycl::host_accessor<int, 1, sycl::access::mode::read> h_kcol(kcol);
3305 alien_info([&] {
3306 cout()<<"SYCL DEVICE COPY : "<<nrows<<" "<<nnz<<" "<<ext_nnz<<" "<<interface_nrows;
3307 for(int irow=0;irow<nrows;++irow)
3308 {
3309 std::cout<<"COPY MAT["<<irow<<"]:"<<local_row_size[irow]<<" "<<h_kcol[irow+1]-h_kcol[irow];
3310 for(int k=h_kcol[irow];k<h_kcol[irow+1];++k)
3311 {
3312 std::cout<<h_cols[k]<<" "<<h_data[k]<<" ";
3313 }
3314 std::cout<<std::endl ;
3315 }
3316 });
3317 }
3318#endif
3319 }
3320 }
3321} // namespace SYCLInternal
3322
3323
3324
3325
3326template <typename ValueT>
3328: IMatrixImpl(nullptr, AlgebraTraits<BackEnd::tag::sycl>::name())
3329, m_send_policy(SimpleCSRInternal::CommProperty::ASynch)
3330, m_recv_policy(SimpleCSRInternal::CommProperty::ASynch)
3331{}
3332
3333template <typename ValueT>
3335: IMatrixImpl(multi_impl, AlgebraTraits<BackEnd::tag::sycl>::name())
3336, m_send_policy(SimpleCSRInternal::CommProperty::ASynch)
3337, m_recv_policy(SimpleCSRInternal::CommProperty::ASynch)
3338{}
3339
3340template <typename ValueT>
3342{
3343#ifdef ALIEN_USE_PERF_TIMER
3344 m_timer.printInfo("SYCLBELLPACK-MATRIX");
3345#endif
3346}
3347
3348template <typename ValueT>
3350initMatrix(Arccore::MessagePassing::IMessagePassingMng* parallel_mng,
3351 Integer local_offset,
3352 Integer global_size,
3353 std::size_t nrows,
3354 int const* kcol,
3355 int const* cols,
3356 SimpleCSRInternal::DistStructInfo const& matrix_dist_info,
3357 int block_size)
3358{
3359#ifdef PRINT_DEBUG_INFO
3360 alien_debug([&] {
3361 cout()<<"SYCLBellPackMatrix::initMatrix : "<<local_offset<<" "<<nrows<<" "<<global_size;
3362 cout()<<"ParalleMng : "<<parallel_mng;
3363 });
3364#endif
3365
3366 m_nproc = 1;
3367 m_myrank = 0;
3368 m_parallel_mng = parallel_mng;
3369 if (parallel_mng) {
3370 m_nproc = m_parallel_mng->commSize();
3371 m_myrank = m_parallel_mng->commRank();
3372 }
3373 m_is_parallel = (m_nproc > 1);
3374
3375
3376 m_matrix_dist_info.copy(matrix_dist_info);
3377
3378 m_ellpack_size = PKSIZE;
3379 if (m_nproc > 1)
3380 {
3381 m_local_size = nrows;
3382 m_local_offset = local_offset;
3383 m_global_size = global_size;
3384
3385 m_ghost_size = m_matrix_dist_info.m_ghost_nrow;
3386
3387 auto& local_row_size = m_matrix_dist_info.m_local_row_size;
3388 auto sorted_cols = m_matrix_dist_info.m_cols.data();
3389
3390 std::size_t block_nrows = ProfileInternalType::nbBlocks(nrows);
3391
3392 ProfileInternalType::computeBlockRowOffset(m_block_row_offset, nrows, kcol);
3393
3394 // clang-format off
3395 alien_debug([&] {
3396 cout() << "OFFSET = "<<m_local_offset ;
3397 cout() << "NROWS = "<<nrows;
3398 cout() << "NNZ = "<<kcol[nrows];
3399 cout() << "BNROWS = "<<block_nrows;
3400 cout() << "BNNZ = "<<m_block_row_offset[block_nrows];
3401 });
3402 // clang-format on
3403 //Universe().traceMng()->flush();
3404 m_profilePKSIZE.reset( new ProfileInternalType{ nrows,
3405 kcol,
3406 sorted_cols,
3407 m_block_row_offset.data(),
3408 local_row_size.data() });
3409
3410 m_matrixPKSIZE.reset(new MatrixInternalType{ m_profilePKSIZE.get(), block_size });
3411
3412 // EXTRACT EXTERNAL PROFILE
3413 std::size_t interface_nrows = m_matrix_dist_info.m_interface_nrow;
3414 std::vector<int> ext_kcol(interface_nrows + 1);
3415
3416 int ext_nnz = 0;
3417 for (std::size_t i = 0; i < interface_nrows; ++i) {
3418 auto id = m_matrix_dist_info.m_interface_rows[i];
3419 ext_kcol[i] = ext_nnz;
3420 ext_nnz += kcol[id + 1] - kcol[id] - local_row_size[id];
3421 }
3422 ext_kcol[interface_nrows] = ext_nnz;
3423
3424 std::vector<int> ext_cols(ext_nnz);
3425 {
3426 int jcol = 0;
3427 for (std::size_t i = 0; i < interface_nrows; ++i) {
3428 auto id = m_matrix_dist_info.m_interface_rows[i];
3429 for (int k = kcol[id] + local_row_size[id]; k < kcol[id + 1]; ++k) {
3430 ext_cols[jcol++] = (int)(sorted_cols[k] - nrows);
3431 }
3432 }
3433 }
3434
3435 std::size_t ext_block_nrows = ProfileInternalType::nbBlocks(interface_nrows);
3436 ProfileInternalType::computeBlockRowOffset(m_ext_block_row_offset, interface_nrows, ext_kcol.data());
3437 // clang-format off
3438 alien_debug([&] {
3439 cout() << "EXT NROWS = "<<interface_nrows;
3440 cout() << "EXT NNZ = "<<ext_kcol[interface_nrows];
3441 cout() << "EXT BNROWS = "<<ext_block_nrows;
3442 cout() << "EXT BNNZ = "<<m_block_row_offset[ext_block_nrows];
3443 });
3444 // clang-format on
3445
3446 m_ext_profilePKSIZE.reset( new ProfileInternalType{ interface_nrows,
3447 ext_kcol.data(),
3448 ext_cols.data(),
3449 m_ext_block_row_offset.data(),
3450 nullptr });
3451
3452 m_matrixPKSIZE->m_ext_profile = m_ext_profilePKSIZE.get();
3453 typedef typename MatrixInternalType::IndexBufferType IndexBufferType;
3454 m_matrixPKSIZE->m_h_interface_row_ids = m_matrix_dist_info.m_interface_rows.data();
3455 m_matrixPKSIZE->m_interface_row_ids.reset(new IndexBufferType{ m_matrix_dist_info.m_interface_rows.data(),
3456 m_matrix_dist_info.m_interface_rows.size() });
3457
3458 m_matrixPKSIZE->m_send_ids.reset(new IndexBufferType{ m_matrix_dist_info.m_send_info.m_ids.data(),
3459 m_matrix_dist_info.m_send_info.m_ids.size() });
3460 m_matrixPKSIZE->m_recv_ids.reset(new IndexBufferType{ m_matrix_dist_info.m_recv_info.m_ids.data(),
3461 m_matrix_dist_info.m_recv_info.m_ids.size() });
3462 m_matrixPKSIZE->m_recv_uids.reset(new IndexBufferType{ m_matrix_dist_info.m_recv_info.m_uids.data(),
3463 m_matrix_dist_info.m_recv_info.m_uids.size() });
3464 }
3465 else
3466 {
3467 // clang-format off
3468 m_local_offset = 0;
3469 m_local_size = nrows;
3470 m_global_size = m_local_size;
3471 m_ghost_size = 0;
3472 // clang-format on
3473
3474 std::size_t block_nrows = ProfileInternalType::nbBlocks(nrows);
3475
3476 ProfileInternalType::computeBlockRowOffset(m_block_row_offset, nrows, kcol);
3477
3478 // clang-format off
3479 alien_debug([&] {
3480 cout() << "NROWS = "<<nrows;
3481 cout() << "NNZ = "<<kcol[nrows];
3482 cout() << "BNROWS = "<<block_nrows;
3483 cout() << "BNNZ = "<<m_block_row_offset[block_nrows];
3484 });
3485 // clang-format on
3486
3487 m_profilePKSIZE.reset( new ProfileInternalType{ nrows,
3488 kcol,
3489 cols,
3490 m_block_row_offset.data(),
3491 nullptr });
3492
3493 m_matrixPKSIZE.reset( new MatrixInternalType{ m_profilePKSIZE.get(), block_size });
3494 }
3495
3496 return true;
3497}
3498
3499template <typename ValueT>
3501SYCLBEllPackMatrix<ValueT>::cloneTo(const MultiMatrixImpl* multi) const
3502{
3503 auto block_size = blockSize() ;
3505 matrix->setBlockSize(block_size);
3506 matrix->initMatrix(m_parallel_mng,
3507 m_local_offset,
3508 m_global_size,
3509 m_profilePKSIZE->getNRows(),
3510 m_profilePKSIZE->kcol(),
3511 m_profilePKSIZE->cols(),
3512 m_matrix_dist_info,
3513 block_size);
3514 matrix->setMatrixValues(getAddressData(), true);
3515 return matrix;
3516}
3517
3518template <typename ValueT>
3519bool SYCLBEllPackMatrix<ValueT>::setMatrixValues(Arccore::Real const* values, bool only_host)
3520{
3521 return m_matrixPKSIZE->setMatrixValues(values, only_host);
3522}
3523
3524template <typename ValueT>
3525void SYCLBEllPackMatrix<ValueT>::notifyChanges()
3526{
3527 if (m_matrixPKSIZE.get())
3528 m_matrixPKSIZE->notifyChanges();
3529}
3530
3531template <typename ValueT>
3532void SYCLBEllPackMatrix<ValueT>::endUpdate()
3533{
3534 if (m_matrixPKSIZE.get() && m_matrixPKSIZE->needUpdate()) {
3535 m_matrixPKSIZE->endUpdate();
3536 this->updateTimestamp();
3537 }
3538}
3539
3540template <typename ValueT>
3541ValueT const* SYCLBEllPackMatrix<ValueT>::getAddressData() const
3542{
3543 return m_matrixPKSIZE->getHCsrData();
3544}
3545
3546template <typename ValueT>
3547ValueT const* SYCLBEllPackMatrix<ValueT>::data() const
3548{
3549 return m_matrixPKSIZE->getHCsrData();
3550}
3551
3552template <typename ValueT>
3553ValueT* SYCLBEllPackMatrix<ValueT>::getAddressData()
3554{
3555 return m_matrixPKSIZE->getHCsrData();
3556}
3557
3558template <typename ValueT>
3559ValueT* SYCLBEllPackMatrix<ValueT>::data()
3560{
3561 return m_matrixPKSIZE->getHCsrData();
3562}
3563
3564template <typename ValueT>
3565void SYCLBEllPackMatrix<ValueT>::mult(SYCLVector<ValueT> const& x, SYCLVector<ValueT>& y) const
3566{
3567 return m_matrixPKSIZE->mult(x.internal()->values(), y.internal()->values());
3568}
3569
3570template <typename ValueT>
3571void SYCLBEllPackMatrix<ValueT>::endDistMult(SYCLVector<ValueT> const& x, SYCLVector<ValueT>& y) const
3572{
3573 m_matrixPKSIZE->addExtMult(x.internal()->ghostValues(getGhostSize()), y.internal()->values());
3574}
3575
3576template <typename ValueT>
3577void SYCLBEllPackMatrix<ValueT>::addLMult(ValueT alpha, SYCLVector<ValueT> const& x, SYCLVector<ValueT>& y) const
3578{
3579 m_profilePKSIZE->dcol();
3580 return m_matrixPKSIZE->addLMult(alpha, x.internal()->values(), y.internal()->values());
3581}
3582
3583template <typename ValueT>
3584void SYCLBEllPackMatrix<ValueT>::addUMult(ValueT alpha, SYCLVector<ValueT> const& x, SYCLVector<ValueT>& y) const
3585{
3586 m_profilePKSIZE->dcol();
3587 return m_matrixPKSIZE->addUMult(alpha, x.internal()->values(), y.internal()->values());
3588}
3589
3590template <typename ValueT>
3591void SYCLBEllPackMatrix<ValueT>::multDiag(SYCLVector<ValueType> const& x, SYCLVector<ValueType>& y) const
3592{
3593 return m_matrixPKSIZE->multDiag(x.internal()->values(), y.internal()->values());
3594}
3595
3596template <typename ValueT>
3597void SYCLBEllPackMatrix<ValueT>::multDiag(SYCLVector<ValueType>& y) const
3598{
3599 return m_matrixPKSIZE->multDiag(y.internal()->values());
3600}
3601
3602template <typename ValueT>
3603void SYCLBEllPackMatrix<ValueT>::computeDiag(SYCLVector<ValueType>& y) const
3604{
3605 auto block_size = blockSize() ;
3606 if((y.blockSize()==1)||(y.blockSize()==block_size))
3607 return m_matrixPKSIZE->computeDiag(y.internal()->values());
3608 else if(y.blockSize()==block_size*block_size)
3609 return m_matrixPKSIZE->computeBlockDiag(y.internal()->values());
3610 else
3611 throw Arccore::FatalErrorException(A_FUNCINFO, "Matrix and Vector Block Size are not compatibility");
3612}
3613
3614template <typename ValueT>
3615void SYCLBEllPackMatrix<ValueT>::multInvDiag(SYCLVector<ValueType>& y) const
3616{
3617 return m_matrixPKSIZE->multInvDiag(y.internal()->values());
3618}
3619
3620template <typename ValueT>
3621void SYCLBEllPackMatrix<ValueT>::computeInvDiag(SYCLVector<ValueType>& y) const
3622{
3623 auto block_size = blockSize() ;
3624 if((y.blockSize()==1)||(y.blockSize()==block_size))
3625 return m_matrixPKSIZE->computeInvDiag(y.internal()->values());
3626 else if(y.blockSize()==block_size*block_size)
3627 return m_matrixPKSIZE->computeInvBlockDiag(y.internal()->values());
3628 else
3629 throw Arccore::FatalErrorException(A_FUNCINFO, "Matrix and Vector Block Size are not compatibility");
3630}
3631
3632template <typename ValueT>
3633void SYCLBEllPackMatrix<ValueT>::scal(SYCLVector<ValueType> const& diag)
3634{
3635 return m_matrixPKSIZE->scal(diag.internal()->values());
3636}
3637
3638
3639template <typename ValueT>
3640void SYCLBEllPackMatrix<ValueT>::copy(SYCLBEllPackMatrix<ValueT> const& matrix)
3641{
3642 auto block_size = blockSize() ;
3643 initMatrix(matrix.m_parallel_mng,
3644 matrix.m_local_offset,
3645 matrix.m_global_size,
3646 matrix.m_profilePKSIZE->getNRows(),
3647 matrix.m_profilePKSIZE->kcol(),
3648 matrix.m_profilePKSIZE->cols(),
3649 matrix.m_matrix_dist_info,
3650 block_size);
3651 if(block_size==matrix.blockSize())
3652 {
3653 if(m_is_parallel)
3654 m_matrixPKSIZE->setMatrixValues(matrix.m_matrixPKSIZE->m_values,
3655 (*matrix.m_matrixPKSIZE->m_ext_values));
3656 else
3657 m_matrixPKSIZE->setMatrixValues(matrix.m_matrixPKSIZE->m_values);
3658 }
3659 else
3660 {
3661 assert(m_profilePKSIZE.get());
3662 assert(m_matrixPKSIZE.get());
3663 auto nb_blocks = m_profilePKSIZE->getBlockNnz();
3664 if(m_is_parallel)
3665 m_matrixPKSIZE->copy(nb_blocks,
3666 block_size,
3667 matrix.m_matrixPKSIZE->m_values,
3668 (*matrix.m_matrixPKSIZE->m_ext_values),
3669 matrix.blockSize()) ;
3670 else
3671 m_matrixPKSIZE->copy(nb_blocks,
3672 block_size,
3673 matrix.m_matrixPKSIZE->m_values,
3674 matrix.blockSize()) ;
3675 }
3676}
3677
3678template <typename ValueT>
3680allocateDevicePointers(std::size_t nrows,
3681 std::size_t nnz,
3682 int** rows,
3683 int** ncols,
3684 int** cols,
3685 ValueT** values) const
3686{
3687 auto env = SYCLEnv::instance() ;
3688 auto max_num_treads = env->maxNumThreads() ;
3689 auto& queue = env->internal()->queue() ;
3690 auto rows_ptr = malloc_device<IndexType>(nrows, queue);
3691 auto ncols_ptr = malloc_device<IndexType>(nrows, queue);
3692 auto cols_ptr = malloc_device<IndexType>(nnz, queue);
3693 auto values_ptr = malloc_device<ValueT>(nnz, queue);
3694
3695 *rows = rows_ptr ;
3696 *ncols = ncols_ptr ;
3697 *cols = cols_ptr ;
3698 *values = values_ptr ;
3699}
3700
3701template <typename ValueT>
3703freeDevicePointers(int* rows, int* ncols, int* cols, ValueT* values) const
3704{
3705 auto env = SYCLEnv::instance() ;
3706 auto& queue = env->internal()->queue() ;
3707 sycl::free(rows,queue) ;
3708 sycl::free(ncols,queue) ;
3709 sycl::free(cols,queue) ;
3710 sycl::free(values,queue) ;
3711}
3712
3713
3714template <typename ValueT>
3716copyDevicePointers(std::size_t nrows,
3717 std::size_t nnz,
3718 int* rows,
3719 int* ncols,
3720 int* cols,
3721 ValueT* values) const
3722{
3723 m_matrixPKSIZE->copyDevicePointers(m_local_offset,nrows,nnz,rows,ncols,cols,values) ;
3724}
3725
3726
3727template <typename ValueT>
3728SYCLBEllPackMatrix<ValueT>::HCSRView
3729SYCLBEllPackMatrix<ValueT>::hcsrView(BackEnd::Memory::eType memory, int nrows, int nnz) const
3730{
3731 return HCSRView(this,memory, nrows, nnz) ;
3732}
3733/*---------------------------------------------------------------------------*/
3734
3735template class ALIEN_EXPORT SYCLBEllPackMatrix<double>;
3736template class ALIEN_EXPORT BEllPackStructInfo<PKSIZE, Integer>;
3737template class ALIEN_EXPORT SYCLInternal::MatrixInternal<double,PKSIZE>;
3738
3739
3740//template bool Alien::SYCLInternal::MatrixInternal<double,1014>::setMatrixValues(Alien::SYCLInternal::MatrixInternal<double,1014>::ValueBufferType& buffer) ;
3741// to force instanciation may be there is a better way to do it
3742void force_int_instance()
3743{
3745 SYCLBEllPackMatrix<double>::MatrixInternalType::ValueBufferType buffer{sycl::range<1>(10)} ;
3746 matrix.internal()->setMatrixValues(buffer) ;
3747}
3748/*---------------------------------------------------------------------------*/
3749
3750} // namespace Alien
3751
3752/*---------------------------------------------------------------------------*/
IMatrixImpl(const MultiMatrixImpl *multi_impl, BackEndId backend="")
Constructor.
Multi matrices representation container.
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --
Definition BackEnd.h:17