27#include <alien/kernels/sycl/data/SYCLEnvInternal.h>
28#include <alien/kernels/sycl/data/SYCLEnv.h>
30#include <alien/kernels/sycl/data/SYCLVectorInternal.h>
31#include <alien/kernels/sycl/data/SYCLVector.h>
33#include <alien/kernels/sycl/data/SYCLBEllPackInternal.h>
34#include <alien/kernels/sycl/data/SYCLBEllPackMatrix.h>
43 SYCLInternal::EnvInternal::printPlatformInfo() ;
44 m_internal.reset(
new SYCLInternal::EnvInternal{});
47SYCLEnv::SYCLEnv(
int device_id)
49 SYCLInternal::EnvInternal::printPlatformInfo() ;
50 auto gpu_devices = sycl::device::get_devices(sycl::info::device_type::gpu);
51 if(device_id < gpu_devices.size())
53 sycl::device selected_device = gpu_devices[device_id];
54 m_internal.reset(
new SYCLInternal::EnvInternal{selected_device,device_id});
57 m_internal.reset(
new SYCLInternal::EnvInternal{});
65SYCLEnv* SYCLEnv::m_instance =
nullptr;
70 m_instance =
new SYCLEnv{};
74SYCLEnv* SYCLEnv::instance(
int device_id)
77 m_instance =
new SYCLEnv{device_id};
78 if (m_instance->deviceId()!= device_id)
81 m_instance =
new SYCLEnv{device_id};
86int SYCLEnv::deviceId()
88 return m_internal->deviceId() ;
91std::size_t SYCLEnv::maxNumGroups()
93 return m_internal->maxNumGroups();
96std::size_t SYCLEnv::maxWorkGroupSize()
98 return m_internal->maxWorkGroupSize();
101std::size_t SYCLEnv::maxNumThreads()
103 return m_internal->maxNumThreads();
106template <
int EllpackSize,
typename IndexT>
107void BEllPackStructInfo<EllpackSize, IndexT>::computeBlockRowOffset(std::vector<int>& block_row_offset,
111 std::size_t block_nrows = BEllPackStructInfo<EllpackSize, IndexT>::nbBlocks(nrows);
112 block_row_offset.resize(block_nrows + 1);
114 for (std::size_t ib = 0; ib < block_nrows; ++ib) {
115 block_row_offset[ib] = offset;
116 int max_row_size = 0;
117 for (
int i = 0; i < std::min(EllpackSize,
int(nrows - ib * EllpackSize)); ++i) {
118 auto irow = ib * EllpackSize + i;
119 int row_size = kcol[irow + 1] - kcol[irow];
120 max_row_size = std::max(max_row_size, row_size);
122 offset += max_row_size;
124 block_row_offset[block_nrows] = offset;
127template <
int EllpackSize,
typename IndexT>
131 std::size_t block_nrows,
132 std::size_t block_nnz,
135 int const* h_block_row_offset,
136 int const* h_local_row_size)
139, m_block_nrows(block_nrows)
140, m_block_nnz(block_nnz)
141, m_h_kcol(h_kcol, h_kcol + nrows + 1)
142, m_h_cols(h_cols, h_cols + nnz)
143, m_h_block_cols(block_nnz * ellpack_size)
144, m_block_row_offset(h_block_row_offset, sycl::range<1>(m_block_nrows + 1))
145, m_block_cols(m_h_block_cols.data(), sycl::range<1>(m_block_nnz * ellpack_size))
146, m_kcol(m_h_kcol.data(), sycl::range<1>(m_nrows + 1))
148 auto env = SYCLEnv::instance();
150 auto& queue = env->internal()->queue();
152 auto num_groups = env->internal()->maxNumGroups();
154 auto max_work_group_size = env->internal()->maxWorkGroupSize();
157 auto total_threads = num_groups * ellpack_size;
160 if(h_local_row_size==
nullptr)
162 IndexBufferType cols_buffer(m_h_cols.data(), sycl::range<1>(m_nnz));
164 queue.submit([&](sycl::handler& cgh)
166 auto access_kcol_buffer = m_kcol.template get_access<sycl::access::mode::read>(cgh);
167 auto access_block_row_offset = m_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
169 auto access_cols_buffer = cols_buffer.template get_access<sycl::access::mode::read>(cgh);
170 auto access_block_cols = m_block_cols.template get_access<sycl::access::mode::discard_write>(cgh);
172 cgh.parallel_for<
class struct_info_sycl>(sycl::range<1>{total_threads},
173 [=] (sycl::item<1> itemId)
175 auto id = itemId.get_id(0);
177 for (
auto i =
id; i < nrows; i += itemId.get_range()[0])
179 auto block_id = i/ellpack_size ;
180 auto local_id = i%ellpack_size ;
182 int begin = access_kcol_buffer[i] ;
183 int end = access_kcol_buffer[i+1] ;
184 int row_size = end - begin ;
186 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
187 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
196 for(
int k=begin;k<end;++k)
198 access_block_cols[block_row_offset+(k-begin)*ellpack_size+local_id] = access_cols_buffer[k] ;
200 for(
int k=row_size;k<block_row_size;++k)
202 access_block_cols[block_row_offset+k*ellpack_size+local_id] = -1 ;
211 IndexBufferType cols_buffer(m_h_cols.data(), sycl::range<1>(m_nnz));
212 IndexBufferType lrowsize_buffer(h_local_row_size, sycl::range<1>(nrows));
214 queue.submit([&](sycl::handler& cgh)
216 auto access_kcol_buffer = m_kcol.template get_access<sycl::access::mode::read>(cgh);
217 auto access_block_row_offset = m_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
218 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
220 auto access_cols_buffer = cols_buffer.template get_access<sycl::access::mode::read>(cgh);
221 auto access_block_cols = m_block_cols.template get_access<sycl::access::mode::discard_write>(cgh);
223 cgh.parallel_for<
class struct_info_sycl2>(sycl::range<1>{total_threads},
224 [=] (sycl::item<1> itemId)
226 auto id = itemId.get_id(0);
228 for (
auto i =
id; i < nrows; i += itemId.get_range()[0])
230 auto block_id = i/ellpack_size ;
231 auto local_id = i%ellpack_size ;
233 int begin = access_kcol_buffer[i] ;
234 int lrow_size = access_lrowsize_buffer[i] ;
236 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
237 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
239 for(
int k=begin;k<begin+lrow_size;++k)
241 access_block_cols[block_row_offset+(k-begin)*ellpack_size+local_id] = access_cols_buffer[k] ;
243 for(
int k=lrow_size;k<block_row_size;++k)
245 access_block_cols[block_row_offset+k*ellpack_size+local_id] = -1 ;
254template <
int EllpackSize,
typename IndexT>
255void SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::getUpperDiagOffset()
const
257 if (m_h_dcol.size() == 0) {
258 m_h_dcol.resize(m_nrows);
259 for (std::size_t irow = 0; irow < m_nrows; ++irow) {
260 int index = m_h_kcol[irow];
261 for (
int k = m_h_kcol[irow]; k < m_h_kcol[irow + 1]; ++k) {
262 if ((std::size_t)m_h_cols[k] < irow)
267 m_h_dcol[irow] = index;
272template <
int EllpackSize,
typename IndexT>
273void SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::computeLowerUpperMask()
const
276 if (not m_lower_upper_mask_ready) {
277 m_lower_mask.reset(
new MaskBufferType(sycl::range<1>(m_block_nnz * ellpack_size)));
278 m_upper_mask.reset(
new MaskBufferType(sycl::range<1>(m_block_nnz * ellpack_size)));
280 auto env = SYCLEnv::instance();
282 auto& queue = env->internal()->queue();
284 auto num_groups = env->internal()->maxNumGroups();
286 auto max_work_group_size = env->internal()->maxWorkGroupSize();
289 auto total_threads = num_groups * ellpack_size;
292 IndexBufferType dcol_buffer(m_h_dcol.data(), sycl::range<1>(m_nrows));
294 auto nrows = m_nrows;
297 queue.submit([&](sycl::handler& cgh)
299 auto access_dcol_buffer = dcol_buffer.template get_access<sycl::access::mode::read>(cgh);
301 auto access_kcol_buffer = m_kcol.template get_access<sycl::access::mode::read>(cgh);
302 auto access_block_row_offset = m_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
304 auto access_lower_mask = sycl::accessor { *m_lower_mask, cgh, sycl::write_only, sycl::property::no_init{}};
305 auto access_upper_mask = sycl::accessor { *m_upper_mask, cgh, sycl::write_only, sycl::property::no_init{}};
308 cgh.parallel_for<
class lower_upper_mask>(sycl::range<1>{total_threads},
309 [=] (sycl::item<1> itemId)
311 auto id = itemId.get_id(0);
313 for (
auto i =
id; i < nrows; i += itemId.get_range()[0])
315 auto block_id = i/ellpack_size ;
316 auto local_id = i%ellpack_size ;
318 int begin = access_kcol_buffer[i] ;
319 int end = access_kcol_buffer[i+1] ;
320 int diag = access_dcol_buffer[i] ;
321 int row_size = end - begin ;
323 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
324 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
326 access_lower_mask[block_row_offset+(diag-begin)*ellpack_size+local_id] = 0 ;
327 access_upper_mask[block_row_offset+(diag-begin)*ellpack_size+local_id] = 0 ;
329 for(
int k=begin;k<diag;++k)
331 access_lower_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 1 ;
332 access_upper_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 0 ;
334 for(
int k=diag+1;k<end;++k)
336 access_lower_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 0 ;
337 access_upper_mask[block_row_offset+(k-begin)*ellpack_size+local_id] = 1 ;
339 for(
int k=row_size;k<block_row_size;++k)
341 access_lower_mask[block_row_offset+k*ellpack_size+local_id] = 0 ;
342 access_upper_mask[block_row_offset+k*ellpack_size+local_id] = 0 ;
350 m_lower_upper_mask_ready =
true;
354template <
int EllpackSize,
typename IndexT>
355typename SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::MaskBufferType&
356SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::getLowerMask()
const
358 computeLowerUpperMask();
359 return *m_lower_mask;
362template <
int EllpackSize,
typename IndexT>
363typename SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::MaskBufferType&
364SYCLInternal::StructInfoInternal<EllpackSize, IndexT>::getUpperMask()
const
366 computeLowerUpperMask();
367 return *m_upper_mask;
370template <
int EllpackSize,
typename IndexT>
371BEllPackStructInfo<EllpackSize, IndexT>::BEllPackStructInfo(std::size_t nrows,
374 int const* h_block_row_offset,
375 int const* h_local_row_size)
377: BaseBEllPackStructInfo(nrows, kcol[nrows])
378, m_block_nrows(BEllPackStructInfo<EllpackSize, IndexT>::nbBlocks(nrows))
379, m_block_nnz(h_block_row_offset[m_block_nrows])
380, m_h_local_row_size(h_local_row_size)
384 cout() <<
"COMPUTE BELLPACK PROFILE";
385 cout() <<
" NROWS : "<<this->m_nrows;
386 cout() <<
" NNZ : "<<this->m_nnz;
387 cout() <<
" BLOCK NROWS : "<<m_block_nrows;
388 cout() <<
" BLOCK NNZ : "<<m_block_nnz;
393 m_internal =
new InternalType{ this->m_nrows,
403template <
int EllpackSize,
typename IndexT>
404BEllPackStructInfo<EllpackSize, IndexT>::~BEllPackStructInfo()
409template <
int EllpackSize,
typename IndexT>
410typename BEllPackStructInfo<EllpackSize, IndexT>::IndexType
const*
411BEllPackStructInfo<EllpackSize, IndexT>::kcol()
const
413 return m_internal->kcol();
416template <
int EllpackSize,
typename IndexT>
417typename BEllPackStructInfo<EllpackSize, IndexT>::IndexType
const*
418BEllPackStructInfo<EllpackSize, IndexT>::cols()
const
420 return m_internal->cols();
423template <
int EllpackSize,
typename IndexT>
424typename BEllPackStructInfo<EllpackSize, IndexT>::IndexType
const*
425BEllPackStructInfo<EllpackSize, IndexT>::dcol()
const
427 return m_internal->dcol();
430namespace SYCLInternal
432 template <
typename ValueT,
int EllpackSize>
433 MatrixInternal<ValueT, EllpackSize>::MatrixInternal(ProfileType
const* profile,
int N)
437 , m_h_values(profile->getBlockNnz() * ellpack_size * N*N)
438 , m_values(m_h_values.data(), sycl::range<1>(profile->getBlockNnz() * ellpack_size * N*N))
444 template <
typename ValueT,
int EllpackSize>
445 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValuesFromHost()
448 auto env = SYCLEnv::instance();
449 auto& queue = env->internal()->queue();
450 auto num_groups = env->internal()->maxNumGroups();
451 auto max_work_group_size = env->internal()->maxWorkGroupSize();
452 auto total_threads = num_groups * ellpack_size;
455 int NxN = this->m_NxN;
456 int pack_size = ellpack_size ;
457 auto nrows = m_profile->getNRows();
458 auto nnz = m_profile->getNnz();
459 auto block_nnz = m_profile->getBlockNnz();
461 auto internal_profile = m_profile->internal();
462 auto& kcol = internal_profile->getKCol();
463 auto& block_row_offset = internal_profile->getBlockRowOffset();
464 auto& block_cols = internal_profile->getBlockCols();
465 auto local_row_size = m_profile->localRowSize();
468 if (local_row_size ==
nullptr)
470 assert(m_h_csr_values.size()>=nnz) ;
471 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
474 queue.submit([&](sycl::handler& cgh)
476 auto access_kcol_buffer = kcol.template get_access<sycl::access::mode::read>(cgh);
477 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
478 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
479 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
482 cgh.parallel_for<
class set_matrix_values>(
483 sycl::range<1>{total_threads},
484 [=] (sycl::item<1> item_id)
486 auto id = item_id.get_id(0);
491 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
493 auto block_id = i/ellpack_size ;
494 auto local_id = i%ellpack_size ;
496 auto begin = access_kcol_buffer[i] ;
497 auto end = access_kcol_buffer[i+1] ;
498 auto row_size = end - begin ;
500 int block_row_offset = access_block_row_offset[block_id];
501 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
503 for(
int k=0;k<row_size;++k)
505 access_block_values[(block_row_offset+k)*ellpack_size+local_id] = access_values_buffer[begin+k] ;
507 for(
int k=row_size;k<block_row_size;++k)
509 access_block_values[(block_row_offset+k)*ellpack_size+local_id] = 0 ;
515 m_values_is_update =
true;
519 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz));
520 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
523 queue.submit([&](sycl::handler& cgh)
525 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
526 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
528 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
529 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
530 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
533 cgh.parallel_for<
class set_matrix_values2>(
534 sycl::range<1>{total_threads},
535 [=] (sycl::item<1> item_id)
537 auto id = item_id.get_id(0);
542 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
544 auto block_id = i/pack_size ;
545 auto local_id = i%pack_size ;
547 auto begin = access_kcol_buffer[i] ;
548 auto lrow_size = access_lrowsize_buffer[i] ;
549 auto end = begin + lrow_size ;
551 int block_row_offset = access_block_row_offset[block_id];
552 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
554 for(
int k=0;k<lrow_size;++k)
556 access_block_values[(block_row_offset+k)*pack_size+local_id] = access_values_buffer[begin+k] ;
558 for(
int k=lrow_size;k<block_row_size;++k)
560 access_block_values[(block_row_offset+k)*pack_size+local_id] = 0. ;
566 auto interface_nrows = m_ext_profile->getNRows();
567 auto ext_nnz = m_ext_profile->getNnz();
568 auto ext_block_nnz = m_ext_profile->getBlockNnz();
570 auto ext_internal_profile = m_ext_profile->internal();
571 auto& ext_kcol = ext_internal_profile->getKCol();
572 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
574 m_ext_values.reset(
new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
576 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz));
577 queue.submit([&](sycl::handler& cgh)
579 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
580 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
581 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
583 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
585 cgh.parallel_for<
class set_matrix_values3>(
586 sycl::range<1>{total_threads},
587 [=] (sycl::item<1> item_id)
589 auto id = item_id.get_id(0);
591 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
593 auto block_id = i/ellpack_size ;
594 auto local_id = i%ellpack_size ;
595 auto begin = access_kcol_buffer[i] ;
596 auto end = access_kcol_buffer[i+1] ;
597 auto row_size = end - begin ;
599 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
600 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
602 for(
int k=begin*NxN;k<end*NxN;++k)
604 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
606 for(
int k=row_size*NxN;k<block_row_size*NxN;++k)
608 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
615 m_values_is_update =
true;
620 if (local_row_size ==
nullptr)
622 assert(m_h_csr_values.size()>=nnz*NxN) ;
623 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz*NxN));
626 queue.submit([&](sycl::handler& cgh)
628 auto access_kcol_buffer = kcol.template get_access<sycl::access::mode::read>(cgh);
629 auto access_cols_buffer = block_cols.template get_access<sycl::access::mode::read>(cgh);
630 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
631 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
632 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
635 cgh.parallel_for<
class set_block_matrix_values>(
636 sycl::range<1>{total_threads},
637 [=] (sycl::item<1> item_id)
639 auto id = item_id.get_id(0);
644 for (std::size_t i =
id; i < nrows; i += item_id.get_range()[0])
646 auto block_id = i/pack_size ;
647 auto local_id = i%pack_size ;
649 int begin = access_kcol_buffer[i] ;
650 int end = access_kcol_buffer[i+1] ;
651 int row_size = end - begin ;
653 int block_row_offset = access_block_row_offset[block_id] ;
654 int block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
656 for(
int k=0;k<row_size;++k)
661 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = access_values_buffer[(begin+k)*NxN + i*N+j] ;
667 for(
int k=row_size;k<block_row_size;++k)
672 access_block_values[((block_row_offset+k)*NxN + i*N+j)*pack_size+local_id] = 0. ;
679 m_values_is_update =
true;
717 ValueBufferType values_buffer(m_h_csr_values.data(), sycl::range<1>(nnz*NxN));
718 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
721 queue.submit([&](sycl::handler& cgh)
723 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
724 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
726 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
727 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
728 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
731 cgh.parallel_for<
class set_block_matrix_values2>(
732 sycl::range<1>{total_threads},
733 [=] (sycl::item<1> item_id)
735 auto id = item_id.get_id(0);
740 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
742 auto block_id = i/pack_size ;
743 auto local_id = i%pack_size ;
745 auto begin = access_kcol_buffer[i] ;
746 auto lrow_size = access_lrowsize_buffer[i] ;
747 auto end = begin + lrow_size ;
749 auto block_row_offset = access_block_row_offset[block_id];
750 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
752 for(
int k=0;k<lrow_size;++k)
757 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = access_values_buffer[(begin+k)*NxN+i*N+j] ;
760 for(
int k=lrow_size;k<block_row_size;++k)
765 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = 0. ;
773 auto interface_nrows = m_ext_profile->getNRows();
774 auto ext_nnz = m_ext_profile->getNnz();
775 auto ext_block_nnz = m_ext_profile->getBlockNnz();
777 auto ext_internal_profile = m_ext_profile->internal();
778 auto& ext_kcol = ext_internal_profile->getKCol();
779 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
781 m_ext_values.reset(
new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
783 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz));
784 queue.submit([&](sycl::handler& cgh)
786 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
787 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
788 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
790 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
792 cgh.parallel_for<
class set_matrix_values3>(
793 sycl::range<1>{total_threads},
794 [=] (sycl::item<1> item_id)
796 auto id = item_id.get_id(0);
798 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
800 auto block_id = i/pack_size ;
801 auto local_id = i%pack_size ;
802 auto begin = access_kcol_buffer[i] ;
803 auto end = access_kcol_buffer[i+1] ;
804 auto row_size = end - begin ;
806 int block_row_offset = access_block_row_offset[block_id] ;
807 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
809 for(
int k=0;k<row_size;++k)
814 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = access_values_buffer[(begin+k)*NxN+i*N+j] ;
817 for(
int k=row_size;k<block_row_size;++k)
822 access_block_values[((block_row_offset+k)*NxN+i*N+j)*pack_size+local_id] = 0 ;
852 m_values_is_update =
true;
858 template <
typename ValueT,
int EllpackSize>
859 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValues(ValueBufferType& values_buffer)
862 auto env = SYCLEnv::instance();
863 auto& queue = env->internal()->queue();
864 auto num_groups = env->internal()->maxNumGroups();
865 auto max_work_group_size = env->internal()->maxWorkGroupSize();
866 auto total_threads = num_groups * ellpack_size;
868 int NxN = this->m_NxN;
870 auto nrows = m_profile->getNRows();
871 auto nnz = m_profile->getNnz();
872 auto block_nnz = m_profile->getBlockNnz();
874 auto internal_profile = m_profile->internal();
875 auto& kcol = internal_profile->getKCol();
876 auto& block_row_offset = internal_profile->getBlockRowOffset();
878 auto local_row_size = m_profile->localRowSize();
879 if (local_row_size ==
nullptr)
884 queue.submit([&](sycl::handler& cgh)
886 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
887 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
888 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
889 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
892 cgh.parallel_for<
class set_matrix_values4>(sycl::range<1>{total_threads},
893 [=] (sycl::item<1> item_id)
895 auto id = item_id.get_id(0);
900 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
902 auto block_id = i/ellpack_size ;
903 auto local_id = i%ellpack_size ;
905 auto begin = access_kcol_buffer[i] ;
906 auto end = access_kcol_buffer[i+1] ;
907 auto row_size = end - begin ;
909 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
910 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
912 for(
int k=begin*NxN;k<end*NxN;++k)
914 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
916 for(
int k=row_size*NxN;k<block_row_size*NxN;++k)
918 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
924 m_values_is_update =
true;
929 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
933 queue.submit([&](sycl::handler& cgh)
935 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
936 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
938 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
939 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
940 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
943 cgh.parallel_for<
class set_matrix_values5>(sycl::range<1>{total_threads},
944 [=] (sycl::item<1> item_id)
946 auto id = item_id.get_id(0);
951 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
953 auto block_id = i/ellpack_size ;
954 auto local_id = i%ellpack_size ;
956 auto begin = access_kcol_buffer[i] ;
957 auto lrow_size = access_lrowsize_buffer[i] ;
958 auto end = begin + lrow_size ;
960 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
961 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
963 for(
int k=begin*NxN;k<end*NxN;++k)
965 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
967 for(
int k=lrow_size*NxN;k<block_row_size*NxN;++k)
969 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
974 auto kcol = m_profile->kcol() ;
975 auto local_row_size = m_profile->localRowSize() ;
977 auto interface_nrows = m_ext_profile->getNRows();
978 auto ext_nnz = m_ext_profile->getNnz();
979 auto ext_block_nnz = m_ext_profile->getBlockNnz();
981 auto ext_internal_profile = m_ext_profile->internal();
982 auto& ext_kcol = ext_internal_profile->getKCol();
983 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
985 m_ext_values.reset(
new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
989 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz * NxN));
991 std::vector<Integer> h_local_row_offset(interface_nrows+1) ;
994 for(std::size_t i=0;i<interface_nrows;++i)
996 h_local_row_offset[i] = offset ;
997 offset += local_row_size[i] ;
999 h_local_row_offset[interface_nrows] = offset ;
1001 IndexBufferType local_row_offset(h_local_row_offset.data(),sycl::range<1>(interface_nrows+1)) ;
1003 queue.submit([&](sycl::handler& cgh)
1005 auto access_kcol = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1006 auto access_interface_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
1007 auto access_local_row_size = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1008 auto access_local_row_offset = local_row_offset.template get_access<sycl::access::mode::read>(cgh);
1009 auto access_csr_values = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1010 auto access_ext_csr_values = ext_csr_values_buffer.template get_access<sycl::access::mode::read_write>(cgh);
1011 cgh.parallel_for<
class set_matrix_values6>(sycl::range<1>{total_threads},
1012 [=] (sycl::item<1> item_id)
1014 auto id = item_id.get_id(0);
1015 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
1017 Integer jcol = access_local_row_offset[i] ;
1018 for (
int k = (access_kcol[i] + access_local_row_size[i])*NxN; k < access_kcol[i + 1] * NxN; ++k)
1019 access_ext_csr_values[jcol++] = access_csr_values[k];
1024 queue.submit([&](sycl::handler& cgh)
1026 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
1027 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1028 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
1030 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
1032 cgh.parallel_for<
class set_matrix_values7>(sycl::range<1>{total_threads},
1033 [=] (sycl::item<1> item_id)
1035 auto id = item_id.get_id(0);
1037 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
1039 auto block_id = i/ellpack_size ;
1040 auto local_id = i%ellpack_size ;
1041 auto begin = access_kcol_buffer[i] ;
1042 auto end = access_kcol_buffer[i+1] ;
1043 auto row_size = end - begin ;
1045 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1046 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1048 for(
int k=begin*NxN;k<end*NxN;++k)
1050 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1052 for(
int k=row_size*NxN;k<block_row_size*NxN;++k)
1054 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1061 m_values_is_update =
true;
1066 template <
typename ValueT,
int EllpackSize>
1067 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValues(ValueBufferType& values_buffer,
1068 ValueBufferType& ext_values_buffer)
1071 auto env = SYCLEnv::instance();
1072 auto& queue = env->internal()->queue();
1073 auto num_groups = env->internal()->maxNumGroups();
1074 auto max_work_group_size = env->internal()->maxWorkGroupSize();
1075 auto total_threads = num_groups * ellpack_size;
1077 int NxN = this->m_NxN;
1079 auto nrows = m_profile->getNRows();
1080 auto nnz = m_profile->getNnz();
1081 auto block_nnz = m_profile->getBlockNnz();
1083 auto internal_profile = m_profile->internal();
1084 auto& kcol = internal_profile->getKCol();
1085 auto& block_row_offset = internal_profile->getBlockRowOffset();
1087 auto local_row_size = m_profile->localRowSize();
1088 if (local_row_size ==
nullptr)
1093 queue.submit([&](sycl::handler& cgh)
1095 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1096 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1097 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1098 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1101 cgh.parallel_for<
class set_matrix_values4>(sycl::range<1>{total_threads},
1102 [=] (sycl::item<1> item_id)
1104 auto id = item_id.get_id(0);
1109 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
1111 auto block_id = i/ellpack_size ;
1112 auto local_id = i%ellpack_size ;
1114 auto begin = access_kcol_buffer[i] ;
1115 auto end = access_kcol_buffer[i+1] ;
1116 auto row_size = end - begin ;
1118 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1119 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1121 for(
int k=begin*NxN;k<end*NxN;++k)
1123 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1125 for(
int k=row_size*NxN;k<block_row_size*NxN;++k)
1127 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1133 m_values_is_update =
true;
1138 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
1142 queue.submit([&](sycl::handler& cgh)
1144 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1145 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1147 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1148 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1149 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1152 cgh.parallel_for<
class set_matrix_values5>(sycl::range<1>{total_threads},
1153 [=] (sycl::item<1> item_id)
1155 auto id = item_id.get_id(0);
1160 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
1162 auto block_id = i/ellpack_size ;
1163 auto local_id = i%ellpack_size ;
1165 auto begin = access_kcol_buffer[i] ;
1166 auto lrow_size = access_lrowsize_buffer[i] ;
1167 auto end = begin + lrow_size ;
1169 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1170 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1172 for(
int k=begin*NxN;k<end*NxN;++k)
1174 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1176 for(
int k=lrow_size*NxN;k<block_row_size*NxN;++k)
1178 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1183 auto kcol = m_profile->kcol() ;
1184 auto local_row_size = m_profile->localRowSize() ;
1186 auto interface_nrows = m_ext_profile->getNRows();
1187 auto ext_nnz = m_ext_profile->getNnz();
1188 auto ext_block_nnz = m_ext_profile->getBlockNnz();
1190 auto ext_internal_profile = m_ext_profile->internal();
1191 auto& ext_kcol = ext_internal_profile->getKCol();
1192 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
1194 m_ext_values.reset(
new ValueBufferType(ext_block_nnz * ellpack_size * NxN)) ;
1198 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz * NxN));
1200 std::vector<Integer> h_local_row_offset(interface_nrows+1) ;
1202 Integer offset = 0 ;
1203 for(std::size_t i=0;i<interface_nrows;++i)
1205 h_local_row_offset[i] = offset ;
1206 offset += local_row_size[i] ;
1208 h_local_row_offset[interface_nrows] = offset ;
1210 IndexBufferType local_row_offset(h_local_row_offset.data(),sycl::range<1>(interface_nrows+1)) ;
1212 queue.submit([&](sycl::handler& cgh)
1214 auto access_kcol = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1215 auto access_interface_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
1216 auto access_local_row_size = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1217 auto access_local_row_offset = local_row_offset.template get_access<sycl::access::mode::read>(cgh);
1218 auto access_csr_values = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1219 auto access_ext_csr_values = ext_csr_values_buffer.template get_access<sycl::access::mode::read_write>(cgh);
1220 cgh.parallel_for<
class set_matrix_values6>(sycl::range<1>{total_threads},
1221 [=] (sycl::item<1> item_id)
1223 auto id = item_id.get_id(0);
1224 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
1226 Integer jcol = access_local_row_offset[i] ;
1227 for (
int k = (access_kcol[i] + access_local_row_size[i])*NxN; k < access_kcol[i + 1] * NxN; ++k)
1228 access_ext_csr_values[jcol++] = access_csr_values[k];
1233 queue.submit([&](sycl::handler& cgh)
1235 auto access_kcol_buffer = ext_kcol.template get_access<sycl::access::mode::read>(cgh);
1236 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1237 auto access_values_buffer = ext_csr_values_buffer.template get_access<sycl::access::mode::read>(cgh);
1239 auto access_block_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
1241 cgh.parallel_for<
class set_matrix_values7>(sycl::range<1>{total_threads},
1242 [=] (sycl::item<1> item_id)
1244 auto id = item_id.get_id(0);
1246 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
1248 auto block_id = i/ellpack_size ;
1249 auto local_id = i%ellpack_size ;
1250 auto begin = access_kcol_buffer[i] ;
1251 auto end = access_kcol_buffer[i+1] ;
1252 auto row_size = end - begin ;
1254 int block_row_offset = access_block_row_offset[block_id]*ellpack_size*NxN ;
1255 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1257 for(
int k=begin*NxN;k<end*NxN;++k)
1259 access_block_values[block_row_offset+(k-begin*NxN)*ellpack_size+local_id] = access_values_buffer[k] ;
1261 for(
int k=row_size*NxN;k<block_row_size*NxN;++k)
1263 access_block_values[block_row_offset+k*ellpack_size+local_id] = 0 ;
1270 m_values_is_update =
true;
1275 template <
typename ValueT,
int EllpackSize>
1276 bool MatrixInternal<ValueT, EllpackSize>::setMatrixValues(ValueT
const* values,
bool only_host)
1280 int NxN = this->m_NxN;
1282 auto nnz = m_profile->getNnz();
1283 m_h_csr_values.resize(nnz*NxN);
1284 std::copy(values, values + nnz*NxN, m_h_csr_values.begin());
1285 if (m_ext_profile) {
1287 auto kcol = m_profile->kcol() ;
1288 auto local_row_size = m_profile->localRowSize() ;
1290 auto interface_nrows = m_ext_profile->getNRows() ;
1291 auto ext_nnz = m_ext_profile->getNnz();
1294 m_h_csr_ext_values.resize(ext_nnz*NxN);
1299 for (std::size_t i = 0; i < interface_nrows; ++i) {
1300 auto id = m_h_interface_row_ids[i];
1301 for (
int k = (kcol[
id] + local_row_size[
id])*NxN; k < kcol[
id + 1]*NxN; ++k) {
1302 m_h_csr_ext_values[jcol++] = values[k];
1308 m_values_is_update =
false;
1311 setMatrixValuesFromHost();
1312 m_values_is_update =
true;
1317 template <
typename ValueT,
int EllpackSize>
1318 bool MatrixInternal<ValueT, EllpackSize>::copy(std::size_t nb_blocks,
1320 ValueBufferType& values_buffer,
1323 auto env = SYCLEnv::instance();
1324 auto& queue = env->internal()->queue();
1325 auto num_groups = env->internal()->maxNumGroups();
1326 auto max_work_group_size = env->internal()->maxWorkGroupSize();
1327 auto total_threads = num_groups * ellpack_size;
1329 int pack_size = ellpack_size;
1330 int N1 = block_size ;
1332 int N2 = rhs_block_size ;
1334 auto nrows = m_profile->getNRows();
1335 auto nnz = m_profile->getNnz();
1336 auto block_nnz = m_profile->getBlockNnz();
1338 auto internal_profile = m_profile->internal();
1339 auto& kcol = internal_profile->getKCol();
1340 auto& block_row_offset = internal_profile->getBlockRowOffset();
1342 auto local_row_size = m_profile->localRowSize();
1343 if (local_row_size ==
nullptr)
1348 queue.submit([&](sycl::handler& cgh)
1350 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1351 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1352 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1353 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1356 cgh.parallel_for<
class set_matrix_values4>(
1357 sycl::range<1>{total_threads},
1358 [=] (sycl::item<1> item_id)
1360 auto id = item_id.get_id(0);
1365 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
1367 auto block_id = i/pack_size ;
1368 auto local_id = i%pack_size ;
1370 auto begin = access_block_row_offset[block_id] ;
1371 auto end = access_block_row_offset[block_id+1] ;
1373 for(
int k=begin;k<end;++k)
1375 for(
int i=0;i<N1;++i)
1376 for(
int j=0;j<N1;++j)
1377 access_block_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = access_values_buffer[(k*N2xN2+i*N2+j)*pack_size+local_id] ;
1387 IndexBufferType lrowsize_buffer(local_row_size, sycl::range<1>(nrows));
1391 queue.submit([&](sycl::handler& cgh)
1393 auto access_kcol_buffer = internal_profile->getKCol().template get_access<sycl::access::mode::read>(cgh);
1394 auto access_lrowsize_buffer = lrowsize_buffer.template get_access<sycl::access::mode::read>(cgh);
1396 auto access_block_row_offset = internal_profile->getBlockRowOffset().template get_access<sycl::access::mode::read>(cgh);
1397 auto access_values_buffer = values_buffer.template get_access<sycl::access::mode::read>(cgh);
1398 auto access_block_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
1401 cgh.parallel_for<
class set_matrix_values5>(
1402 sycl::range<1>{total_threads},
1403 [=] (sycl::item<1> item_id)
1405 auto id = item_id.get_id(0);
1410 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
1412 auto block_id = i/pack_size ;
1413 auto local_id = i%pack_size ;
1416 int block_row_offset = access_block_row_offset[block_id] ;
1418 auto begin = access_block_row_offset[block_id] ;
1419 auto end = access_block_row_offset[block_id+1] ;
1420 auto lrow_size = access_lrowsize_buffer[i] ;
1422 for(
int k=begin;k<begin+lrow_size;++k)
1424 for(
int i=0;i<N1;++i)
1425 for(
int j=0;j<N1;++j)
1426 access_block_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = access_values_buffer[(k*N2xN2+i*N2+j)*pack_size+local_id] ;
1428 for(
int k=begin+lrow_size;k<end;++k)
1430 for(
int i=0;i<N1;++i)
1431 for(
int j=0;j<N1;++j)
1432 access_block_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = 0 ;
1439 m_values_is_update =
true;
1443 template <
typename ValueT,
int EllpackSize>
1444 bool MatrixInternal<ValueT, EllpackSize>::copy(std::size_t nb_blocks,
1446 ValueBufferType& values_buffer,
1447 ValueBufferType& ext_values_buffer,
1450 copy(nb_blocks,block_size,values_buffer,rhs_block_size) ;
1452 auto env = SYCLEnv::instance();
1453 auto& queue = env->internal()->queue();
1454 auto num_groups = env->internal()->maxNumGroups();
1455 auto max_work_group_size = env->internal()->maxWorkGroupSize();
1456 auto total_threads = num_groups * ellpack_size;
1458 int pack_size = ellpack_size;
1459 int N1 = block_size ;
1461 int N2 = rhs_block_size ;
1463 auto nrows = m_profile->getNRows();
1464 auto nnz = m_profile->getNnz();
1465 auto block_nnz = m_profile->getBlockNnz();
1467 auto internal_profile = m_profile->internal();
1468 auto& kcol = internal_profile->getKCol();
1469 auto& block_row_offset = internal_profile->getBlockRowOffset();
1471 assert(m_ext_profile) ;
1472 auto interface_nrows = m_ext_profile->getNRows();
1473 auto ext_nnz = m_ext_profile->getNnz();
1474 auto ext_block_nnz = m_ext_profile->getBlockNnz();
1475 auto ext_internal_profile = m_ext_profile->internal();
1476 auto& ext_kcol = ext_internal_profile->getKCol();
1477 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
1479 m_h_ext_values.resize(ext_block_nnz*N1xN1) ;
1480 m_ext_values.reset(
new ValueBufferType(ext_block_nnz * N1xN1* ellpack_size)) ;
1482 ValueBufferType ext_csr_values_buffer(m_h_csr_ext_values.data(), sycl::range<1>(ext_nnz*N1xN1));
1483 queue.submit([&](sycl::handler& cgh)
1485 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1486 auto access_ext_values_buffer = ext_values_buffer.template get_access<sycl::access::mode::read>(cgh);
1487 auto access_ext_values = sycl::accessor { *m_ext_values, cgh, sycl::write_only, sycl::property::no_init{}};
1489 cgh.parallel_for<
class set_matrix_ext_values3>(
1490 sycl::range<1>{total_threads},
1491 [=] (sycl::item<1> item_id)
1493 auto id = item_id.get_id(0);
1494 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
1496 auto block_id = i/pack_size ;
1497 auto local_id = i%pack_size ;
1499 auto begin = access_block_row_offset[block_id] ;
1500 auto end = access_block_row_offset[block_id+1] ;
1502 for(
int k=begin;k<begin;++k)
1504 for(
int i=0;i<N1;++i)
1505 for(
int j=0;j<N1;++j)
1506 access_ext_values[(k*N1xN1 +i*N1+j)*pack_size+local_id] = access_ext_values_buffer[(k*N2xN2+i*N2+j)*pack_size+local_id] ;
1513 m_values_is_update =
true;
1518 template <
typename ValueT,
int EllpackSize>
1519 bool MatrixInternal<ValueT, EllpackSize>::needUpdate()
1521 return m_values_is_update !=
true;
1524 template <
typename ValueT,
int EllpackSize>
1525 void MatrixInternal<ValueT, EllpackSize>::notifyChanges()
1527 m_values_is_update =
false;
1530 template <
typename ValueT,
int EllpackSize>
1531 void MatrixInternal<ValueT, EllpackSize>::endUpdate()
1533 if (not m_values_is_update) {
1534 setMatrixValuesFromHost();
1615 template <
typename ValueT,
int EllPackSize>
1616 void MatrixInternal<ValueT, EllPackSize>::mult(ValueBufferType& x, ValueBufferType& y, sycl::queue& queue)
const
1621 auto device = queue.get_device();
1623 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1625 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1631 std::size_t pack_size = ellpack_size;
1632 auto nrows = m_profile->getNRows();
1633 auto nnz = m_profile->getNnz();
1635 auto internal_profile = m_profile->internal();
1636 auto& kcol = internal_profile->getKCol();
1637 auto& block_row_offset = internal_profile->getBlockRowOffset();
1638 auto& block_cols = internal_profile->getBlockCols();
1640 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
1641 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
1642 auto total_threads = blocks_target * ellpack_size;
1646 queue.submit([&](sycl::handler& cgh)
1648 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1649 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1650 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1653 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1654 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
1658 cgh.parallel_for<
class compute_mult>(
1659 sycl::range<1>{total_threads},
1660 [=] (sycl::item<1> item_id)
1662 auto id = item_id.get_id(0);
1667 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
1669 auto block_id = i/ellpack_size ;
1670 auto local_id = i%ellpack_size ;
1671 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
1672 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1674 ValueType value = 0. ;
1675 for(
int j=0;j<block_row_size;++j)
1677 auto k = block_row_offset+j*ellpack_size+local_id ;
1678 value += access_values[k]* access_x[access_cols[k]] ;
1680 access_y[i] = value ;
1685 sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
1686 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
1687 cgh.parallel_for<
class compute_mult>(r,
1688 [=](sycl::nd_item<1> item_id)
1691 auto local_id = item_id.get_local_id(0);
1693 auto global_id = item_id.get_global_id(0);
1696 for (
auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
1698 auto block_id = i/pack_size ;
1701 int begin = access_block_row_offset[block_id] ;
1702 int end = access_block_row_offset[block_id+1] ;
1703 ValueType value = 0. ;
1704 for(
int k=begin;k<end;++k)
1707 const int col = access_cols[k * pack_size + local_id];
1709 lds_x[local_id] = access_x[col];
1710 item_id.barrier(sycl::access::fence_space::local_space);
1712 value += access_values[k * pack_size + local_id] * lds_x[local_id] ;
1713 item_id.barrier(sycl::access::fence_space::local_space);
1715 access_y[i] = value ;
1726 multN<2>(x,y,queue) ;
1729 multN<3>(x,y,queue) ;
1732 multN<4>(x,y,queue) ;
1735 auto device = queue.get_device();
1737 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1739 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1741 auto total_threads = num_groups * ellpack_size;
1746 std::size_t pack_size = ellpack_size;
1747 auto nrows = m_profile->getNRows();
1748 auto nnz = m_profile->getNnz();
1750 auto internal_profile = m_profile->internal();
1751 auto& kcol = internal_profile->getKCol();
1752 auto& block_row_offset = internal_profile->getBlockRowOffset();
1753 auto& block_cols = internal_profile->getBlockCols();
1756 [&](sycl::handler& cgh)
1758 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1759 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1760 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1763 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1764 auto access_y = y.template get_access<sycl::access::mode::discard_write>(cgh);
1768 sycl::range<1> range{total_threads} ;
1769 auto tile = Tile(N) ;
1771 cgh.parallel_for<
class compute_block_mult>(range,
1772 [=] (sycl::item<1> item_id )
1774 auto id = item_id.get_id(0);
1781 for (
auto i =
id; i < nrows; i += item_id.get_range()[0] )
1783 auto block_id = i/pack_size ;
1784 auto local_id = i%pack_size ;
1786 std::size_t block_row_offset = access_block_row_offset[block_id] ;
1787 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1789 for(
int ieq=0;ieq<N;++ieq)
1791 ValueType value = 0. ;
1792 for(std::size_t k=block_row_offset;k<block_row_offset+block_row_size;++k)
1795 value += tile.mult(ieq,
1802 access_y[i*N+ieq] = value;
1857 template <
typename ValueT,
int EllPackSize>
1858 void MatrixInternal<ValueT, EllPackSize>::mult(ValueBufferType& x, ValueBufferType& y)
const
1860 this->mult(x, y, SYCLEnv::instance()->internal()->queue());
1863 template <
typename ValueT,
int EllPackSize>
1864 void MatrixInternal<ValueT, EllPackSize>::addExtMult(ValueBufferType& x,
1866 sycl::queue& queue)
const
1868 auto device = queue.get_device();
1870 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1872 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1874 auto total_threads = num_groups * ellpack_size;
1877 auto nrows = m_profile->getNRows();
1878 auto nnz = m_profile->getNnz();
1880 auto interface_nrow = m_ext_profile->getNRows();
1881 auto ext_profile = m_ext_profile->internal();
1882 auto& kcol = ext_profile->getKCol();
1883 auto& block_row_offset = ext_profile->getBlockRowOffset();
1884 auto& block_cols = ext_profile->getBlockCols();
1889 queue.submit([&](sycl::handler& cgh)
1891 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1892 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1893 auto access_values = m_ext_values->template get_access<sycl::access::mode::read>(cgh);
1895 auto access_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
1897 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1898 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
1900 cgh.parallel_for<
class compute_ext_mult>(
1901 sycl::range<1>{total_threads},
1902 [=] (sycl::item<1> item_id)
1904 auto id = item_id.get_id(0);
1906 for (
auto i =
id; i < interface_nrow; i += item_id.get_range()[0])
1908 auto block_id = i/ellpack_size ;
1909 auto local_id = i%ellpack_size ;
1911 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
1912 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
1914 ValueType value = 0. ;
1915 for(
int j=0;j<block_row_size;++j)
1917 auto k = block_row_offset+j*ellpack_size+local_id ;
1918 value += access_values[k]* access_x[access_cols[k]] ;
1920 access_y[access_row_ids[i]] += value ;
1927 template <
typename ValueT,
int EllPackSize>
1928 void MatrixInternal<ValueT, EllPackSize>::addExtMult(ValueBufferType& x,
1929 ValueBufferType& y)
const
1931 this->addExtMult(x, y, SYCLEnv::instance()->internal()->queue());
1934 template <
typename ValueT,
int EllPackSize>
1935 void MatrixInternal<ValueT, EllPackSize>::addLMult(ValueType alpha,
1938 sycl::queue& queue)
const
1943 auto device = queue.get_device();
1945 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
1947 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
1949 std::size_t pack_size = ellpack_size;
1950 auto nrows = m_profile->getNRows();
1951 auto nnz = m_profile->getNnz();
1953 auto internal_profile = m_profile->internal();
1954 auto& kcol = internal_profile->getKCol();
1955 auto& block_row_offset = internal_profile->getBlockRowOffset();
1956 auto& block_cols = internal_profile->getBlockCols();
1958 auto& mask = internal_profile->getLowerMask();
1962 [&](sycl::handler& cgh)
1964 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
1965 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
1966 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
1967 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
1970 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
1971 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
1973 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
1974 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
1975 auto total_threads = blocks_target * ellpack_size;
1977 sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
1978 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
1980 cgh.parallel_for<
class compute_lmult>(r,
1981 [=](sycl::nd_item<1> item_id)
1984 auto local_id = item_id.get_local_id(0);
1986 auto global_id = item_id.get_global_id(0);
1989 for (
auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
1991 auto block_id = i/pack_size ;
1994 int begin = access_block_row_offset[block_id] ;
1995 int end = access_block_row_offset[block_id+1] ;
1997 ValueType value = access_y[i] ;
1998 for(
int k=begin;k<end;++k)
2001 const int col = access_cols[k * pack_size + local_id];
2003 lds_x[local_id] = access_x[col];
2004 item_id.barrier(sycl::access::fence_space::local_space);
2005 if(access_mask[k * pack_size + local_id])
2006 value += alpha * access_values[k * pack_size + local_id] * lds_x[local_id] ;
2007 item_id.barrier(sycl::access::fence_space::local_space);
2009 access_y[i] = value ;
2019 addLMultN<2>(alpha,x,y,queue);
2022 addLMultN<3>(alpha,x,y,queue);
2025 addLMultN<4>(alpha,x,y,queue);
2029 auto device = queue.get_device();
2031 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2033 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2034 auto total_threads = num_groups * ellpack_size;
2037 int NxN = this->m_NxN;
2038 std::size_t pack_size = ellpack_size;
2039 auto nrows = m_profile->getNRows();
2040 auto nnz = m_profile->getNnz();
2042 auto internal_profile = m_profile->internal();
2043 auto& kcol = internal_profile->getKCol();
2044 auto& block_row_offset = internal_profile->getBlockRowOffset();
2045 auto& block_cols = internal_profile->getBlockCols();
2047 auto& mask = internal_profile->getLowerMask();
2051 [&](sycl::handler& cgh)
2053 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2054 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2055 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
2056 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2058 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
2059 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2060 sycl::range<1> range{total_threads} ;
2061 auto tile = Tile(N) ;
2062 cgh.parallel_for<
class compute_block_lmult>(
2063 sycl::range<1>{total_threads},
2064 [=] (sycl::item<1> item_id)
2066 auto id = item_id.get_id(0);
2071 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
2073 auto block_id = i/ellpack_size ;
2074 auto local_id = i%ellpack_size ;
2076 int block_row_offset = access_block_row_offset[block_id] ;
2077 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2079 for(
int ieq=0;ieq<N;++ieq)
2081 ValueType value = 0. ;
2082 for(std::size_t k=block_row_offset;k<block_row_offset+block_row_size;++k)
2085 value += tile.mult(ieq,
2093 access_y[i*N+ieq] += alpha*value;
2103#ifdef PRINT_DEBUG_INFO
2105 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(m_values);
2106 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2107 sycl::host_accessor<uint8_t, 1, sycl::access::mode::read> mask_acc(mask) ;
2108 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> x_acc(x);
2109 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> y_acc(y);
2110 for(
int irow=0;irow<nrows;++irow)
2112 std::cout<<
"XK["<<irow<<
"]=\n";
2113 for(
int i=0;i<N;++i)
2114 std::cout<<x_acc[irow*N+i]<<
"\n";
2116 for(
int irow=0;irow<nrows;++irow)
2118 auto ib=irow/ellpack_size ;
2119 auto il=irow%ellpack_size ;
2120 std::cout<<
"LSOLVE MAT["<<irow<<
"]:"<<std::endl;
2121 for(
int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2123 if(mask_acc[k*ellpack_size+il])
2125 for(
int i=0;i<N;++i)
2127 for(
int j=0;j<N;++j)
2128 std::cout<<h_acc[(k*NxN+i*N+j)*ellpack_size+il]<<
" ";
2129 std::cout<<std::endl;
2133 std::cout<<
"Y ="<<std::endl;
2134 for(
int i=0;i<N;++i)
2135 std::cout<<y_acc[irow*N+i]<<
"\n";
2141 template <
typename ValueT,
int EllPackSize>
2142 void MatrixInternal<ValueT, EllPackSize>::addLMult(ValueType alpha, ValueBufferType& x, ValueBufferType& y)
const
2144 this->addLMult(alpha, x, y, SYCLEnv::instance()->internal()->queue());
2147 template <
typename ValueT,
int EllPackSize>
2148 void MatrixInternal<ValueT, EllPackSize>::addUMult(ValueType alpha,
2151 sycl::queue& queue)
const
2155 auto device = queue.get_device();
2157 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2158 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2162 int NxN = this->m_NxN;
2163 std::size_t pack_size = ellpack_size;
2164 auto nrows = m_profile->getNRows() ;
2165 auto nnz = m_profile->getNnz() ;
2167 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
2168 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
2169 auto total_threads = blocks_target * ellpack_size;
2171 auto internal_profile = m_profile->internal() ;
2172 auto& kcol = internal_profile->getKCol() ;
2173 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2174 auto& block_cols = internal_profile->getBlockCols() ;
2175 auto& mask = internal_profile->getUpperMask() ;
2179 [&](sycl::handler& cgh)
2181 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2182 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2183 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
2184 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2187 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
2188 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2190 sycl::local_accessor<ValueType, 1> lds_x{pack_size, cgh};
2191 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
2193 cgh.parallel_for<
class compute_umult>(r,
2194 [=](sycl::nd_item<1> item_id)
2197 auto local_id = item_id.get_local_id(0);
2199 auto global_id = item_id.get_global_id(0);
2201 for (
auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
2203 auto block_id = i/pack_size ;
2206 auto begin = access_block_row_offset[block_id] ;
2207 auto end = access_block_row_offset[block_id+1] ;
2210 ValueType value = access_y[i] ;
2211 for(
int k=begin;k<end;++k)
2215 const int col = access_cols[k * pack_size + local_id];
2217 lds_x[local_id] = access_x[col];
2218 item_id.barrier(sycl::access::fence_space::local_space);
2219 if(access_mask[k * pack_size + local_id])
2220 value += alpha * access_values[k * pack_size + local_id] * lds_x[local_id] ;
2221 item_id.barrier(sycl::access::fence_space::local_space);
2223 access_y[i] = value ;
2233 addUMultN<2>(alpha,x,y,queue);
2236 addUMultN<3>(alpha,x,y,queue);
2239 addUMultN<4>(alpha,x,y,queue);
2243 auto device = queue.get_device();
2245 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2246 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2247 auto total_threads = num_groups * ellpack_size;
2251 int NxN = this->m_NxN;
2252 std::size_t pack_size = ellpack_size;
2253 auto nrows = m_profile->getNRows() ;
2254 auto nnz = m_profile->getNnz() ;
2256 auto internal_profile = m_profile->internal() ;
2257 auto& kcol = internal_profile->getKCol() ;
2258 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2259 auto& block_cols = internal_profile->getBlockCols() ;
2260 auto& mask = internal_profile->getUpperMask() ;
2264 [&](sycl::handler& cgh)
2266 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2267 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2268 auto access_mask = mask.template get_access<sycl::access::mode::read>(cgh);
2269 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2272 auto access_x = x.template get_access<sycl::access::mode::read>(cgh);
2273 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2275 auto tile = Tile(N) ;
2276 cgh.parallel_for<
class compute_block_umult>(
2277 sycl::range<1>{total_threads},
2278 [=] (sycl::item<1> item_id)
2280 auto id = item_id.get_id(0);
2285 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
2287 auto block_id = i/ellpack_size ;
2288 auto local_id = i%ellpack_size ;
2290 int block_row_offset = access_block_row_offset[block_id] ;
2291 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2292 for(
int ieq=0;ieq<N;++ieq)
2294 ValueType value = 0. ;
2295 for(std::size_t k=block_row_offset;k<block_row_offset+block_row_size;++k)
2298 value += tile.mult(ieq,
2306 access_y[i*N+ieq] += alpha*value;
2317#ifdef PRINT_DEBUG_INFO
2319 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(m_values);
2320 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2321 sycl::host_accessor<uint8_t, 1, sycl::access::mode::read> mask_acc(mask) ;
2322 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> x_acc(x);
2323 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> y_acc(y);
2324 for(
int irow=0;irow<nrows;++irow)
2326 std::cout<<
"XK["<<irow<<
"]=\n";
2327 for(
int i=0;i<N;++i)
2328 std::cout<<x_acc[irow*N+i]<<
"\n";
2330 for(
int irow=0;irow<nrows;++irow)
2332 auto ib=irow/ellpack_size ;
2333 auto il=irow%ellpack_size ;
2334 std::cout<<
"USOLVE MAT["<<irow<<
"]:"<<std::endl;
2335 for(
int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2337 if(mask_acc[k*ellpack_size+il])
2339 for(
int i=0;i<N;++i)
2341 for(
int j=0;j<N;++j)
2342 std::cout<<h_acc[(k*NxN+i*N+j)*ellpack_size+il]<<
" ";
2343 std::cout<<std::endl;
2347 std::cout<<
"Y ="<<std::endl;
2348 for(
int i=0;i<N;++i)
2349 std::cout<<y_acc[irow*N+i]<<
"\n";
2355 template <
typename ValueT,
int EllPackSize>
2356 void MatrixInternal<ValueT, EllPackSize>::addUMult(ValueType alpha, ValueBufferType& x, ValueBufferType& y)
const
2358 this->addUMult(alpha, x, y, SYCLEnv::instance()->internal()->queue());
2361 template <
typename ValueT,
int EllPackSize>
2362 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& x, ValueBufferType& y, sycl::queue& queue)
const
2366 template <
typename ValueT,
int EllPackSize>
2367 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& x, ValueBufferType& y)
const
2369 this->multDiag(x, y, SYCLEnv::instance()->internal()->queue());
2372 template <
typename ValueT,
int EllPackSize>
2373 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& y, sycl::queue& queue)
const
2375 throw NotImplementedException(
2376 A_FUNCINFO,
"MatrixInternal<ValueT, EllPackSize>::multDiag not implemented");
2379 template <
typename ValueT,
int EllPackSize>
2380 void MatrixInternal<ValueT, EllPackSize>::multDiag(ValueBufferType& y)
const
2382 this->multDiag(y, SYCLEnv::instance()->internal()->queue());
2385 template <
typename ValueT,
int EllPackSize>
2386 void MatrixInternal<ValueT, EllPackSize>::computeDiag(ValueBufferType& y, sycl::queue& queue)
const
2389 auto device = queue.get_device();
2391 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2393 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2395 auto total_threads = num_groups * ellpack_size;
2399 int NxN = this->m_NxN;
2400 int pack_size = ellpack_size ;
2401 auto nrows = m_profile->getNRows() ;
2402 auto nnz = m_profile->getNnz() ;
2404 auto internal_profile = m_profile->internal() ;
2405 auto& kcol = internal_profile->getKCol() ;
2406 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2407 auto& block_cols = internal_profile->getBlockCols() ;
2410 queue.submit([&](sycl::handler& cgh)
2412 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2413 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2414 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2415 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2418 cgh.parallel_for<
class compute_diag>(
2419 sycl::range<1>{total_threads},
2420 [=] (sycl::item<1> item_id)
2422 auto id = item_id.get_id(0);
2427 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
2429 auto block_id = i/pack_size ;
2430 auto local_id = i%pack_size ;
2432 auto begin = access_block_row_offset[block_id] ;
2433 auto end = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2434 for(
int k=begin;k<end;++k)
2436 auto kcol = k*pack_size+local_id ;
2437 if((access_cols[kcol])==
int(i))
2438 access_y[i] = access_values[kcol] ;
2445 cgh.parallel_for<
class compute_diagN>(
2446 sycl::range<1>{total_threads},
2447 [=] (sycl::item<1> item_id)
2449 auto id = item_id.get_id(0);
2454 for (std::size_t i =
id; i < nrows; i += item_id.get_range()[0])
2456 auto block_id = i/pack_size ;
2457 auto local_id = i%pack_size ;
2459 int block_row_offset = access_block_row_offset[block_id] ;
2460 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2461 for(
int k=0;k<block_row_size;++k)
2463 auto kcol = (block_row_offset+k)*pack_size+local_id ;
2464 std::size_t col = access_cols[kcol] ;
2467 for(
int ieq=0;ieq<N;++ieq)
2469 auto kval = (block_row_offset+k)*NxN + ieq*N + ieq ;
2470 access_y[i*N+ieq] = access_values[kval*pack_size+local_id] ;
2509 template <
typename ValueT,
int EllPackSize>
2510 void MatrixInternal<ValueT, EllPackSize>::computeBlockDiag(ValueBufferType& y, sycl::queue& queue)
const
2513 auto device = queue.get_device();
2515 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2517 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2519 auto total_threads = num_groups * ellpack_size;
2523 int NxN = this->m_NxN;
2524 int pack_size = ellpack_size ;
2525 auto nrows = m_profile->getNRows() ;
2526 auto nnz = m_profile->getNnz() ;
2528 assert(y.size()>=NxN*nrows) ;
2530 auto internal_profile = m_profile->internal() ;
2531 auto& kcol = internal_profile->getKCol() ;
2532 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2533 auto& block_cols = internal_profile->getBlockCols() ;
2535 ValueBufferType buffer (sycl::range<1>(nrows*NxN)) ;
2537 queue.submit([&](sycl::handler& cgh)
2539 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2540 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2541 auto matrix = m_values.template get_access<sycl::access::mode::read>(cgh);
2542 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2543 auto A = buffer.template get_access<sycl::access::mode::read_write>(cgh);
2545 auto tile = Tile(N) ;
2546 cgh.parallel_for<
class compute_block_diag>(
2547 sycl::range<1>{total_threads},
2548 [=] (sycl::item<1> item_id)
2550 auto id = item_id.get_id(0);
2555 for (std::size_t global_id =
id; global_id < nrows; global_id += item_id.get_range()[0])
2557 auto block_id = global_id/pack_size ;
2558 auto local_id = global_id%pack_size ;
2560 int block_row_offset = access_block_row_offset[block_id] ;
2561 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
2562 for(
int k=0;k<block_row_size;++k)
2564 auto kcol = (block_row_offset+k)*ellpack_size+local_id ;
2565 std::size_t col = access_cols[kcol] ;
2569 for(
int i=0;i<N;++i)
2570 for(
int j=0;j<N;++j)
2571 A[tile._ij(local_id,i,j)] = matrix[tile._ijk(k,i,j)+local_id] ;
2581 template <
typename ValueT,
int EllPackSize>
2582 void MatrixInternal<ValueT, EllPackSize>::computeDiag(ValueBufferType& y)
const
2584 this->computeDiag(y, SYCLEnv::instance()->internal()->queue());
2587 template <
typename ValueT,
int EllPackSize>
2588 void MatrixInternal<ValueT, EllPackSize>::computeBlockDiag(ValueBufferType& y)
const
2590 this->computeBlockDiag(y, SYCLEnv::instance()->internal()->queue());
2594 template <
typename ValueT,
int EllPackSize>
2595 void MatrixInternal<ValueT, EllPackSize>::multInvDiag(ValueBufferType& y, sycl::queue& queue)
const
2597 throw NotImplementedException(
2598 A_FUNCINFO,
"MatrixInternal<ValueT, EllPackSize>::multInvDiag not implemented");
2601 template <
typename ValueT,
int EllPackSize>
2602 void MatrixInternal<ValueT, EllPackSize>::multInvDiag(ValueBufferType& y)
const
2604 this->multInvDiag(y, SYCLEnv::instance()->internal()->queue());
2607 template <
typename ValueT,
int EllPackSize>
2608 void MatrixInternal<ValueT, EllPackSize>::computeInvDiag(ValueBufferType& y, sycl::queue& queue)
const
2611 auto device = queue.get_device();
2613 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2615 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2619 int NxN = this->m_NxN;
2620 std::size_t pack_size = ellpack_size ;
2621 auto nrows = m_profile->getNRows() ;
2622 auto nnz = m_profile->getNnz() ;
2624 auto internal_profile = m_profile->internal() ;
2625 auto& kcol = internal_profile->getKCol() ;
2626 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2627 auto& block_cols = internal_profile->getBlockCols() ;
2629 auto blocks_needed = (nrows + ellpack_size - 1) / ellpack_size;
2630 auto blocks_target = std::max(blocks_needed, num_groups * 4UL);
2631 auto total_threads = blocks_target * ellpack_size;
2634 queue.submit([&](sycl::handler& cgh)
2636 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2637 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2638 auto access_values = m_values.template get_access<sycl::access::mode::read>(cgh);
2639 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2643 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
2645 cgh.parallel_for<
class compute_inv_diag>(r,
2646 [=](sycl::nd_item<1> item_id)
2648 auto local_id = item_id.get_local_id(0);
2649 auto global_id = item_id.get_global_id(0);
2650 for (
auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
2652 auto block_id = i/pack_size ;
2655 int begin = access_block_row_offset[block_id];
2656 int end = access_block_row_offset[block_id+1];
2657 for(
int k=begin;k<end;++k)
2659 auto kcol = k*pack_size+local_id ;
2660 if((access_cols[kcol])==
int(i) && (access_values[kcol]!=0) )
2661 access_y[i] = 1./access_values[kcol] ;
2685 sycl::nd_range<1> r{sycl::range<1>{total_threads},sycl::range<1>{pack_size}};
2686 cgh.parallel_for<
class compute_inv_diagN>(r,
2687 [=](sycl::nd_item<1> item_id)
2689 auto local_id = item_id.get_local_id(0);
2690 auto global_id = item_id.get_global_id(0);
2691 for (
auto i = global_id; i < nrows; i += item_id.get_global_range()[0])
2693 auto block_id = i/pack_size ;
2695 int begin = access_block_row_offset[block_id];
2696 int end = access_block_row_offset[block_id+1];
2698 for(
int k=begin;k<end;++k)
2700 auto kcol = k*pack_size+local_id ;
2701 std::size_t col = access_cols[kcol] ;
2704 for(
int ieq=0;ieq<N;++ieq)
2706 auto kval = k*NxN + ieq*N + ieq ;
2707 auto diag = access_values[kval*pack_size+local_id] ;
2709 access_y[i*N+ieq] = 1./diag;
2711 access_y[i*N+ieq] = 1.;
2766 template <
typename ValueT,
int EllPackSize>
2767 void MatrixInternal<ValueT, EllPackSize>::computeInvBlockDiag(ValueBufferType& y, sycl::queue& queue)
const
2770 auto device = queue.get_device();
2772 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2774 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2776 auto total_threads = num_groups * ellpack_size;
2780 int NxN = this->m_NxN;
2781 int pack_size = ellpack_size ;
2782 auto nrows = m_profile->getNRows() ;
2783 auto nnz = m_profile->getNnz() ;
2785 assert(y.size()>=NxN*nrows) ;
2787 auto internal_profile = m_profile->internal() ;
2788 auto& kcol = internal_profile->getKCol() ;
2789 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
2790 auto& block_cols = internal_profile->getBlockCols() ;
2792 ValueBufferType buffer (sycl::range<1>(internal_profile->m_block_nrows*NxN*pack_size)) ;
2794 queue.submit([&](sycl::handler& cgh)
2796 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
2797 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
2798 auto matrix = m_values.template get_access<sycl::access::mode::read>(cgh);
2799 auto access_y = y.template get_access<sycl::access::mode::read_write>(cgh);
2800 auto A = buffer.template get_access<sycl::access::mode::read_write>(cgh);
2802 auto lu = LU<
decltype(matrix),
decltype(access_y),
decltype(A)>{N,matrix} ;
2805 cgh.parallel_for<
class compute_inv_block_diag>(
2806 sycl::range<1>{total_threads},
2807 [=] (sycl::item<1> item_id)
2809 auto id = item_id.get_id(0);
2814 for (std::size_t global_id =
id; global_id < nrows; global_id += item_id.get_range()[0])
2816 auto block_id = global_id/pack_size ;
2817 auto local_id = global_id%pack_size ;
2819 int begin = access_block_row_offset[block_id];
2820 int end = access_block_row_offset[block_id+1];
2821 for(
int k=begin;k<end;++k)
2823 auto kcol = k*pack_size+local_id ;
2824 std::size_t col = access_cols[kcol] ;
2827 lu.factorize(global_id,
2833 lu.inverse(global_id,
2895#ifdef PRINT_DEBUG_INFO
2897 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> h_acc(y);
2898 sycl::host_accessor<ValueT, 1, sycl::access::mode::read> mat_acc(m_values);
2899 sycl::host_accessor<int, 1, sycl::access::mode::read> kcol_acc(block_row_offset);
2900 sycl::host_accessor<int, 1, sycl::access::mode::read> cols_acc(block_cols);
2901 std::vector<ValueT> test_y(nrows*NxN) ;
2902 std::vector<ValueT> test_A(internal_profile->m_block_nrows*NxN*pack_size) ;
2903 auto lu = LU<
decltype(mat_acc),
decltype(test_y.data()),
decltype(test_A.data())>{N,mat_acc} ;
2904 for(
int irow=0;irow<nrows;++irow)
2906 auto ib=irow/ellpack_size ;
2907 auto il=irow%ellpack_size ;
2909 cout() <<
"MAT["<<irow<<
","<<ib<<
"]:";
2911 for(
int k=kcol_acc[ib];k<kcol_acc[ib+1];++k)
2914 cout() <<
"\tCOLS["<<k<<
"] = "<<cols_acc[k*pack_size+il];
2923 if(cols_acc[k*pack_size+il]==irow)
2927 cout() <<
"\tCOMPUTE INVERSE["<<cols_acc[k*pack_size+il]<<
"]" ;
2928 for(
int i=0;i<N;++i)
2930 for(
int j=0;j<N;++j)
2931 cout()<<mat_acc[(k*NxN+i*N+j)*pack_size+il]<<
" ";
2947 for(
int i=0;i<N;++i)
2949 for(
int j=0;j<N;++j)
2950 std::cout<<test_y[irow*NxN+i*N+j]<<
",";
2951 std::cout<<std::endl;
2957 for(
int irow=0;irow<nrows;++irow)
2959 auto ib=irow/ellpack_size ;
2960 auto il=irow%ellpack_size ;
2962 cout()<<
"INV BLOCK DIAG["<<irow<<
","<<il<<
"]:\n";
2963 for(
int i=0;i<N;++i)
2965 for(
int j=0;j<N;++j)
2966 cout()<<h_acc[irow*NxN+i*N+j]<<
",";
2976 template <
typename ValueT,
int EllPackSize>
2977 void MatrixInternal<ValueT, EllPackSize>::computeInvDiag(ValueBufferType& y)
const
2979 this->computeInvDiag(y, SYCLEnv::instance()->internal()->queue());
2982 template <
typename ValueT,
int EllPackSize>
2983 void MatrixInternal<ValueT, EllPackSize>::computeInvBlockDiag(ValueBufferType& y)
const
2985 this->computeInvBlockDiag(y, SYCLEnv::instance()->internal()->queue());
2989 template <
typename ValueT,
int EllPackSize>
2990 void MatrixInternal<ValueT, EllPackSize>::scal(ValueBufferType& y, sycl::queue& queue)
2993 auto device = queue.get_device();
2995 auto num_groups = queue.get_device().get_info<sycl::info::device::max_compute_units>();
2997 auto max_work_group_size = queue.get_device().get_info<sycl::info::device::max_work_group_size>();
2999 auto total_threads = num_groups * ellpack_size;
3002 auto nrows = m_profile->getNRows() ;
3003 auto nnz = m_profile->getNnz() ;
3005 auto internal_profile = m_profile->internal() ;
3006 auto& kcol = internal_profile->getKCol() ;
3007 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
3008 auto& block_cols = internal_profile->getBlockCols() ;
3011 queue.submit([&](sycl::handler& cgh)
3013 auto access_block_row_offset = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3014 auto access_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
3015 auto access_values = m_values.template get_access<sycl::access::mode::read_write>(cgh);
3016 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
3019 cgh.parallel_for<
class scal_matrix>(sycl::range<1>{total_threads},
3020 [=] (sycl::item<1> item_id)
3022 auto id = item_id.get_id(0);
3027 for (
auto i =
id; i < nrows; i += item_id.get_range()[0])
3029 auto block_id = i/ellpack_size ;
3030 auto local_id = i%ellpack_size ;
3032 int block_row_offset = access_block_row_offset[block_id]*ellpack_size ;
3033 auto block_row_size = access_block_row_offset[block_id+1]-access_block_row_offset[block_id] ;
3034 for(
int j=0;j<block_row_size;++j)
3036 auto k = block_row_offset+j*ellpack_size+local_id ;
3037 access_values[k] *= access_y[i] ;
3047 auto interface_nrows = m_ext_profile->getNRows();
3048 auto ext_nnz = m_ext_profile->getNnz();
3049 auto ext_block_nnz = m_ext_profile->getBlockNnz();
3051 auto ext_internal_profile = m_ext_profile->internal();
3052 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
3055 queue.submit([&](sycl::handler& cgh)
3057 auto access_block_row_offset = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3058 auto access_block_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
3059 auto access_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
3060 auto access_y = y.template get_access<sycl::access::mode::read>(cgh);
3062 cgh.parallel_for<
class set_matrix_values7>(sycl::range<1>{total_threads},
3063 [=] (sycl::item<1> item_id)
3065 auto id = item_id.get_id(0);
3067 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
3069 auto block_id = i/ellpack_size ;
3070 auto local_id = i%ellpack_size ;
3071 auto begin = access_block_row_offset[block_id] ;
3072 auto end = access_block_row_offset[block_id+1] ;
3074 for(
int k=begin;k<end;++k)
3076 access_block_values[k*ellpack_size+local_id] *= access_y[access_row_ids[i]] ;
3085 template <
typename ValueT,
int EllPackSize>
3086 void MatrixInternal<ValueT, EllPackSize>::scal(ValueBufferType& y)
3088 this->scal(y, SYCLEnv::instance()->internal()->queue());
3091 template <
typename ValueT,
int EllPackSize>
3092 void MatrixInternal<ValueT, EllPackSize>::
3093 copyDevicePointers(
int local_offset,
3099 ValueT* values)
const
3101#ifdef PRINT_DEBUG_INFO
3103 cout()<<
"SYCLMATRIX COPY DEVICE POINTERS : "<<nrows<<
" "<<nnz;
3107 auto env = SYCLEnv::instance() ;
3108 auto max_num_threads = env->maxNumThreads() ;
3109 auto& queue = env->internal()->queue() ;
3110 auto pack_size = ellpack_size;
3112 auto internal_profile = m_profile->internal() ;
3113 auto& kcol = internal_profile->getKCol() ;
3114 auto& block_row_offset = internal_profile->getBlockRowOffset() ;
3115 auto& block_cols = internal_profile->getBlockCols() ;
3117 auto local_row_size = m_profile->localRowSize();
3143 if(local_row_size ==
nullptr)
3146 queue.submit( [&](sycl::handler& cgh)
3148 auto acc_val = m_values.template get_access<sycl::access::mode::read>(cgh);
3149 auto acc_bkcol = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3150 auto acc_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
3151 auto acc_kcol = kcol.template get_access<sycl::access::mode::read>(cgh);
3152 cgh.parallel_for<
class copy_device_ptr>(
3153 sycl::range<1>{max_num_threads},
3154 [=] (sycl::item<1> itemId)
3156 auto id = itemId.get_id(0);
3157 for (
auto i =
id; i < nrows; i += itemId.get_range()[0])
3159 auto block_id = i/pack_size ;
3160 auto local_id = i%pack_size ;
3162 int begin = acc_bkcol[block_id] ;
3163 int end = acc_bkcol[block_id+1] ;
3164 int row_size = acc_kcol[i+1] - acc_kcol[i];
3166 rows[i] = local_offset + i ;
3167 ncols[i] = row_size ;
3169 auto offset = acc_kcol[i];
3171 for(
int k=begin;k<end;++k)
3176 values[offset+j] = acc_val[k*pack_size+local_id] ;
3177 cols[offset+j] = acc_cols[k*pack_size+local_id] ;
3183#ifdef PRINT_DEBUG_INFO
3185 std::vector<ValueT> h_data(nnz) ;
3186 queue.memcpy(h_data.data(), values, nnz *
sizeof(ValueT)).wait();
3187 std::vector<int> h_cols(nnz) ;
3188 queue.memcpy(h_cols.data(), cols, nnz *
sizeof(
int)).wait();
3189 sycl::host_accessor<int, 1, sycl::access::mode::read> h_kcol(kcol);
3191 cout()<<
"SYCL DEVICE COPY : "<<nrows<<
" "<<nnz;
3192 for(
int irow=0;irow<nrows;++irow)
3194 cout()<<
"COPY MAT["<<irow<<
"]:"<<h_kcol[irow+1]-h_kcol[irow];
3195 for(
int k=h_kcol[irow];k<h_kcol[irow+1];++k)
3197 std::cout<<h_cols[k]<<
" "<<h_data[k]<<
" ";
3199 std::cout<<std::endl ;
3208 IndexBufferType lrowsize(local_row_size, sycl::range<1>(nrows));
3209 queue.submit( [&](sycl::handler& cgh)
3211 auto acc_val = m_values.template get_access<sycl::access::mode::read>(cgh);
3212 auto acc_bkcol = block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3213 auto acc_cols = block_cols.template get_access<sycl::access::mode::read>(cgh);
3214 auto acc_kcol = kcol.template get_access<sycl::access::mode::read>(cgh);
3215 auto acc_lrowsize = lrowsize.template get_access<sycl::access::mode::read>(cgh);
3216 cgh.parallel_for<
class copy_device_ptr>(
3217 sycl::range<1>{max_num_threads},
3218 [=] (sycl::item<1> itemId)
3220 auto id = itemId.get_id(0);
3221 for (
auto i =
id; i < nrows; i += itemId.get_range()[0])
3223 auto block_id = i/pack_size ;
3224 auto local_id = i%pack_size ;
3226 int begin = acc_bkcol[block_id] ;
3227 int end = acc_bkcol[block_id+1] ;
3228 int row_size = acc_kcol[i+1] - acc_kcol[i];
3229 int lrow_size = acc_lrowsize[i] ;
3230 rows[i] = local_offset + i ;
3231 ncols[i] = row_size ;
3233 auto offset = acc_kcol[i];
3235 for(
int k=begin;k<end;++k)
3240 values[offset+j] = acc_val[k*pack_size+local_id];
3241 cols[offset+j] = local_offset + acc_cols[k*pack_size+local_id];
3248 assert(m_ext_profile) ;
3250 auto interface_nrows = m_ext_profile->getNRows();
3251 auto ext_nnz = m_ext_profile->getNnz();
3252 auto ext_block_nnz = m_ext_profile->getBlockNnz();
3254 auto ext_internal_profile = m_ext_profile->internal();
3255 auto& ext_block_row_offset = ext_internal_profile->getBlockRowOffset();
3256 auto& ext_block_cols = ext_internal_profile->getBlockCols() ;
3258 queue.submit([&](sycl::handler& cgh)
3260 auto acc_kcol = kcol.template get_access<sycl::access::mode::read>(cgh);
3261 auto acc_ext_bkcol = ext_block_row_offset.template get_access<sycl::access::mode::read>(cgh);
3262 auto acc_ext_cols = ext_block_cols.template get_access<sycl::access::mode::read>(cgh);
3263 auto acc_ext_values = m_ext_values->template get_access<sycl::access::mode::read_write>(cgh);
3264 auto acc_row_ids = m_interface_row_ids->template get_access<sycl::access::mode::read>(cgh);
3265 auto acc_lrowsize = lrowsize.template get_access<sycl::access::mode::read>(cgh);
3266 auto acc_recv_uids = m_recv_uids->template get_access<sycl::access::mode::read>(cgh);
3268 cgh.parallel_for<
class ext_copy_device_ptr>(
3269 sycl::range<1>{max_num_threads},
3270 [=] (sycl::item<1> item_id)
3272 auto id = item_id.get_id(0);
3274 for (
auto i =
id; i < interface_nrows; i += item_id.get_range()[0])
3276 auto block_id = i/pack_size ;
3277 auto local_id = i%pack_size ;
3278 auto begin = acc_ext_bkcol[block_id] ;
3279 auto end = acc_ext_bkcol[block_id+1] ;
3281 auto row_id = acc_row_ids[i];
3282 int row_size = acc_kcol[row_id+1] - acc_kcol[row_id];
3283 int lrow_size = acc_lrowsize[row_id] ;
3284 auto offset = acc_kcol[row_id] + lrow_size;
3286 for(
int k=begin;k<end;++k)
3289 if(j<row_size-lrow_size)
3291 values[offset+j] = acc_ext_values[k*pack_size+local_id] ;
3292 cols[offset+j] = acc_recv_uids[acc_ext_cols[k*pack_size+local_id]] ;
3298#ifdef PRINT_DEBUG_INFO
3300 std::vector<ValueT> h_data(nnz) ;
3301 queue.memcpy(h_data.data(), values, nnz *
sizeof(ValueT)).wait();
3302 std::vector<int> h_cols(nnz) ;
3303 queue.memcpy(h_cols.data(), cols, nnz *
sizeof(
int)).wait();
3304 sycl::host_accessor<int, 1, sycl::access::mode::read> h_kcol(kcol);
3306 cout()<<
"SYCL DEVICE COPY : "<<nrows<<
" "<<nnz<<
" "<<ext_nnz<<
" "<<interface_nrows;
3307 for(
int irow=0;irow<nrows;++irow)
3309 std::cout<<
"COPY MAT["<<irow<<
"]:"<<local_row_size[irow]<<
" "<<h_kcol[irow+1]-h_kcol[irow];
3310 for(
int k=h_kcol[irow];k<h_kcol[irow+1];++k)
3312 std::cout<<h_cols[k]<<
" "<<h_data[k]<<
" ";
3314 std::cout<<std::endl ;
3326template <
typename ValueT>
3329, m_send_policy(SimpleCSRInternal::CommProperty::ASynch)
3330, m_recv_policy(SimpleCSRInternal::CommProperty::ASynch)
3333template <
typename ValueT>
3336, m_send_policy(SimpleCSRInternal::CommProperty::ASynch)
3337, m_recv_policy(SimpleCSRInternal::CommProperty::ASynch)
3340template <
typename ValueT>
3343#ifdef ALIEN_USE_PERF_TIMER
3344 m_timer.printInfo(
"SYCLBELLPACK-MATRIX");
3348template <
typename ValueT>
3350initMatrix(Arccore::MessagePassing::IMessagePassingMng* parallel_mng,
3351 Integer local_offset,
3352 Integer global_size,
3359#ifdef PRINT_DEBUG_INFO
3361 cout()<<
"SYCLBellPackMatrix::initMatrix : "<<local_offset<<
" "<<nrows<<
" "<<global_size;
3362 cout()<<
"ParalleMng : "<<parallel_mng;
3368 m_parallel_mng = parallel_mng;
3370 m_nproc = m_parallel_mng->commSize();
3371 m_myrank = m_parallel_mng->commRank();
3373 m_is_parallel = (m_nproc > 1);
3376 m_matrix_dist_info.copy(matrix_dist_info);
3378 m_ellpack_size = PKSIZE;
3381 m_local_size = nrows;
3382 m_local_offset = local_offset;
3383 m_global_size = global_size;
3385 m_ghost_size = m_matrix_dist_info.m_ghost_nrow;
3387 auto& local_row_size = m_matrix_dist_info.m_local_row_size;
3388 auto sorted_cols = m_matrix_dist_info.m_cols.data();
3390 std::size_t block_nrows = ProfileInternalType::nbBlocks(nrows);
3392 ProfileInternalType::computeBlockRowOffset(m_block_row_offset, nrows, kcol);
3396 cout() <<
"OFFSET = "<<m_local_offset ;
3397 cout() <<
"NROWS = "<<nrows;
3398 cout() <<
"NNZ = "<<kcol[nrows];
3399 cout() <<
"BNROWS = "<<block_nrows;
3400 cout() <<
"BNNZ = "<<m_block_row_offset[block_nrows];
3404 m_profilePKSIZE.reset(
new ProfileInternalType{ nrows,
3407 m_block_row_offset.data(),
3408 local_row_size.data() });
3410 m_matrixPKSIZE.reset(
new MatrixInternalType{ m_profilePKSIZE.get(), block_size });
3413 std::size_t interface_nrows = m_matrix_dist_info.m_interface_nrow;
3414 std::vector<int> ext_kcol(interface_nrows + 1);
3417 for (std::size_t i = 0; i < interface_nrows; ++i) {
3418 auto id = m_matrix_dist_info.m_interface_rows[i];
3419 ext_kcol[i] = ext_nnz;
3420 ext_nnz += kcol[
id + 1] - kcol[id] - local_row_size[id];
3422 ext_kcol[interface_nrows] = ext_nnz;
3424 std::vector<int> ext_cols(ext_nnz);
3427 for (std::size_t i = 0; i < interface_nrows; ++i) {
3428 auto id = m_matrix_dist_info.m_interface_rows[i];
3429 for (
int k = kcol[
id] + local_row_size[
id]; k < kcol[
id + 1]; ++k) {
3430 ext_cols[jcol++] = (int)(sorted_cols[k] - nrows);
3435 std::size_t ext_block_nrows = ProfileInternalType::nbBlocks(interface_nrows);
3436 ProfileInternalType::computeBlockRowOffset(m_ext_block_row_offset, interface_nrows, ext_kcol.data());
3439 cout() <<
"EXT NROWS = "<<interface_nrows;
3440 cout() <<
"EXT NNZ = "<<ext_kcol[interface_nrows];
3441 cout() <<
"EXT BNROWS = "<<ext_block_nrows;
3442 cout() <<
"EXT BNNZ = "<<m_block_row_offset[ext_block_nrows];
3446 m_ext_profilePKSIZE.reset(
new ProfileInternalType{ interface_nrows,
3449 m_ext_block_row_offset.data(),
3452 m_matrixPKSIZE->m_ext_profile = m_ext_profilePKSIZE.get();
3453 typedef typename MatrixInternalType::IndexBufferType IndexBufferType;
3454 m_matrixPKSIZE->m_h_interface_row_ids = m_matrix_dist_info.m_interface_rows.data();
3455 m_matrixPKSIZE->m_interface_row_ids.reset(
new IndexBufferType{ m_matrix_dist_info.m_interface_rows.data(),
3456 m_matrix_dist_info.m_interface_rows.size() });
3458 m_matrixPKSIZE->m_send_ids.reset(
new IndexBufferType{ m_matrix_dist_info.m_send_info.m_ids.data(),
3459 m_matrix_dist_info.m_send_info.m_ids.size() });
3460 m_matrixPKSIZE->m_recv_ids.reset(
new IndexBufferType{ m_matrix_dist_info.m_recv_info.m_ids.data(),
3461 m_matrix_dist_info.m_recv_info.m_ids.size() });
3462 m_matrixPKSIZE->m_recv_uids.reset(
new IndexBufferType{ m_matrix_dist_info.m_recv_info.m_uids.data(),
3463 m_matrix_dist_info.m_recv_info.m_uids.size() });
3469 m_local_size = nrows;
3470 m_global_size = m_local_size;
3474 std::size_t block_nrows = ProfileInternalType::nbBlocks(nrows);
3476 ProfileInternalType::computeBlockRowOffset(m_block_row_offset, nrows, kcol);
3480 cout() <<
"NROWS = "<<nrows;
3481 cout() <<
"NNZ = "<<kcol[nrows];
3482 cout() <<
"BNROWS = "<<block_nrows;
3483 cout() <<
"BNNZ = "<<m_block_row_offset[block_nrows];
3487 m_profilePKSIZE.reset(
new ProfileInternalType{ nrows,
3490 m_block_row_offset.data(),
3493 m_matrixPKSIZE.reset(
new MatrixInternalType{ m_profilePKSIZE.get(), block_size });
3499template <
typename ValueT>
3501SYCLBEllPackMatrix<ValueT>::cloneTo(
const MultiMatrixImpl* multi)
const
3503 auto block_size = blockSize() ;
3505 matrix->setBlockSize(block_size);
3506 matrix->initMatrix(m_parallel_mng,
3509 m_profilePKSIZE->getNRows(),
3510 m_profilePKSIZE->kcol(),
3511 m_profilePKSIZE->cols(),
3514 matrix->setMatrixValues(getAddressData(),
true);
3518template <
typename ValueT>
3519bool SYCLBEllPackMatrix<ValueT>::setMatrixValues(Arccore::Real
const* values,
bool only_host)
3521 return m_matrixPKSIZE->setMatrixValues(values, only_host);
3524template <
typename ValueT>
3525void SYCLBEllPackMatrix<ValueT>::notifyChanges()
3527 if (m_matrixPKSIZE.get())
3528 m_matrixPKSIZE->notifyChanges();
3531template <
typename ValueT>
3532void SYCLBEllPackMatrix<ValueT>::endUpdate()
3534 if (m_matrixPKSIZE.get() && m_matrixPKSIZE->needUpdate()) {
3535 m_matrixPKSIZE->endUpdate();
3536 this->updateTimestamp();
3540template <
typename ValueT>
3541ValueT
const* SYCLBEllPackMatrix<ValueT>::getAddressData()
const
3543 return m_matrixPKSIZE->getHCsrData();
3546template <
typename ValueT>
3547ValueT
const* SYCLBEllPackMatrix<ValueT>::data()
const
3549 return m_matrixPKSIZE->getHCsrData();
3552template <
typename ValueT>
3553ValueT* SYCLBEllPackMatrix<ValueT>::getAddressData()
3555 return m_matrixPKSIZE->getHCsrData();
3558template <
typename ValueT>
3559ValueT* SYCLBEllPackMatrix<ValueT>::data()
3561 return m_matrixPKSIZE->getHCsrData();
3564template <
typename ValueT>
3567 return m_matrixPKSIZE->mult(x.internal()->values(), y.internal()->values());
3570template <
typename ValueT>
3573 m_matrixPKSIZE->addExtMult(x.internal()->ghostValues(getGhostSize()), y.internal()->values());
3576template <
typename ValueT>
3579 m_profilePKSIZE->dcol();
3580 return m_matrixPKSIZE->addLMult(alpha, x.internal()->values(), y.internal()->values());
3583template <
typename ValueT>
3586 m_profilePKSIZE->dcol();
3587 return m_matrixPKSIZE->addUMult(alpha, x.internal()->values(), y.internal()->values());
3590template <
typename ValueT>
3593 return m_matrixPKSIZE->multDiag(x.internal()->values(), y.internal()->values());
3596template <
typename ValueT>
3599 return m_matrixPKSIZE->multDiag(y.internal()->values());
3602template <
typename ValueT>
3605 auto block_size = blockSize() ;
3606 if((y.blockSize()==1)||(y.blockSize()==block_size))
3607 return m_matrixPKSIZE->computeDiag(y.internal()->values());
3608 else if(y.blockSize()==block_size*block_size)
3609 return m_matrixPKSIZE->computeBlockDiag(y.internal()->values());
3611 throw Arccore::FatalErrorException(A_FUNCINFO,
"Matrix and Vector Block Size are not compatibility");
3614template <
typename ValueT>
3617 return m_matrixPKSIZE->multInvDiag(y.internal()->values());
3620template <
typename ValueT>
3623 auto block_size = blockSize() ;
3624 if((y.blockSize()==1)||(y.blockSize()==block_size))
3625 return m_matrixPKSIZE->computeInvDiag(y.internal()->values());
3626 else if(y.blockSize()==block_size*block_size)
3627 return m_matrixPKSIZE->computeInvBlockDiag(y.internal()->values());
3629 throw Arccore::FatalErrorException(A_FUNCINFO,
"Matrix and Vector Block Size are not compatibility");
3632template <
typename ValueT>
3635 return m_matrixPKSIZE->scal(diag.internal()->values());
3639template <
typename ValueT>
3642 auto block_size = blockSize() ;
3643 initMatrix(matrix.m_parallel_mng,
3644 matrix.m_local_offset,
3645 matrix.m_global_size,
3646 matrix.m_profilePKSIZE->getNRows(),
3647 matrix.m_profilePKSIZE->kcol(),
3648 matrix.m_profilePKSIZE->cols(),
3649 matrix.m_matrix_dist_info,
3651 if(block_size==matrix.blockSize())
3654 m_matrixPKSIZE->setMatrixValues(matrix.m_matrixPKSIZE->m_values,
3655 (*matrix.m_matrixPKSIZE->m_ext_values));
3657 m_matrixPKSIZE->setMatrixValues(matrix.m_matrixPKSIZE->m_values);
3661 assert(m_profilePKSIZE.get());
3662 assert(m_matrixPKSIZE.get());
3663 auto nb_blocks = m_profilePKSIZE->getBlockNnz();
3665 m_matrixPKSIZE->copy(nb_blocks,
3667 matrix.m_matrixPKSIZE->m_values,
3668 (*matrix.m_matrixPKSIZE->m_ext_values),
3669 matrix.blockSize()) ;
3671 m_matrixPKSIZE->copy(nb_blocks,
3673 matrix.m_matrixPKSIZE->m_values,
3674 matrix.blockSize()) ;
3678template <
typename ValueT>
3685 ValueT** values)
const
3687 auto env = SYCLEnv::instance() ;
3688 auto max_num_treads = env->maxNumThreads() ;
3689 auto& queue = env->internal()->queue() ;
3690 auto rows_ptr = malloc_device<IndexType>(nrows, queue);
3691 auto ncols_ptr = malloc_device<IndexType>(nrows, queue);
3692 auto cols_ptr = malloc_device<IndexType>(nnz, queue);
3693 auto values_ptr = malloc_device<ValueT>(nnz, queue);
3696 *ncols = ncols_ptr ;
3698 *values = values_ptr ;
3701template <
typename ValueT>
3705 auto env = SYCLEnv::instance() ;
3706 auto& queue = env->internal()->queue() ;
3707 sycl::free(rows,queue) ;
3708 sycl::free(ncols,queue) ;
3709 sycl::free(cols,queue) ;
3710 sycl::free(values,queue) ;
3714template <
typename ValueT>
3721 ValueT* values)
const
3723 m_matrixPKSIZE->copyDevicePointers(m_local_offset,nrows,nnz,rows,ncols,cols,values) ;
3727template <
typename ValueT>
3728SYCLBEllPackMatrix<ValueT>::HCSRView
3729SYCLBEllPackMatrix<ValueT>::hcsrView(BackEnd::Memory::eType memory,
int nrows,
int nnz)
const
3731 return HCSRView(
this,memory, nrows, nnz) ;
3742void force_int_instance()
3745 SYCLBEllPackMatrix<double>::MatrixInternalType::ValueBufferType buffer{sycl::range<1>(10)} ;
3746 matrix.internal()->setMatrixValues(buffer) ;
IMatrixImpl(const MultiMatrixImpl *multi_impl, BackEndId backend="")
Constructor.
Multi matrices representation container.
virtual ~SYCLBEllPackMatrix()
-- tab-width: 2; indent-tabs-mode: nil; coding: utf-8-with-signature --